Initial commit: cuGenOpt GPU optimization solver

This commit is contained in:
L-yang-yang 2026-03-20 00:33:45 +08:00
commit fc5a0ff4af
117 changed files with 25545 additions and 0 deletions

View file

@ -0,0 +1,90 @@
/**
* cuda_utils.cuh - CUDA 工具集
*
* 职责:错误检查、设备信息、随机数工具
* 规则:所有 CUDA API 调用都必须用 CUDA_CHECK 包裹
*/
#pragma once
#include <cstdio>
#include <cstdlib>
#include <curand_kernel.h>
// ============================================================
// 错误检查
// ============================================================
// Wrap a CUDA runtime call; print file/line and abort on any failure.
// The local is named `err_` (not `err`): a plain `err` is in scope from its
// own declarator onward, so a caller expression that itself mentions `err`
// would silently bind to the macro's uninitialized local.
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)
// kernel launch 后检查(捕获异步错误)
// Call after a kernel launch: cudaGetLastError() surfaces launch-config
// errors (and any sticky error) that the <<<...>>> syntax cannot return.
// Uses `err_` for the same shadowing-safety reason as CUDA_CHECK.
#define CUDA_CHECK_LAST() do { \
    cudaError_t err_ = cudaGetLastError(); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA kernel error at %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)
// ============================================================
// 设备信息
// ============================================================
// Query the currently selected CUDA device and print its key properties
// (name, SM count, per-SM thread limit, shared/global memory, compute
// capability) to stdout. Aborts via CUDA_CHECK on API failure.
inline void print_device_info() {
int device;
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDevice(&device));
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
printf("GPU: %s\n", prop.name);
printf(" SM count: %d\n", prop.multiProcessorCount);
printf(" Max threads/SM: %d\n", prop.maxThreadsPerMultiProcessor);
printf(" Shared mem/blk: %zu KB\n", prop.sharedMemPerBlock / 1024);
printf(" Global mem: %.1f GB\n", prop.totalGlobalMem / 1e9);
printf(" Compute cap: %d.%d\n", prop.major, prop.minor);
}
// ============================================================
// 随机数工具 (Device 端)
// ============================================================
// Initialize one curand state per thread: global thread i seeds
// subsequence i of `seed`, giving independent streams per thread.
__global__ void init_curand_kernel(curandState* states, unsigned long long seed, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    curand_init(seed, idx, 0, &states[idx]);
}
// Device side: uniform random integer in [0, bound).
// NOTE(review): modulo reduction has a slight bias unless `bound` divides
// 2^32 — acceptable for heuristic search, not for statistics/crypto.
__device__ inline int rand_int(curandState* state, int bound) {
return curand(state) % bound;
}
// Device side: in-place Fisher-Yates shuffle of arr[0..n-1] using the
// caller's curand state (one rand_int draw per iteration, back to front).
__device__ inline void shuffle(int* arr, int n, curandState* state) {
    for (int last = n - 1; last > 0; last--) {
        int pick = rand_int(state, last + 1);
        int held = arr[last];
        arr[last] = arr[pick];
        arr[pick] = held;
    }
}
// ============================================================
// Kernel 启动参数计算
// ============================================================
// Integer ceiling division: smallest q with q * b >= a (positive a, b).
inline int div_ceil(int a, int b) {
    return (a + b - 1) / b;
}
// Number of blocks needed to cover n work items at block_size threads each.
inline int calc_grid_size(int n, int block_size = 256) {
    return div_ceil(n, block_size);
}

View file

@ -0,0 +1,141 @@
/**
* gpu_cache.cuh - GPU 全局内存哈希表(通用缓存组件)
*
* 设计:
* - 开放寻址,固定容量(power of 2),线性探测
* - key = uint64_t由 Problem 自行计算 hash
* - value = float单个指标值
* - 无锁:允许 race condition(缓存语义,偶尔脏读可接受)
* - 自带命中/未命中原子计数器
*
* 用法:
* GpuCache cache = GpuCache::allocate(65536); // host
* // ... pass cache as Problem member to kernels ...
* cache.print_stats(); // host
* cache.destroy(); // host
*
* 参考:scute 项目 LRUCache(key = metric_type + content_hash)
*/
#pragma once
#include "cuda_utils.cuh"
#include <cstdint>
// ============================================================
// 常量
// ============================================================
static constexpr uint64_t CACHE_EMPTY_KEY = 0xFFFFFFFFFFFFFFFFULL;
static constexpr int CACHE_MAX_PROBE = 8; // 最大线性探测步数
// ============================================================
// GpuCache 结构体POD可安全拷贝到 kernel
// ============================================================
// POD handle for a GPU-global-memory hash cache; safe to pass by value
// into kernels. Open addressing, linear probing, lock-free by design
// (races give cache semantics: an occasional dirty read is acceptable).
struct GpuCache {
    uint64_t* keys;       // GPU global memory, `capacity` entries
    float* values;        // GPU global memory, `capacity` entries
    unsigned int* d_hits;   // device-side atomic hit counter
    unsigned int* d_misses; // device-side atomic miss counter
    int capacity;         // always a power of two (enforced by allocate())
    int mask;             // = capacity - 1, enables `key & mask` indexing
    // ---- Host operations ----
    // Allocate device storage. `cap` is rounded UP to the next power of
    // two: `key & mask` probing is only correct for power-of-two capacity,
    // and the original trusted callers to guarantee that silently.
    static GpuCache allocate(int cap = 65536) {
        int pow2 = 1;
        while (pow2 < cap) pow2 <<= 1;  // next power of two >= cap (>= 1)
        GpuCache c;
        c.capacity = pow2;
        c.mask = pow2 - 1;
        CUDA_CHECK(cudaMalloc(&c.keys, sizeof(uint64_t) * c.capacity));
        CUDA_CHECK(cudaMalloc(&c.values, sizeof(float) * c.capacity));
        CUDA_CHECK(cudaMalloc(&c.d_hits, sizeof(unsigned int)));
        CUDA_CHECK(cudaMalloc(&c.d_misses, sizeof(unsigned int)));
        c.clear();
        return c;
    }
    // A null cache: device code must treat it as always-miss.
    static GpuCache disabled() {
        GpuCache c;
        c.keys = nullptr; c.values = nullptr;
        c.d_hits = nullptr; c.d_misses = nullptr;
        c.capacity = 0; c.mask = 0;
        return c;
    }
    bool is_enabled() const { return keys != nullptr; }
    // Reset every slot to the empty sentinel (0xFF bytes == CACHE_EMPTY_KEY)
    // and zero both counters. Now a no-op on a disabled() cache — previously
    // this would have handed null pointers to cudaMemset.
    void clear() {
        if (!keys) return;
        CUDA_CHECK(cudaMemset(keys, 0xFF, sizeof(uint64_t) * capacity));
        CUDA_CHECK(cudaMemset(d_hits, 0, sizeof(unsigned int)));
        CUDA_CHECK(cudaMemset(d_misses, 0, sizeof(unsigned int)));
    }
    // Release device memory; idempotent.
    void destroy() {
        if (keys) cudaFree(keys);
        if (values) cudaFree(values);
        if (d_hits) cudaFree(d_hits);
        if (d_misses) cudaFree(d_misses);
        keys = nullptr; values = nullptr;
        d_hits = nullptr; d_misses = nullptr;
    }
    // Copy counters to host and print hit-rate + footprint statistics.
    void print_stats() const {
        if (!keys) { printf(" Cache: disabled\n"); return; }
        unsigned int h = 0, m = 0;
        CUDA_CHECK(cudaMemcpy(&h, d_hits, sizeof(unsigned int), cudaMemcpyDeviceToHost));
        CUDA_CHECK(cudaMemcpy(&m, d_misses, sizeof(unsigned int), cudaMemcpyDeviceToHost));
        unsigned int total = h + m;
        float rate = total > 0 ? (float)h / total * 100.0f : 0.0f;
        printf(" Cache: %u lookups | %u hits + %u misses | hit rate = %.1f%%\n",
               total, h, m, rate);
        printf(" Cache: capacity = %d entries (%.1f KB)\n",
               capacity, capacity * (sizeof(uint64_t) + sizeof(float)) / 1024.0f);
    }
};
// ============================================================
// Device 函数:哈希 / 查找 / 插入
// ============================================================
/// FNV-1a hash over an ordered int sequence (e.g. the customer IDs of a
/// route). The result is remapped away from CACHE_EMPTY_KEY so a valid
/// hash can never collide with the empty-slot sentinel.
__device__ inline uint64_t route_hash(const int* data, int len) {
uint64_t h = 14695981039346656037ULL; // FNV offset basis
for (int i = 0; i < len; i++) {
h ^= (uint64_t)(unsigned int)data[i];
h *= 1099511628211ULL; // FNV prime
}
return (h == CACHE_EMPTY_KEY) ? h - 1 : h; // avoid colliding with the sentinel
}
/// Probe up to CACHE_MAX_PROBE slots for `key`; on a hit, write the cached
/// value to `out` and return true. An empty slot terminates the probe
/// early (the key cannot exist further along the chain).
__device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out) {
    int home = (int)(key & (uint64_t)c.mask);
    for (int step = 0; step < CACHE_MAX_PROBE; step++) {
        int at = (home + step) & c.mask;
        uint64_t stored = c.keys[at];
        if (stored == key) {
            out = c.values[at];
            return true;
        }
        if (stored == CACHE_EMPTY_KEY) return false;  // definitely absent
    }
    return false;  // probe budget exhausted
}
/// Insert key -> value: overwrite an existing entry for the same key,
/// claim the first empty slot, or — when every probed slot holds another
/// key — evict the home slot. Lock-free; concurrent writers may race.
__device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value) {
    int home = (int)(key & (uint64_t)c.mask);
    for (int step = 0; step < CACHE_MAX_PROBE; step++) {
        int at = (home + step) & c.mask;
        uint64_t stored = c.keys[at];
        if (stored == CACHE_EMPTY_KEY || stored == key) {
            c.keys[at] = key;
            c.values[at] = value;
            return;
        }
    }
    // Probe chain full: evict the home slot (already masked above).
    c.keys[home] = key;
    c.values[home] = value;
}

View file

@ -0,0 +1,121 @@
#pragma once
#include "types.cuh"
#include <vector>
#include <algorithm>
#include <numeric>
namespace heuristic_init {
// Fill every one of the `dim1` rows of `sol` with the same permutation
// `order` (length dim2); zero penalty and all objectives.
template<typename Sol>
static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
                                     int dim1, int dim2) {
    for (int row = 0; row < dim1; row++) {
        sol.dim2_sizes[row] = dim2;
        std::copy(order.begin(), order.begin() + dim2, sol.data[row]);
    }
    sol.penalty = 0.0f;
    std::fill(sol.objectives, sol.objectives + MAX_OBJ, 0.0f);
}
// Partition mode: slice `order` (total_elements values) contiguously
// across `dim1` rows without repetition; when the division is uneven the
// earlier rows take one extra element. Zeroes penalty and objectives.
template<typename Sol>
static void build_partition_from_order(Sol& sol, const std::vector<int>& order,
                                       int dim1, int total_elements) {
    int consumed = 0;
    int base = total_elements / dim1;
    int extra = total_elements % dim1;
    for (int row = 0; row < dim1; row++) {
        int take = base + (row < extra ? 1 : 0);
        sol.dim2_sizes[row] = take;
        for (int col = 0; col < take; col++)
            sol.data[row][col] = order[consumed++];
    }
    sol.penalty = 0.0f;
    std::fill(sol.objectives, sol.objectives + MAX_OBJ, 0.0f);
}
// Build heuristic seed solutions from pairwise matrices.
//
// For each matrix, four orderings are produced — row-sum ascending,
// row-sum descending, col-sum ascending, col-sum descending — and each
// becomes one candidate solution (same output order as before; the four
// copy-pasted sort blocks are deduplicated into one loop).
//
// Only Permutation encoding is supported; other encodings return empty.
// partition_mode (VRPTW-style): the matrix includes a depot at index 0,
// so sorting runs over indices 1..elem_count and the result is shifted
// to 0-based customer IDs afterwards.
template<typename Sol>
std::vector<Sol> build_from_matrices(const HeuristicMatrix* matrices, int num_matrices,
                                     int dim1, int dim2, EncodingType encoding,
                                     bool partition_mode = false, int total_elements = 0) {
    std::vector<Sol> results;
    if (encoding != EncodingType::Permutation) return results;
    int elem_count = partition_mode ? total_elements : dim2;
    if (num_matrices <= 0 || elem_count <= 0) return results;
    auto make_sol = [&](const std::vector<int>& order) {
        Sol sol{};
        if (partition_mode)
            build_partition_from_order(sol, order, dim1, total_elements);
        else
            build_sorted_permutation(sol, order, dim1, dim2);
        return sol;
    };
    for (int m = 0; m < num_matrices; m++) {
        const float* mat = matrices[m].data;
        int N = matrices[m].N;
        if (!mat || N < elem_count) continue;
        // Row and column sums in a single pass over the matrix.
        std::vector<float> row_sum(N, 0.0f);
        std::vector<float> col_sum(N, 0.0f);
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                row_sum[i] += mat[i * N + j];
                col_sum[j] += mat[i * N + j];
            }
        // With a depot present (partition mode, N > elem_count) sort the
        // customers at indices 1..elem_count; shift to 0-based IDs after.
        bool has_depot = partition_mode && N > elem_count;
        std::vector<int> idx;
        if (has_depot) {
            for (int i = 1; i <= elem_count; i++) idx.push_back(i);
        } else {
            idx.resize(elem_count);
            std::iota(idx.begin(), idx.end(), 0);
        }
        // Four (sort key, direction) variants, in the original emission
        // order: row asc, row desc, col asc, col desc.
        const std::vector<float>* sort_keys[4] = { &row_sum, &row_sum, &col_sum, &col_sum };
        const bool ascending[4] = { true, false, true, false };
        for (int v = 0; v < 4; v++) {
            const std::vector<float>& key = *sort_keys[v];
            auto order = idx;
            if (ascending[v])
                std::sort(order.begin(), order.end(),
                          [&](int a, int b) { return key[a] < key[b]; });
            else
                std::sort(order.begin(), order.end(),
                          [&](int a, int b) { return key[a] > key[b]; });
            if (has_depot)
                for (auto& val : order) val -= 1;  // depot index 1.. -> customer 0..
            results.push_back(make_sol(order));
        }
    }
    return results;
}
} // namespace heuristic_init

View file

@ -0,0 +1,258 @@
/**
* init_selection.cuh - 初始解采样择优 + NSGA-II 选择
*
* Host 端逻辑,在 solver 初始化阶段调用一次。
* 从 K × pop_size 个候选解中选出 pop_size 个作为初始种群。
*
* 选择策略:
* 1. 核心目标预留名额(按 importance 分配)
* 2. NSGA-II 选择(非支配排序 + 加权拥挤度)
* 3. 纯随机保底(多样性)
*
* 单目标时自动退化为 top-N 排序,无需分支。
*/
#pragma once
#include "types.cuh"
#include <algorithm>
#include <vector>
#include <cmath>
#include <cstring>
namespace init_sel {
// ============================================================
// 候选解的目标信息(从 GPU 下载后在 host 端使用)
// ============================================================
// Per-candidate objective record (downloaded from the GPU, consumed by
// the host-side selection routines below).
struct CandidateInfo {
int idx; // original index into the candidate array
float objs[MAX_OBJ]; // normalized objective values (smaller is better)
float penalty;
int rank; // non-domination rank (0 = Pareto front)
float crowding; // crowding distance
bool selected; // already chosen into the initial population
};
// ============================================================
// 非支配排序Fast Non-dominated Sort
// ============================================================
// 复杂度O(M × N²)M = 目标数N = 候选数
// 对初始化场景N ≤ 几千M ≤ 4完全可接受
// Fast non-dominated sort (NSGA-II). Fills `fronts` with the index sets of
// each successive non-domination layer and writes cands[i].rank.
// Complexity O(M * N^2): fine for the init scenario (N up to a few
// thousand, M <= 4).
inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
int num_obj,
std::vector<std::vector<int>>& fronts) {
int n = (int)cands.size();
std::vector<int> dom_count(n, 0); // how many solutions dominate i
std::vector<std::vector<int>> dom_set(n); // which solutions i dominates
// a dominates b: feasibility first, then Pareto dominance on objectives
// (a <= b in every objective, < in at least one).
auto dominates = [&](int a, int b) -> bool {
const auto& ca = cands[a];
const auto& cb = cands[b];
// Penalty handling: feasible beats infeasible; among two infeasible
// solutions the strictly smaller penalty dominates.
if (ca.penalty <= 0.0f && cb.penalty > 0.0f) return true;
if (ca.penalty > 0.0f && cb.penalty <= 0.0f) return false;
if (ca.penalty > 0.0f && cb.penalty > 0.0f) return ca.penalty < cb.penalty;
bool all_leq = true;
bool any_lt = false;
for (int m = 0; m < num_obj; m++) {
if (ca.objs[m] > cb.objs[m]) { all_leq = false; break; }
if (ca.objs[m] < cb.objs[m]) any_lt = true;
}
return all_leq && any_lt;
};
// Pairwise domination counts and domination sets.
for (int i = 0; i < n; i++) {
for (int j = i + 1; j < n; j++) {
if (dominates(i, j)) {
dom_set[i].push_back(j);
dom_count[j]++;
} else if (dominates(j, i)) {
dom_set[j].push_back(i);
dom_count[i]++;
}
}
}
// Peel off fronts: rank 0 = undominated; removing a front decrements the
// counts of everything it dominates, exposing the next front.
fronts.clear();
std::vector<int> current_front;
for (int i = 0; i < n; i++) {
if (dom_count[i] == 0) {
cands[i].rank = 0;
current_front.push_back(i);
}
}
int front_idx = 0;
while (!current_front.empty()) {
fronts.push_back(current_front);
std::vector<int> next_front;
for (int i : current_front) {
for (int j : dom_set[i]) {
dom_count[j]--;
if (dom_count[j] == 0) {
cands[j].rank = front_idx + 1;
next_front.push_back(j);
}
}
}
current_front = next_front;
front_idx++;
}
}
// ============================================================
// 加权拥挤度距离
// ============================================================
// 标准拥挤度 + importance 加权:核心目标维度上的间距贡献更大
// Crowding distance with importance weighting: per-objective gap
// contributions are scaled by importance[m], so spread along core
// objectives counts for more. Boundary solutions of each discriminating
// objective receive +1e18 so truncation always keeps them.
inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
                                       const std::vector<int>& front,
                                       int num_obj,
                                       const float* importance) {
    int fsize = (int)front.size();
    if (fsize <= 2) {
        // A front of one or two solutions is all boundary: keep them all.
        for (int id : front) cands[id].crowding = 1e18f;
        return;
    }
    for (int id : front) cands[id].crowding = 0.0f;
    std::vector<int> order(front.begin(), front.end());
    for (int m = 0; m < num_obj; m++) {
        // Order the front along objective m.
        std::sort(order.begin(), order.end(),
                  [&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });
        float range = cands[order[fsize - 1]].objs[m] - cands[order[0]].objs[m];
        if (range < 1e-12f) continue;  // no spread on this objective
        // Extremes are effectively infinite.
        cands[order[0]].crowding += 1e18f;
        cands[order[fsize - 1]].crowding += 1e18f;
        // Interior: neighbor gap, importance-weighted and range-normalized.
        float w = importance[m];
        for (int k = 1; k < fsize - 1; k++) {
            float gap = cands[order[k + 1]].objs[m] - cands[order[k - 1]].objs[m];
            cands[order[k]].crowding += w * (gap / range);
        }
    }
}
// ============================================================
// 主选择函数:从 N 个候选中选出 target 个
// ============================================================
// 返回被选中的候选索引
// Select (target - num_reserved_random) candidates from `cands` and return
// their indices; the remaining random slots are the caller's job.
// Phase 1 reserves importance-weighted per-objective top picks; phase 2
// fills the rest NSGA-II style (front by front, crowding-truncated).
// NOTE(review): assumes every cands[i].selected is false on entry — confirm
// at call sites; selected flags are mutated here.
inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
int num_obj,
const float* importance,
int target,
int num_reserved_random) {
// --- 1. reserved slots for core objectives ---
int num_reserve_total = target - num_reserved_random;
// Per-objective quota: importance[i] * 30% (the other 70% goes to NSGA-II).
float reserve_ratio = 0.3f;
std::vector<int> selected;
selected.reserve(target);
// For each objective, sort ascending and take the top `quota` unselected.
for (int m = 0; m < num_obj; m++) {
int quota = (int)(num_reserve_total * importance[m] * reserve_ratio);
if (quota < 1 && num_obj > 1) quota = 1; // at least one pick per objective
// Sort by objective m (smaller is better).
std::vector<int> by_obj(cands.size());
for (int i = 0; i < (int)cands.size(); i++) by_obj[i] = i;
std::sort(by_obj.begin(), by_obj.end(),
[&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });
int added = 0;
for (int i = 0; i < (int)by_obj.size() && added < quota; i++) {
int idx = by_obj[i];
if (!cands[idx].selected) {
cands[idx].selected = true;
selected.push_back(idx);
added++;
}
}
}
// --- 2. NSGA-II fill of the remaining slots ---
int remaining = target - num_reserved_random - (int)selected.size();
if (remaining > 0) {
// Rank candidates into non-domination fronts.
std::vector<std::vector<int>> fronts;
fast_nondominated_sort(cands, num_obj, fronts);
for (auto& front : fronts) {
if (remaining <= 0) break;
// Drop candidates already taken by the reservation phase.
std::vector<int> available;
for (int i : front) {
if (!cands[i].selected) available.push_back(i);
}
if ((int)available.size() <= remaining) {
// The whole front fits: take it all.
for (int i : available) {
cands[i].selected = true;
selected.push_back(i);
remaining--;
}
} else {
// Front must be truncated: keep the most spread-out by weighted crowding.
weighted_crowding_distance(cands, available, num_obj, importance);
std::sort(available.begin(), available.end(),
[&](int a, int b) { return cands[a].crowding > cands[b].crowding; });
for (int i = 0; i < remaining; i++) {
cands[available[i]].selected = true;
selected.push_back(available[i]);
}
remaining = 0;
}
}
}
return selected;
}
// ============================================================
// 单目标快速路径:直接按标量排序取 top
// ============================================================
// Single-objective fast path: rank all candidates feasibility-first
// (feasible before infeasible; among infeasible, smaller penalty wins;
// among feasible, smaller objs[0] wins) and take the best
// (target - num_reserved_random). Marks the picks as selected.
inline std::vector<int> top_n_select(std::vector<CandidateInfo>& cands,
                                     int target,
                                     int num_reserved_random) {
    int to_select = target - num_reserved_random;
    if (to_select < 0) to_select = 0;  // guard: a negative count fed to
                                       // reserve() converts to a huge size_t
    std::vector<int> indices(cands.size());
    for (int i = 0; i < (int)cands.size(); i++) indices[i] = i;
    std::sort(indices.begin(), indices.end(), [&](int a, int b) {
        bool a_feas = cands[a].penalty <= 0.0f;
        bool b_feas = cands[b].penalty <= 0.0f;
        if (a_feas != b_feas) return a_feas;                    // feasible first
        if (!a_feas) return cands[a].penalty < cands[b].penalty; // less violated first
        return cands[a].objs[0] < cands[b].objs[0];             // better objective first
    });
    std::vector<int> selected;
    selected.reserve(to_select);
    for (int i = 0; i < to_select && i < (int)indices.size(); i++) {
        selected.push_back(indices[i]);
        cands[indices[i]].selected = true;
    }
    return selected;
}
} // namespace init_sel

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,212 @@
/**
* population.cuh - 种群管理
*
* v2.0: Block 级架构
* - RNG 数组大小 = pop_size * block_size每个 block 内每个线程独立 RNG
* - 初始化 kernel 保持 1-thread-per-solution初始化只做一次不需要并行
* - find_best_kernel 保持单线程(种群规模不大)
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
// ============================================================
// Device 端 Kernel模板化
// ============================================================
// One thread initializes one solution: every row becomes an independent
// random permutation of 0..dim2_default-1.
template<typename Sol>
__global__ void init_permutation_kernel(Sol* pop, int pop_size,
                                        int dim1, int dim2_default,
                                        curandState* rng_states) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= pop_size) return;
    curandState* rng = &rng_states[gid];
    Sol& sol = pop[gid];
    for (int row = 0; row < dim1; row++) {
        sol.dim2_sizes[row] = dim2_default;
        for (int col = 0; col < dim2_default; col++) sol.data[row][col] = col;
        shuffle(sol.data[row], dim2_default, rng);
    }
    sol.penalty = 0.0f;
}
// One thread initializes one solution: every cell gets a uniform random
// bit (0 or 1).
template<typename Sol>
__global__ void init_binary_kernel(Sol* pop, int pop_size,
                                   int dim1, int dim2_default,
                                   curandState* rng_states) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= pop_size) return;
    curandState* rng = &rng_states[gid];
    Sol& sol = pop[gid];
    for (int row = 0; row < dim1; row++) {
        sol.dim2_sizes[row] = dim2_default;
        for (int col = 0; col < dim2_default; col++)
            sol.data[row][col] = curand(rng) % 2;
    }
    sol.penalty = 0.0f;
}
// One thread initializes one solution: every cell gets a uniform random
// integer in [lb, ub].
template<typename Sol>
__global__ void init_integer_kernel(Sol* pop, int pop_size,
                                    int dim1, int dim2_default,
                                    int lb, int ub,
                                    curandState* rng_states) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= pop_size) return;
    curandState* rng = &rng_states[gid];
    Sol& sol = pop[gid];
    int range = ub - lb + 1;
    for (int row = 0; row < dim1; row++) {
        sol.dim2_sizes[row] = dim2_default;
        for (int col = 0; col < dim2_default; col++)
            sol.data[row][col] = lb + (curand(rng) % range);
    }
    sol.penalty = 0.0f;
}
// ============================================================
// 多重集排列初始化 — 每个值 [0, N) 重复 R 次,总长度 N*R
// ============================================================
// 用于 JSP 工序排列编码N=num_jobs, R=num_ops值 j 出现 R 次表示工件 j
// Multiset-permutation init: each value in [0, num_values) appears exactly
// repeat_count times per row (total length num_values * repeat_count),
// then the row is shuffled. Used for JSP operation sequences, where value
// j occurring repeat_count times stands for job j.
template<typename Sol>
__global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
                                          int dim1, int num_values, int repeat_count,
                                          curandState* rng_states) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= pop_size) return;
    curandState* rng = &rng_states[gid];
    Sol& sol = pop[gid];
    int total = num_values * repeat_count;
    for (int row = 0; row < dim1; row++) {
        sol.dim2_sizes[row] = total;
        int pos = 0;
        for (int value = 0; value < num_values; value++)
            for (int rep = 0; rep < repeat_count; rep++)
                sol.data[row][pos++] = value;
        shuffle(sol.data[row], total, rng);
    }
    sol.penalty = 0.0f;
}
// ============================================================
// 分区初始化 — 元素 {0..total_elements-1} 不重复分配到 dim1 行
// ============================================================
// Partition init: shuffle {0..total_elements-1} inside row 0's buffer,
// then slice the shuffled array contiguously across dim1 rows (earlier
// rows absorb the remainder). Row 0's slice is already in place as the
// prefix of its own buffer, so only rows r > 0 copy their slice out;
// distinct rows never alias, so the copies are safe.
template<typename Sol>
__global__ void init_partition_kernel(Sol* pop, int pop_size,
int dim1, int total_elements,
curandState* rng_states) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= pop_size) return;
Sol& sol = pop[tid];
curandState* rng = &rng_states[tid];
// Row 0 temporarily holds the full shuffled permutation.
for (int i = 0; i < total_elements; i++) sol.data[0][i] = i;
shuffle(sol.data[0], total_elements, rng);
int idx = 0;
for (int r = 0; r < dim1; r++) {
int count = total_elements / dim1;
if (r < total_elements % dim1) count++; // earlier rows take the remainder
sol.dim2_sizes[r] = count;
if (r > 0) {
// Copy this row's slice from row 0 (row 0's own prefix stays put).
for (int c = 0; c < count; c++)
sol.data[r][c] = sol.data[0][idx + c];
}
idx += count;
}
sol.penalty = 0.0f;
}
// Single-thread linear scan over the population; writes the index of the
// best solution (per is_better under ObjConfig) to *best_idx. Launch with
// one block — every other thread exits immediately.
template<typename Sol>
__global__ void find_best_kernel(const Sol* pop, int pop_size,
                                 ObjConfig oc, int* best_idx) {
    if (blockIdx.x != 0 || threadIdx.x != 0) return;
    int winner = 0;
    for (int i = 1; i < pop_size; i++) {
        if (is_better(pop[i], pop[winner], oc)) winner = i;
    }
    *best_idx = winner;
}
// ============================================================
// Host 端 RAII 类(模板化)
// ============================================================
// RAII owner of the device-side population and its RNG states.
// Move-only; block-level architecture keeps one RNG state per
// (solution, block thread) pair.
template<typename Sol>
class Population {
public:
    Sol* d_solutions = nullptr;          // device array, `size` entries
    curandState* d_rng_states = nullptr; // device array, `rng_count` entries
    int size = 0;                        // number of solutions
    int rng_count = 0;                   // total RNG states = size * block_size

    Population() = default;

    // Allocate device buffers. `block_size` is the per-block thread count
    // of the block-level architecture. Safe to call repeatedly: previous
    // buffers are released first (the original silently leaked them).
    void allocate(int pop_size, int block_size = 128) {
        release();
        size = pop_size;
        rng_count = pop_size * block_size;
        CUDA_CHECK(cudaMalloc(&d_solutions, sizeof(Sol) * size));
        CUDA_CHECK(cudaMalloc(&d_rng_states, sizeof(curandState) * rng_count));
    }

    // Seed every RNG state on the device.
    void init_rng(unsigned seed, int block_size = 256) {
        int grid = calc_grid_size(rng_count, block_size);
        init_curand_kernel<<<grid, block_size>>>(d_rng_states, seed, rng_count);
        CUDA_CHECK_LAST();
    }

    // Launch the init kernel matching the problem's row mode / encoding.
    void init_population(const ProblemConfig& cfg, int block_size = 256) {
        int grid = calc_grid_size(size, block_size);
        if (cfg.row_mode == RowMode::Partition) {
            init_partition_kernel<<<grid, block_size>>>(
                d_solutions, size, cfg.dim1, cfg.total_elements, d_rng_states);
        } else if (cfg.encoding == EncodingType::Permutation && cfg.perm_repeat_count > 1) {
            // Multiset permutation (e.g. JSP): each value repeats perm_repeat_count times.
            int num_values = cfg.dim2_default / cfg.perm_repeat_count;
            init_multiset_perm_kernel<<<grid, block_size>>>(
                d_solutions, size, cfg.dim1, num_values, cfg.perm_repeat_count, d_rng_states);
        } else {
            switch (cfg.encoding) {
                case EncodingType::Permutation:
                    init_permutation_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default, d_rng_states);
                    break;
                case EncodingType::Binary:
                    init_binary_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default, d_rng_states);
                    break;
                case EncodingType::Integer:
                    init_integer_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default,
                        cfg.value_lower_bound, cfg.value_upper_bound,
                        d_rng_states);
                    break;
            }
        }
        CUDA_CHECK_LAST();
    }

    // Blocking copy of one solution back to the host.
    Sol download_solution(int idx) const {
        Sol h_sol;
        CUDA_CHECK(cudaMemcpy(&h_sol, d_solutions + idx, sizeof(Sol), cudaMemcpyDeviceToHost));
        return h_sol;
    }

    ~Population() { release(); }

    Population(const Population&) = delete;
    Population& operator=(const Population&) = delete;

    Population(Population&& o) noexcept
        : d_solutions(o.d_solutions), d_rng_states(o.d_rng_states),
          size(o.size), rng_count(o.rng_count) {
        o.d_solutions = nullptr; o.d_rng_states = nullptr;
        o.size = 0; o.rng_count = 0;
    }

    // Move assignment was missing: with a user-declared destructor the
    // compiler does not generate one, so the class was move-constructible
    // but not move-assignable. Provided for symmetry with the move ctor.
    Population& operator=(Population&& o) noexcept {
        if (this != &o) {
            release();
            d_solutions = o.d_solutions; d_rng_states = o.d_rng_states;
            size = o.size; rng_count = o.rng_count;
            o.d_solutions = nullptr; o.d_rng_states = nullptr;
            o.size = 0; o.rng_count = 0;
        }
        return *this;
    }

private:
    // Free device buffers and return to the empty state; idempotent.
    void release() {
        if (d_solutions) cudaFree(d_solutions);
        if (d_rng_states) cudaFree(d_rng_states);
        d_solutions = nullptr;
        d_rng_states = nullptr;
        size = 0;
        rng_count = 0;
    }
};

View file

@ -0,0 +1,125 @@
/**
* relation_matrix.cuh - G/O 关系矩阵管理
*
* G[i][j]: 分组倾向(元素 i 和 j 应在同一行的倾向,对称)
* O[i][j]: 排序倾向(元素 i 应排在 j 前面的倾向,不对称)
*
* 更新来源:历史最优解统计
* 每当 host 端获取到当前 best 解,扫描所有元素对关系:
* - 同行 → G[i][j] 增强
* - i 在 j 前 → O[i][j] 增强
* 使用 EMA 衰减M[i][j] = α * M[i][j] + (1-α) * signal
*
* 生命周期:
* 1. relation_matrix_create(N) — 分配 host/device 内存,初始化为 0
* 2. relation_matrix_update(rm, sol, dim1) — 从一个解更新 G/Ohost 端)
* 3. relation_matrix_upload(rm) — 上传 h_G/h_O 到 d_G/d_O
* 4. relation_matrix_destroy(rm) — 释放内存
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include <cstring>
// ============================================================
// 创建 / 销毁
// ============================================================
// Allocate host + device G/O matrices (N x N, row-major) and zero them.
// Must later be released with relation_matrix_destroy().
inline RelationMatrix relation_matrix_create(int N, float decay = 0.95f) {
    RelationMatrix rm;
    rm.N = N;
    rm.decay = decay;
    rm.update_count = 0;
    // Compute the element count in size_t: the original `new float[N * N]`
    // multiplied in int, which overflows for N > ~46340 even though
    // `bytes` was already size_t.
    size_t elems = (size_t)N * (size_t)N;
    size_t bytes = elems * sizeof(float);
    rm.h_G = new float[elems];
    rm.h_O = new float[elems];
    memset(rm.h_G, 0, bytes);
    memset(rm.h_O, 0, bytes);
    CUDA_CHECK(cudaMalloc(&rm.d_G, bytes));
    CUDA_CHECK(cudaMalloc(&rm.d_O, bytes));
    CUDA_CHECK(cudaMemset(rm.d_G, 0, bytes));
    CUDA_CHECK(cudaMemset(rm.d_O, 0, bytes));
    return rm;
}
// Release host and device matrices and leave `rm` in the empty state.
inline void relation_matrix_destroy(RelationMatrix& rm) {
    delete[] rm.h_G;
    delete[] rm.h_O;
    CUDA_CHECK(cudaFree(rm.d_G));
    CUDA_CHECK(cudaFree(rm.d_O));
    rm.h_G = nullptr;
    rm.h_O = nullptr;
    rm.d_G = nullptr;
    rm.d_O = nullptr;
    rm.N = 0;
}
// ============================================================
// 从一个解更新 G/Ohost 端)
// ============================================================
// sol: 当前最优解(已下载到 host
// dim1: 实际使用的行数
//
// 逻辑:
// 对 sol 中每对元素 (val_a, val_b)
// 如果在同一行 → G[val_a][val_b] 增强
// 如果 val_a 在 val_b 前面 → O[val_a][val_b] 增强
//
// 注意:元素值 val 必须在 [0, N) 范围内才有意义
// 对于 partition 编码VRP元素值就是客户编号
// 对于单行排列TSP元素值就是城市编号
// Update G/O from one host-resident solution (typically the current best):
//   same row            -> G[a][b] and G[b][a] reinforced
//   a precedes b in row  -> O[a][b] reinforced
// EMA-style: all entries decay by alpha first, then each observed pair
// adds (1 - alpha) * signal; entries are clamped to [0, 1] afterwards.
// Element values outside [0, N) are ignored.
template<typename Sol>
void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
    int N = rm.N;
    float alpha = rm.decay;
    float signal_strength = 1.0f;
    // size_t element count: `N * N` as int overflows for large N.
    size_t elems = (size_t)N * (size_t)N;
    // Decay every existing entry.
    for (size_t i = 0; i < elems; i++) {
        rm.h_G[i] *= alpha;
        rm.h_O[i] *= alpha;
    }
    // Scan all in-row element pairs of the solution.
    for (int r = 0; r < dim1; r++) {
        int sz = sol.dim2_sizes[r];
        for (int c1 = 0; c1 < sz; c1++) {
            int val_a = sol.data[r][c1];
            if (val_a < 0 || val_a >= N) continue;
            for (int c2 = c1 + 1; c2 < sz; c2++) {
                int val_b = sol.data[r][c2];
                if (val_b < 0 || val_b >= N) continue;
                // size_t indexing avoids int overflow of val * N for large N.
                size_t ab = (size_t)val_a * N + val_b;
                size_t ba = (size_t)val_b * N + val_a;
                // Same row -> symmetric G reinforcement.
                rm.h_G[ab] += (1.0f - alpha) * signal_strength;
                rm.h_G[ba] += (1.0f - alpha) * signal_strength;
                // val_a precedes val_b -> directed O reinforcement.
                rm.h_O[ab] += (1.0f - alpha) * signal_strength;
            }
        }
    }
    // Clamp to [0, 1].
    for (size_t i = 0; i < elems; i++) {
        if (rm.h_G[i] > 1.0f) rm.h_G[i] = 1.0f;
        if (rm.h_O[i] > 1.0f) rm.h_O[i] = 1.0f;
    }
    rm.update_count++;
}
// ============================================================
// 上传到 GPU
// ============================================================
// Push the host-side G/O matrices to their device mirrors (blocking).
inline void relation_matrix_upload(const RelationMatrix& rm) {
    const size_t bytes = sizeof(float) * (size_t)rm.N * rm.N;
    CUDA_CHECK(cudaMemcpy(rm.d_G, rm.h_G, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(rm.d_O, rm.h_O, bytes, cudaMemcpyHostToDevice));
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,721 @@
/**
* types.cuh - 核心类型定义
*
* 包含编码类型、Solution 模板、ProblemConfig/SolverConfig、
* SeqRegistryAOS 序列级权重、KStepConfig多步执行
* RelationMatrixG/O 关系矩阵、ProblemBaseCRTP 基类)
*/
#pragma once
// ============================================================
// Compile-time constants
// ============================================================
constexpr int MAX_OBJ = 4; // max objectives (16 bytes; not worth templating)
constexpr int MAX_SEQ = 32; // max sequences (~16 built-in + <=8 custom, with headroom)
constexpr int MAX_K = 3; // max steps for multi-step execution (K = 1, 2, 3)
// AOS weight bounds (applied after normalization)
constexpr float AOS_WEIGHT_FLOOR = 0.05f; // weight floor (guarantees exploration)
constexpr float AOS_WEIGHT_CAP = 0.35f; // weight cap (prevents winner-takes-all)
// ============================================================
// Enum types
// ============================================================
enum class EncodingType {
Permutation, // permutation: elements are unique
Binary, // 0-1 encoding; flip is the main operator
Integer // bounded integers
};
enum class RowMode {
Single, // dim1 = 1, single row (TSP/QAP/Knapsack — most problems)
Fixed, // dim1 > 1, equal immutable row lengths (JSP-Int/Schedule); SPLIT/MERGE forbidden
Partition // dim1 > 1, elements partitioned across rows, variable lengths (CVRP/VRPTW)
};
enum class ObjDir {
Minimize,
Maximize
};
// multi-objective comparison mode
enum class CompareMode {
Weighted, // weighted sum: sum(weight[i] * obj[i]), smaller is better
Lexicographic // lexicographic: compare objective by objective in priority order
};
enum class MigrateStrategy {
Ring, // ring: each island's best -> neighbor island's worst (slow spread, high diversity)
TopN, // global Top-N round-robin distribution (fast spread, strong convergence)
Hybrid // both: Top-N replaces the worst + Ring replaces the second-worst
};
// ============================================================
// SeqID — unified OperationSequence numbering
// ============================================================
// Each SeqID names one concrete search operation (atomic or multi-step).
// AOS weight tracking granularity = SeqID: every sequence has its own weight.
//
// Naming rule: SEQ_{encoding}_{operation}
// Row-level operations shared across encodings use one common numbering.
namespace seq {
// --- Permutation, in-row (element level) ---
constexpr int SEQ_PERM_SWAP = 0; // swap two positions
constexpr int SEQ_PERM_REVERSE = 1; // 2-opt: reverse an interval
constexpr int SEQ_PERM_INSERT = 2; // insert: move an element to a new position
constexpr int SEQ_PERM_3OPT = 3; // 3-opt (cut three edges, reconnect)
// --- Permutation, in-row (segment level) ---
constexpr int SEQ_PERM_OR_OPT = 4; // or-opt: move k consecutive elements
// --- Permutation, in-row (composite) ---
constexpr int SEQ_PERM_DOUBLE_SWAP = 30; // two consecutive swaps, same row
constexpr int SEQ_PERM_TRIPLE_SWAP = 31; // three consecutive swaps, same row
// --- Permutation, cross-row (element level) ---
constexpr int SEQ_PERM_CROSS_RELOCATE = 5; // move one element to another row
constexpr int SEQ_PERM_CROSS_SWAP = 6; // swap one element between rows
// --- Permutation, cross-row (segment level) ---
constexpr int SEQ_PERM_SEG_RELOCATE = 7; // move a segment to another row
constexpr int SEQ_PERM_SEG_SWAP = 8; // swap segments between rows (2-opt*)
constexpr int SEQ_PERM_CROSS_EXCHANGE = 9; // exchange segments (order preserved)
// --- Binary, in-row (element level) ---
constexpr int SEQ_BIN_FLIP = 0; // flip one bit
constexpr int SEQ_BIN_SWAP = 1; // swap two bits
// --- Binary, in-row (segment level) ---
constexpr int SEQ_BIN_SEG_FLIP = 2; // flip k consecutive bits
constexpr int SEQ_BIN_K_FLIP = 3; // flip k random bits at once
// --- Binary, cross-row ---
constexpr int SEQ_BIN_CROSS_SWAP = 4; // swap one bit between two rows
constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5; // swap one segment between two rows
// --- Shared: row level (encoding-agnostic) ---
constexpr int SEQ_ROW_SWAP = 10; // swap two rows
constexpr int SEQ_ROW_REVERSE = 11; // reverse the row order
constexpr int SEQ_ROW_SPLIT = 12; // split one row into two
constexpr int SEQ_ROW_MERGE = 13; // merge two rows
// --- Special ---
constexpr int SEQ_PERTURBATION = 14; // perturbation (multi-step, irreversible)
// --- Integer, in-row (element level) ---
constexpr int SEQ_INT_RANDOM_RESET = 0; // reset one position to a random value in [lb, ub]
constexpr int SEQ_INT_DELTA = 1; // one position +/- k, clamped to [lb, ub]
constexpr int SEQ_INT_SWAP = 2; // swap the values of two positions
// --- Integer, in-row (segment level) ---
constexpr int SEQ_INT_SEG_RESET = 3; // reset k consecutive positions
constexpr int SEQ_INT_K_DELTA = 4; // k random positions, each +/- 1
// --- Integer, cross-row ---
constexpr int SEQ_INT_CROSS_SWAP = 5; // swap one position between two rows
// --- LNS (large neighborhood search) ---
constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20; // shuffle a consecutive segment
constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21; // shuffle scattered random positions
constexpr int SEQ_LNS_GUIDED_REBUILD = 22; // relation-matrix-guided rebuild
} // namespace seq
// ============================================================
// RelationMatrix — G/O relation matrices (GPU global memory)
// ============================================================
// G[i][j]: grouping tendency of elements i and j (symmetric; larger means
//          more likely to share a row)
// O[i][j]: tendency of element i to precede j (asymmetric)
// Stored as flat row-major [N * N] arrays.
// Kept dense for small N (< 200); sparsification is a possible P2 step.
//
// Updated: host side, in the gap between batches.
// Read:    inside kernels by SEQ_LNS_GUIDED_REBUILD.
struct RelationMatrix {
float* d_G; // G matrix on the GPU [N * N]
float* d_O; // O matrix on the GPU [N * N]
float* h_G; // host-side G (updated here, then uploaded)
float* h_O; // host-side O
int N; // total number of elements
float decay; // EMA decay coefficient alpha (default 0.95)
int update_count; // number of updates so far (cold-start detection)
};
// ============================================================
// SeqRegistry — registry of sequences available at runtime
// ============================================================
// Derived from EncodingType and dim1; copied to the GPU for use by
// sample_sequence().
enum class SeqCategory : int {
InRow = 0, // in-row operators: swap, reverse, insert, ...
CrossRow = 1, // cross-row operators: cross_relocate, cross_swap, seg_relocate, ...
RowLevel = 2, // row-level operators: row_swap, row_reverse, split, merge
LNS = 3, // large neighborhood search
};
struct SeqRegistry {
int ids[MAX_SEQ]; // SeqIDs of the available sequences
int count; // number of available sequences
float weights[MAX_SEQ]; // current per-sequence weight (normalized for sampling)
float max_w[MAX_SEQ]; // per-sequence weight cap (0 = use the global cap)
SeqCategory categories[MAX_SEQ]; // category per sequence (constraint-driven selection)
};
// ============================================================
// KStepConfig — step-count selection for multi-step execution
// ============================================================
// K=1: single step (current behavior); K=2/3: execute several sequences
// before evaluating. This is the first layer of the two-layer weights.
//
// Adaptive policy:
// - start with a dominant K=1 weight (conservative), small K>1 weights
// - an improvement from K>1 -> grow that K's weight
// - prolonged stagnation -> reset/grow K>1 weights (escape local optima)
struct KStepConfig {
float weights[MAX_K]; // sampling weights for K = 1, 2, 3 (normalized)
int stagnation_count; // consecutive batches without improvement (reset trigger)
int stagnation_limit; // threshold that triggers the reset (default: 5 batches)
};
// Default K-step sampling weights: single-step moves dominate at start,
// multi-step exploration stays rare until stagnation adapts it.
inline KStepConfig build_kstep_config() {
    KStepConfig kc;
    kc.weights[0] = 0.80f; // K=1 dominates initially
    kc.weights[1] = 0.15f; // K=2: light exploration
    kc.weights[2] = 0.05f; // K=3: rare exploration
    kc.stagnation_count = 0;
    kc.stagnation_limit = 5;
    return kc;
}
// ============================================================
// ProblemProfile — problem profile inferred from structure
// ============================================================
// Layer 1: purely structural inference (semantics-agnostic); drives
// operator registration and initial weights. A finer-grained layer 2
// (multi-attribute, heavily constrained, ...) may be added later.
enum class ScaleClass { Small, Medium, Large };
enum class StructClass { SingleSeq, MultiFixed, MultiPartition };
struct ProblemProfile {
EncodingType encoding;
ScaleClass scale;
StructClass structure;
float cross_row_prob; // probability of sampling a cross-row move
};
// classify_problem() is defined after ProblemConfig
// classify_problem() is defined after ProblemConfig
// ============================================================
// Weight presets — driven by ScaleClass
// ============================================================
struct WeightPreset {
    float w_cubic;      // initial weight for 3-opt (see build_seq_registry)
    float w_quadratic;  // initial weight for or-opt
    float w_lns;        // initial weight for the LNS sequences
    float lns_cap;      // per-sequence weight cap applied to LNS sequences
};
// Return the initial-weight preset for a given problem scale.
// Larger problems get smaller weights for the expensive operators.
inline WeightPreset get_weight_preset(ScaleClass scale) {
    // {w_cubic, w_quadratic, w_lns, lns_cap}
    if (scale == ScaleClass::Medium) return { 0.30f, 0.70f, 0.004f, 0.01f };
    if (scale == ScaleClass::Large)  return { 0.05f, 0.30f, 0.001f, 0.01f };
    return { 0.50f, 0.80f, 0.006f, 0.01f };  // Small (also the fallback)
}
// classify_problem() 和 build_seq_registry() 定义在 ProblemConfig 之后
// ============================================================
// Solution<D1, D2> — templated solution representation
// ============================================================
// D1: max number of rows (TSP=1, VRP<=16, Schedule<=8)
// D2: max columns per row (TSP<=64, knapsack<=32)
// Each Problem picks the smallest sufficient D1/D2 so the compiler
// generates a compact struct.
template<int D1, int D2>
struct Solution {
    static constexpr int DIM1 = D1;  // compile-time row cap
    static constexpr int DIM2 = D2;  // compile-time column cap
    int data[D1][D2];                // D1*D2*4 bytes
    int dim2_sizes[D1];              // D1*4 bytes
    float objectives[MAX_OBJ];       // 16 bytes (fixed)
    float penalty;                   // 4 bytes; <= 0 is treated as feasible (see is_better)
};
// ============================================================
// ProblemConfig — runtime metadata describing the problem
// ============================================================
// Fix: every member now carries an explicit initializer. Previously
// encoding/dim1/dim2_default/num_objectives/obj_dirs/obj_weights and the
// value bounds were left uninitialized while the rest of the struct used
// NSDMIs, so a default-constructed config exposed indeterminate values.
struct ProblemConfig {
    EncodingType encoding{};         // value-initialized (first enumerator) until set by the user
    int dim1 = 1;                    // actual number of rows used (<= D1)
    int dim2_default = 0;            // actual number of columns used (<= D2)
    int num_objectives = 1;
    ObjDir obj_dirs[MAX_OBJ] = {};   // value-initialized until filled (e.g. by fill_obj_config)
    float obj_weights[MAX_OBJ] = {1.0f, 1.0f, 1.0f, 1.0f}; // weights for Weighted mode (equal by default)
    // Multi-objective comparison
    CompareMode compare_mode = CompareMode::Weighted;
    int obj_priority[MAX_OBJ] = {0, 1, 2, 3}; // comparison order (indices) for Lexicographic mode
    float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f}; // lexicographic tolerance: |diff| <= tol counts as equal
    int value_lower_bound = 0;       // inclusive value bounds (Integer encoding)
    int value_upper_bound = 0;
    // v3.4: unified row mode
    RowMode row_mode = RowMode::Single; // row mode: Single/Fixed/Partition
    float cross_row_prob = 0.0f;     // probability of cross-row moves (0 = in-row operations only)
    int total_elements = 0;          // total element count in Partition mode
    int perm_repeat_count = 1;       // repeats of each value in a permutation (1 = standard, >1 = multiset permutation)
};
// ============================================================
// SolverConfig — solver parameters
// ============================================================
struct SolverConfig {
    int pop_size = 0;          // population size (0 = auto-match the GPU's max parallelism)
    int max_gen = 1000;        // maximum number of generations
    float mutation_rate = 0.1f;
    unsigned seed = 42;
    bool verbose = true;
    int print_every = 100;
    // Island-model parameters
    int num_islands = 1;       // 0 = adaptive, 1 = pure hill climbing (no islands), >1 = island model
    int migrate_interval = 100; // run one migration every this many generations
    MigrateStrategy migrate_strategy = MigrateStrategy::Hybrid;
    // Simulated-annealing parameters
    float sa_temp_init = 0.0f; // initial temperature (0 = SA disabled, pure hill climbing)
    float sa_alpha = 0.998f;   // cooling rate (temperature multiplied by alpha each generation)
    // v1.0: crossover parameters
    float crossover_rate = 0.1f; // probability of performing crossover (vs mutation) per generation
    // v2.0: adaptive operator selection
    bool use_aos = false;      // enable AOS (operator weights updated between batches)
    float aos_weight_floor = AOS_WEIGHT_FLOOR; // floor, overridable at runtime
    float aos_weight_cap = AOS_WEIGHT_CAP;     // cap, overridable at runtime
    // v2.1: initial-solution strategy
    int init_oversample = 4;   // oversampling factor (1 = no sample-and-select, i.e. purely random)
    float init_random_ratio = 0.3f; // share of purely random solutions (diversity floor)
    // v3.0: production usability
    float time_limit_sec = 0.0f; // time limit (0 = unlimited, run to max_gen)
    int stagnation_limit = 0;  // convergence detection: reheat after this many batches with no improvement (0 = disabled)
    float reheat_ratio = 0.5f; // fraction of the initial temperature restored on reheat
    // v3.5: CUDA Graph
    bool use_cuda_graph = false; // enable CUDA Graph (reduces kernel-launch overhead)
    // v3.6: AOS update-frequency control
    int aos_update_interval = 10; // update AOS weights every this many batches (lowers cudaMemcpy sync frequency)
    // v4.0: constraint-directed + phased search
    bool use_constraint_directed = false; // enable constraint direction (adjust cross-row operator weights by penalty share)
    bool use_phased_search = false; // enable phased search (adjust global floor/cap by progress)
    // Phased-search parameters: three-phase thresholds
    float phase_explore_end = 0.30f; // end of the exploration phase (progress fraction)
    float phase_refine_start = 0.70f; // start of the refinement phase (progress fraction)
    // Constraint-direction parameter
    float constraint_boost_max = 2.5f; // max boost factor for cross-row operator caps under heavy constraints
};
// ============================================================
// classify_problem — infer a ProblemProfile from a ProblemConfig
// ============================================================
// Scale is bucketed by dim2_default; structure by dim1 and row_mode.
inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
    ProblemProfile prof;
    prof.encoding = pcfg.encoding;
    const int n = pcfg.dim2_default;
    prof.scale = (n <= 100) ? ScaleClass::Small
               : (n <= 250) ? ScaleClass::Medium
                            : ScaleClass::Large;
    if (pcfg.dim1 <= 1) {
        prof.structure = StructClass::SingleSeq;
    } else {
        prof.structure = (pcfg.row_mode == RowMode::Partition)
                             ? StructClass::MultiPartition
                             : StructClass::MultiFixed;
    }
    prof.cross_row_prob = pcfg.cross_row_prob;
    return prof;
}
// ============================================================
// build_seq_registry — operator registration driven by ProblemProfile
// ============================================================
// Fills a SeqRegistry with the sequences applicable to the problem's
// encoding and structure, assigns initial weights (scaled by the
// ScaleClass preset and the cross-row probability), then normalizes the
// weights into a sampling distribution.
// Refactor: the row-level registration block was duplicated verbatim in
// all three encoding branches; it is now a single helper lambda so the
// branches cannot drift out of sync.
inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
    SeqRegistry reg;
    reg.count = 0;
    for (int i = 0; i < MAX_SEQ; i++) {
        reg.ids[i] = -1; reg.weights[i] = 0.0f;
        reg.max_w[i] = 0.0f; reg.categories[i] = SeqCategory::InRow;
    }
    // Append one sequence (cap = 0.0f means "no per-sequence cap, use the global cap").
    auto add = [&](int id, float w, SeqCategory cat, float cap = 0.0f) {
        if (reg.count >= MAX_SEQ) return;
        reg.ids[reg.count] = id;
        reg.weights[reg.count] = w;
        reg.max_w[reg.count] = cap;
        reg.categories[reg.count] = cat;
        reg.count++;
    };
    WeightPreset wp = get_weight_preset(prof.scale);
    bool multi_row = (prof.structure != StructClass::SingleSeq);
    float cr = prof.cross_row_prob;
    // Row-level operators are identical for every encoding.
    auto add_row_ops = [&]() {
        if (!multi_row) return;
        add(seq::SEQ_ROW_SWAP, 0.3f, SeqCategory::RowLevel);
        add(seq::SEQ_ROW_REVERSE, 0.2f, SeqCategory::RowLevel);
        if (prof.structure == StructClass::MultiPartition) {
            add(seq::SEQ_ROW_SPLIT, 0.2f, SeqCategory::RowLevel);
            add(seq::SEQ_ROW_MERGE, 0.2f, SeqCategory::RowLevel);
        }
    };
    if (prof.encoding == EncodingType::Permutation) {
        add(seq::SEQ_PERM_SWAP, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_REVERSE, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_INSERT, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_DOUBLE_SWAP, 0.5f, SeqCategory::InRow);
        add(seq::SEQ_PERM_TRIPLE_SWAP, 0.3f, SeqCategory::InRow);
        add(seq::SEQ_PERM_3OPT, wp.w_cubic, SeqCategory::InRow);
        add(seq::SEQ_PERM_OR_OPT, wp.w_quadratic, SeqCategory::InRow);
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_PERM_CROSS_RELOCATE, 0.6f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_CROSS_SWAP, 0.6f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_SEG_RELOCATE, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_SEG_SWAP, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_CROSS_EXCHANGE, 0.4f * cr, SeqCategory::CrossRow);
        }
        add_row_ops();
        // LNS sequences are registered with an explicit cap so they never
        // dominate the sampling mix.
        add(seq::SEQ_LNS_SEGMENT_SHUFFLE, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
        add(seq::SEQ_LNS_SCATTER_SHUFFLE, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
        add(seq::SEQ_LNS_GUIDED_REBUILD, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
    }
    else if (prof.encoding == EncodingType::Binary) {
        add(seq::SEQ_BIN_FLIP, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_BIN_SWAP, 0.8f, SeqCategory::InRow);
        add(seq::SEQ_BIN_SEG_FLIP, 0.6f, SeqCategory::InRow);
        add(seq::SEQ_BIN_K_FLIP, 0.6f, SeqCategory::InRow);
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_BIN_CROSS_SWAP, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_BIN_SEG_CROSS_SWAP, 0.4f * cr, SeqCategory::CrossRow);
        }
        add_row_ops();
    }
    else if (prof.encoding == EncodingType::Integer) {
        add(seq::SEQ_INT_RANDOM_RESET, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_INT_DELTA, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_INT_SWAP, 0.8f, SeqCategory::InRow);
        add(seq::SEQ_INT_SEG_RESET, 0.6f, SeqCategory::InRow);
        add(seq::SEQ_INT_K_DELTA, 0.6f, SeqCategory::InRow);
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_INT_CROSS_SWAP, 0.5f * cr, SeqCategory::CrossRow);
        }
        add_row_ops();
    }
    // Normalize weights into a sampling distribution.
    float sum = 0.0f;
    for (int i = 0; i < reg.count; i++) sum += reg.weights[i];
    if (sum > 0.0f) {
        for (int i = 0; i < reg.count; i++) reg.weights[i] /= sum;
    }
    return reg;
}
// ============================================================
// ObjConfig — objective-comparison config uploaded to the GPU (compact)
// ============================================================
struct ObjConfig {
    int num_obj;             // number of active objectives (<= MAX_OBJ)
    CompareMode mode;        // Weighted or Lexicographic
    ObjDir dirs[MAX_OBJ];    // optimization direction per objective
    float weights[MAX_OBJ];  // weights for Weighted mode
    int priority[MAX_OBJ];   // comparison order for Lexicographic mode (rank -> objective index)
    float tolerance[MAX_OBJ];// Lexicographic tolerance per objective
};
// Build an ObjConfig from a ProblemConfig (host side).
inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
    ObjConfig cfg;
    cfg.num_obj = pcfg.num_objectives;
    cfg.mode = pcfg.compare_mode;
    // Copy all MAX_OBJ slots, not just the active ones, so the device-side
    // struct is fully populated.
    for (int k = 0; k < MAX_OBJ; ++k) {
        cfg.dirs[k]      = pcfg.obj_dirs[k];
        cfg.weights[k]   = pcfg.obj_weights[k];
        cfg.priority[k]  = pcfg.obj_priority[k];
        cfg.tolerance[k] = pcfg.obj_tolerance[k];
    }
    return cfg;
}
// ============================================================
// SolveResult — return value of solve()
// ============================================================
enum class StopReason { MaxGen, TimeLimit, Stagnation };
template<typename Sol>
struct SolveResult {
    Sol best_solution;                           // best solution found
    float elapsed_ms = 0.0f;                     // elapsed solve time in milliseconds
    int generations = 0;                         // generations actually executed
    StopReason stop_reason = StopReason::MaxGen; // why the solver stopped
};
// ============================================================
// Objective importance mapping — unify Weighted / Lexicographic
// ============================================================
// Used for initial-solution selection (NSGA-II weighted crowding plus
// reserved slots for core objectives).
//   Weighted:      importance[i] = weight[i] / sum(weights)
//   Lexicographic: importance[i] = 0.5^rank(i) / sum(0.5^rank)
//                  -> first priority ~57%, second ~29%, third ~14%
// Bug fix: oc.priority maps rank -> objective index (see is_better, which
// reads oc.priority[p] as the objective compared at rank p). The rank of
// objective i is therefore the position p with priority[p] == i. The old
// code used priority[i] directly — the inverse permutation — which is only
// correct when the permutation is its own inverse (identity or swaps).
inline void compute_importance(const ObjConfig& oc, float* importance) {
    float sum = 0.0f;
    for (int i = 0; i < oc.num_obj; i++) {
        if (oc.mode == CompareMode::Weighted) {
            importance[i] = oc.weights[i];
        } else {
            // Find the rank of objective i in the priority order.
            int rank = 0;
            for (int p = 0; p < oc.num_obj; p++) {
                if (oc.priority[p] == i) { rank = p; break; }
            }
            importance[i] = 1.0f;
            for (int r = 0; r < rank; r++) importance[i] *= 0.5f; // 0.5^rank
        }
        sum += importance[i];
    }
    if (sum > 0.0f) {
        for (int i = 0; i < oc.num_obj; i++)
            importance[i] /= sum;
    }
}
// ============================================================
// Comparison utilities — Weighted / Lexicographic support
// ============================================================
// Normalize an objective value to "smaller is better": Maximize
// objectives are negated, Minimize objectives pass through.
__device__ __host__ inline float normalize_obj(float val, ObjDir dir) {
    if (dir == ObjDir::Maximize) return -val;
    return val;
}
// 核心比较a 是否优于 b
template<typename Sol>
__device__ inline bool is_better(const Sol& a, const Sol& b,
const ObjConfig& oc) {
// penalty 优先:可行解一定优于不可行解
if (a.penalty <= 0.0f && b.penalty > 0.0f) return true;
if (a.penalty > 0.0f && b.penalty <= 0.0f) return false;
if (a.penalty > 0.0f && b.penalty > 0.0f) return a.penalty < b.penalty;
if (oc.mode == CompareMode::Weighted) {
// 加权求和权重已包含方向信息Maximize 目标用负权重,或由 normalize_obj 处理)
float sum_a = 0.0f, sum_b = 0.0f;
for (int i = 0; i < oc.num_obj; i++) {
float na = normalize_obj(a.objectives[i], oc.dirs[i]);
float nb = normalize_obj(b.objectives[i], oc.dirs[i]);
sum_a += oc.weights[i] * na;
sum_b += oc.weights[i] * nb;
}
return sum_a < sum_b;
} else {
// 字典法:按 priority 顺序逐目标比较
for (int p = 0; p < oc.num_obj; p++) {
int idx = oc.priority[p];
float va = normalize_obj(a.objectives[idx], oc.dirs[idx]);
float vb = normalize_obj(b.objectives[idx], oc.dirs[idx]);
float diff = va - vb;
if (diff < -oc.tolerance[idx]) return true; // a 明显更好
if (diff > oc.tolerance[idx]) return false; // b 明显更好
// 在容差内视为相等 → 继续比较下一个目标
}
return false; // 所有目标都在容差内相等
}
}
// 标量化SA 接受概率用):返回越小越好的标量
template<typename Sol>
__device__ __host__ inline float scalar_objective(const Sol& sol,
const ObjConfig& oc) {
if (oc.mode == CompareMode::Weighted) {
float sum = 0.0f;
for (int i = 0; i < oc.num_obj; i++)
sum += oc.weights[i] * normalize_obj(sol.objectives[i], oc.dirs[i]);
return sum;
} else {
// 字典法下 SA 用第一优先级目标作为标量
int idx = oc.priority[0];
return normalize_obj(sol.objectives[idx], oc.dirs[idx]);
}
}
// Lightweight comparison operating directly on float[] objective arrays
// (avoids copying whole Sol structs). Note: unlike is_better, there is no
// penalty term here.
__device__ inline bool obj_is_better(const float* new_objs, const float* old_objs,
                                     const ObjConfig& oc) {
    if (oc.mode == CompareMode::Weighted) {
        // Weighted sums over direction-normalized values; smaller wins.
        float s_new = 0.0f;
        float s_old = 0.0f;
        for (int i = 0; i < oc.num_obj; i++) {
            s_new += oc.weights[i] * normalize_obj(new_objs[i], oc.dirs[i]);
            s_old += oc.weights[i] * normalize_obj(old_objs[i], oc.dirs[i]);
        }
        return s_new < s_old;
    }
    // Lexicographic: walk objectives in priority order; a difference within
    // the tolerance counts as a tie and defers to the next objective.
    for (int p = 0; p < oc.num_obj; p++) {
        const int idx = oc.priority[p];
        const float diff = normalize_obj(new_objs[idx], oc.dirs[idx])
                         - normalize_obj(old_objs[idx], oc.dirs[idx]);
        if (diff < -oc.tolerance[idx]) return true;
        if (diff >  oc.tolerance[idx]) return false;
    }
    return false;  // tied on every objective within tolerance
}
// Lightweight scalarization operating directly on a float[] objective array.
__device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig& oc) {
    if (oc.mode != CompareMode::Weighted) {
        // Lexicographic: scalarize via the top-priority objective.
        const int top = oc.priority[0];
        return normalize_obj(objs[top], oc.dirs[top]);
    }
    float acc = 0.0f;
    for (int i = 0; i < oc.num_obj; i++)
        acc += oc.weights[i] * normalize_obj(objs[i], oc.dirs[i]);
    return acc;
}
// ============================================================
// AOSStats — adaptive-operator-selection statistics (one per block)
// ============================================================
// v3.0: granularity went from 3 tiers to MAX_SEQ individual sequences.
// Records per-sequence usage and improvement counts; aggregated by the
// host at batch end to update the SeqRegistry weights.
struct AOSStats {
    // Operator-level statistics (second layer)
    int usage[MAX_SEQ];       // times each sequence was used
    int improvement[MAX_SEQ]; // times each sequence improved (delta < 0 and accepted)
    // K-step-level statistics (first layer)
    int k_usage[MAX_K];       // times each K (1,2,3) was used
    int k_improvement[MAX_K]; // times each K (1,2,3) improved
};
// ============================================================
// ObjDef — definition of a single objective (compile-time constant)
// ============================================================
struct ObjDef {
    ObjDir dir;      // optimization direction
    float weight;    // weight for Weighted mode
    float tolerance; // tolerance for Lexicographic mode
};
// ============================================================
// HeuristicMatrix — data matrix descriptor for heuristic initial solutions
// ============================================================
struct HeuristicMatrix {
    const float* data; // host-side N*N matrix
    int N;             // dimension
};
// ============================================================
// ProblemBase<Derived, D1, D2> — CRTP base class
//
// Users derive from this class and provide:
//   static constexpr ObjDef OBJ_DEFS[] = {...};       — objective metadata
//   __device__ float compute_obj(int idx, ...) const; — objective dispatch
//   __device__ float compute_penalty(...) const;
//
// Convention: keep OBJ_DEFS and compute_obj next to each other; case N
// corresponds to OBJ_DEFS[N].
// NUM_OBJ is derived from sizeof(OBJ_DEFS) — no manual upkeep needed.
//
// The base class automatically provides:
//   evaluate(sol)        — walks the objective list calling compute_obj
//   fill_obj_config(cfg) — fills ProblemConfig's objective part from OBJ_DEFS
//   obj_config()         — builds an ObjConfig directly
// ============================================================
template<typename Derived, int D1_, int D2_>
struct ProblemBase {
    static constexpr int D1 = D1_;
    static constexpr int D2 = D2_;
    using Sol = Solution<D1, D2>;
    // NUM_OBJ is derived automatically from the OBJ_DEFS array.
    // NOTE(review): sizeof(Derived::OBJ_DEFS) needs Derived to be complete
    // when this initializer is instantiated; relies on the compiler deferring
    // instantiation of the static member — confirm on all target compilers.
    static constexpr int NUM_OBJ = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
    // Automatic evaluation: walk the objective list, then the penalty.
    __device__ void evaluate(Sol& sol) const {
        const auto& self = static_cast<const Derived&>(*this);
        constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
        for (int i = 0; i < n; i++)
            sol.objectives[i] = self.compute_obj(i, sol);
        sol.penalty = self.compute_penalty(sol);
    }
    // Fill the objective-related part of ProblemConfig from OBJ_DEFS.
    void fill_obj_config(ProblemConfig& cfg) const {
        constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
        cfg.num_objectives = n;
        for (int i = 0; i < n; i++) {
            cfg.obj_dirs[i] = Derived::OBJ_DEFS[i].dir;
            cfg.obj_weights[i] = Derived::OBJ_DEFS[i].weight;
            cfg.obj_tolerance[i] = Derived::OBJ_DEFS[i].tolerance;
            cfg.obj_priority[i] = i; // list order is the priority order
        }
    }
    // Build an ObjConfig directly (used by the solver).
    ObjConfig obj_config() const {
        ProblemConfig pcfg;
        fill_obj_config(pcfg);
        return make_obj_config(pcfg);
    }
    // Hot working-set size per block in global memory (bytes), used by auto
    // pop_size to estimate L2 cache pressure.
    // Default = shared_mem_bytes() (when the data fits in smem, the gmem
    // working set contributes nothing).
    // Override in the subclass when shared_mem_bytes() returns 0 (data does
    // not fit in smem): return the actual data size, e.g. a distance matrix
    // of n*n*sizeof(float).
    // NOTE(review): assumes Derived declares shared_mem_bytes() — it is not
    // declared in this base; confirm against the derived problems.
    size_t working_set_bytes() const {
        return static_cast<const Derived&>(*this).shared_mem_bytes();
    }
    // Optional: initialize the G/O relation matrices (prior knowledge for
    // GUIDED_REBUILD).
    //   G[i*N+j]: grouping affinity of elements i and j (symmetric, [0,1],
    //             higher = more likely to share a group)
    //   O[i*N+j]: tendency of element i to precede element j (asymmetric, [0,1])
    // Default: provide nothing (all zeros); the search accumulates values
    // from historical good solutions via EMA.
    // Example override: small distance -> high G and O.
    void init_relation_matrix(float* h_G, float* h_O, int N) const {
        (void)h_G; (void)h_O; (void)N; // default: do nothing (keep all zeros)
    }
    // Optional: expose host-side data matrices for heuristic initial-solution
    // construction. Default returns 0 (none provided); overrides fill the
    // out array and return the actual count.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        (void)out; (void)max_count;
        return 0;
    }
};