cuGenOpt/python/cugenopt/include/core/solver.cuh
2026-03-20 00:33:45 +08:00

1530 lines
64 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

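/**
* solver.cuh - main solve loop
*
* v2.0: block-level architecture refactor
* - 1 block = 1 solution (neighborhood parallelism)
* - The Solution is stored in shared memory
* - Each generation: K threads each generate a candidate move + evaluate its delta → reduction picks the best → thread 0 executes
* - Crossover temporarily uses a simplified version (thread 0 executes, the other threads wait)
* - Migration / elite injection remain single-thread kernels (operating on global memory)
*
* Required Problem interface:
* size_t shared_mem_bytes() const;
* __device__ void load_shared(char* smem, int tid, int bsz);
* __device__ void evaluate(Sol& sol) const;
*/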
#pragma once
#include "types.cuh"
#include "population.cuh"
#include "operators.cuh"
#include "relation_matrix.cuh"
#include "cuda_utils.cuh"
#include "init_selection.cuh"
#include "init_heuristic.cuh"
#include <cmath>
// ============================================================
// Compile-time constants
// ============================================================
constexpr int BLOCK_LEVEL_THREADS = 128; // Default number of threads per block for the block-level architecture
// ============================================================
// EvolveParams — mutable parameters for CUDA Graph replay (device memory)
// ============================================================
// Gathers the parameters that change from batch to batch into one struct.
// evolve_block_kernel reads them through a pointer, and that pointer is bound
// when the CUDA Graph is recorded. Before each replay only a cudaMemcpy that
// refreshes this block of device memory is required.
struct EvolveParams {
float temp_start; // temperature the evolve kernel starts each batch with
int gens_per_batch; // number of generations the evolve kernel runs per launch
SeqRegistry seq_reg; // sequence registry passed to ops::sample_and_execute
KStepConfig kstep; // weights used to sample the step count K
int migrate_round; // round counter read by migrate_kernel
ObjConfig oc; // objective configuration used when scalarizing objectives
};
// ============================================================
// Utility: cooperative load/store of a Solution (shared memory ↔ global memory)
// ============================================================
// Copies one Sol object from `src` to `dst`, with the work striped across the
// calling threads: thread `tid` copies elements tid, tid + num_threads, ...
//
// Parameters:
//   dst / src    destination and source objects (any address space)
//   tid          index of the calling thread within the cooperating group
//   num_threads  number of cooperating threads (the stripe width)
//
// No barrier is issued here; callers must synchronize before reading `dst`
// from a thread other than the one that wrote a given element.
template<typename Sol>
__device__ inline void cooperative_load_sol(Sol& dst, const Sol& src,
                                            int tid, int num_threads) {
    // Word-wise copying is only valid when Sol is a whole number of ints and
    // is at least int-aligned. Rounding the word count up (as was done before)
    // would read and write up to sizeof(int) - 1 bytes past the object.
    constexpr bool kWordCopyable =
        (sizeof(Sol) % sizeof(int) == 0) && (alignof(Sol) % alignof(int) == 0);
    if (kWordCopyable) {
        // Fast path: copy the whole struct at int granularity.
        constexpr int n_words = static_cast<int>(sizeof(Sol) / sizeof(int));
        const int* src_words = reinterpret_cast<const int*>(&src);
        int* dst_words = reinterpret_cast<int*>(&dst);
        for (int i = tid; i < n_words; i += num_threads)
            dst_words[i] = src_words[i];
    } else {
        // Fallback for oddly sized / under-aligned Sol types: exact byte copy.
        constexpr int n_bytes = static_cast<int>(sizeof(Sol));
        const char* src_bytes = reinterpret_cast<const char*>(&src);
        char* dst_bytes = reinterpret_cast<char*>(&dst);
        for (int i = tid; i < n_bytes; i += num_threads)
            dst_bytes[i] = src_bytes[i];
    }
}
// Cooperative store of a Solution: copies `src` into `dst` with the work
// striped across `num_threads` threads (thread `tid` handles its own stripe).
// The copy is direction-agnostic, so this simply forwards to
// cooperative_load_sol. No barrier is issued here.
template<typename Sol>
__device__ inline void cooperative_store_sol(Sol& dst, const Sol& src,
int tid, int num_threads) {
cooperative_load_sol(dst, src, tid, num_threads); // same copy logic
}
// ============================================================
// Kernel 1: initial evaluation
// ============================================================
// Evaluates the population with one thread per solution: the flat global
// thread index selects the entry of `pop` to evaluate.
//
// Parameters:
//   prob       problem instance, copied per thread before use
//   pop        solutions to evaluate in place
//   pop_size   number of entries in `pop`
//   smem_size  when non-zero, the block first stages the problem data into
//              dynamic shared memory via Problem::load_shared
template<typename Problem, typename Sol>
__global__ void evaluate_kernel(Problem prob, Sol* pop, int pop_size,
                                size_t smem_size) {
    extern __shared__ char smem[];

    Problem local_prob = prob;
    const bool stage_in_smem = (smem_size > 0);
    if (stage_in_smem) {
        // smem_size is a kernel argument, so every thread of the block takes
        // the same branch and reaches the barrier together.
        local_prob.load_shared(smem, threadIdx.x, blockDim.x);
        __syncthreads();
    }

    // Threads past the end of the population have nothing to evaluate.
    const int sol_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (sol_idx >= pop_size) return;
    local_prob.evaluate(pop[sol_idx]);
}
// ============================================================
// Kernel 2: block-level batched evolution (neighborhood parallelism)
// ============================================================
//
// Per-generation flow:
// 1. Each of the K threads generates one candidate move
// 2. Each of the K threads evaluates the delta of its move (without modifying the sol in shared memory)
// 3. In-block reduction: pick the move with the smallest delta
// 4. Thread 0 decides whether to accept it (SA / HC)
// 5. The winning thread writes its updated local copy back to the shared sol
// 6. __syncthreads() so that all threads see the updated sol
//
// The Solution lives in shared memory; the Problem data is also in shared memory (when prob_smem_size > 0)
// MultiStepCandidate — 多步执行结果(用于归约)
// ============================================================
struct MultiStepCandidate {
float delta;
float new_penalty;
int seq_indices[MAX_K];
int k_steps;
int winner_tid;
};
// Block-level evolution kernel: block `blockIdx.x` evolves solution
// pop[blockIdx.x] for d_params->gens_per_batch generations, keeping the
// working solution in dynamic shared memory.
//
// Parameters:
//   prob            problem instance; copied per thread, optionally staged into shared memory
//   pop, pop_size   population in global memory; blocks with blockIdx.x >= pop_size exit
//   encoding, dim1  forwarded to ops::sample_and_execute
//   oc_legacy       not read by this kernel; the objective config comes from d_params->oc
//   rng_states      one state per thread, indexed by blockIdx.x * blockDim.x + threadIdx.x
//   alpha           factor the temperature is multiplied by after every generation
//   prob_smem_size  bytes of shared memory reserved for problem data (0 = do not stage)
//   d_aos_stats     optional per-block statistics output (one AOSStats per block); may be null
//   d_G, d_O, rel_N, val_lb, val_ub  forwarded to ops::sample_and_execute
//   d_params        per-batch parameters (temperature, generations, registry, K weights, objectives)
//
// Dynamic shared memory required:
//   sizeof(Sol) + prob_smem_size + sizeof(MultiStepCandidate) * blockDim.x
//   (+ sizeof(AOSStats) when d_aos_stats is non-null)
//
// NOTE(review): the regions are carved out at raw byte offsets, so the
// candidate array and AOSStats are only suitably aligned if sizeof(Sol) and
// prob_smem_size keep the offsets aligned — confirm for each Problem.
template<typename Problem, typename Sol>
__global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
EncodingType encoding, int dim1,
ObjConfig oc_legacy,
curandState* rng_states,
float alpha,
size_t prob_smem_size,
AOSStats* d_aos_stats,
const float* d_G,
const float* d_O,
int rel_N,
int val_lb,
int val_ub,
const EvolveParams* d_params) {
extern __shared__ char smem[];
int bid = blockIdx.x;
int tid = threadIdx.x;
int num_threads = blockDim.x;
// The whole block exits together, so no thread is left waiting at a barrier.
if (bid >= pop_size) return;
// Snapshot the per-batch parameters from device memory.
const int gens_per_batch = d_params->gens_per_batch;
const SeqRegistry seq_reg = d_params->seq_reg;
const KStepConfig kstep = d_params->kstep;
const float temp_start = d_params->temp_start;
const ObjConfig oc = d_params->oc;
// --- shared memory layout ---
// [0 .. sizeof(Sol)-1] : Solution
// [sizeof(Sol) .. sizeof(Sol)+prob_smem-1] : Problem data
// [after that .. ] : MultiStepCandidate[num_threads] reduction workspace
// [after that .. ] : AOSStats (if enabled)
Sol* s_sol = reinterpret_cast<Sol*>(smem);
char* prob_smem_ptr = smem + sizeof(Sol);
MultiStepCandidate* s_cands = reinterpret_cast<MultiStepCandidate*>(
smem + sizeof(Sol) + prob_smem_size);
// AOS statistics (placed after the MultiStepCandidate array)
AOSStats* s_aos = nullptr;
if (d_aos_stats) {
s_aos = reinterpret_cast<AOSStats*>(
smem + sizeof(Sol) + prob_smem_size + sizeof(MultiStepCandidate) * num_threads);
// Thread 0 zeroes the AOS counters
if (tid == 0) {
for (int i = 0; i < MAX_SEQ; i++) {
s_aos->usage[i] = 0;
s_aos->improvement[i] = 0;
}
for (int i = 0; i < MAX_K; i++) {
s_aos->k_usage[i] = 0;
s_aos->k_improvement[i] = 0;
}
}
}
// Load the Problem data into shared memory
Problem lp = prob;
if (prob_smem_size > 0) {
lp.load_shared(prob_smem_ptr, tid, num_threads);
}
// Cooperatively load the Solution into shared memory
cooperative_load_sol(*s_sol, pop[bid], tid, num_threads);
// Barrier: publishes the loaded solution, the problem data and the zeroed AOS counters.
__syncthreads();
int rng_idx = bid * num_threads + tid;
curandState rng = rng_states[rng_idx];
// Only thread 0 reads `temp` (in the acceptance test) and only thread 0 decays it.
float temp = temp_start;
for (int g = 0; g < gens_per_batch; g++) {
// ============================================================
// Step 1: each thread independently samples a step count K and K sequences,
// and applies them to a local copy
// ============================================================
// Sample K (the step count) according to the kstep.weights weights
float kr = curand_uniform(&rng);
int my_k = 1; // default K=1
{
float cum = 0.0f;
for (int i = 0; i < MAX_K; i++) {
cum += kstep.weights[i];
if (kr < cum) { my_k = i + 1; break; }
}
}
// Copy sol into local memory and apply the K move steps to the copy
Sol local_sol = *s_sol;
MultiStepCandidate my_cand;
my_cand.k_steps = my_k;
my_cand.winner_tid = tid;
for (int i = 0; i < MAX_K; i++) {
my_cand.seq_indices[i] = -1;
}
// Stays true only if none of the K steps reported a change.
bool all_noop = true;
for (int step = 0; step < my_k; step++) {
int seq_idx = -1;
bool changed = ops::sample_and_execute(
seq_reg, local_sol, dim1, encoding, &rng, seq_idx,
d_G, d_O, rel_N, val_lb, val_ub,
static_cast<const void*>(&lp));
my_cand.seq_indices[step] = seq_idx;
if (changed) all_noop = false;
}
// Step 2: evaluate the final delta (after the K steps vs. the original sol)
if (all_noop) {
// Sentinel above the 1e29f validity threshold used in Step 4.
my_cand.delta = 1e30f;
my_cand.new_penalty = s_sol->penalty;
} else {
lp.evaluate(local_sol);
float old_scalar = obj_scalar(s_sol->objectives, oc);
float new_scalar = obj_scalar(local_sol.objectives, oc);
bool old_feasible = (s_sol->penalty <= 0.0f);
bool new_feasible = (local_sol.penalty <= 0.0f);
// Feasibility dominates: gaining feasibility gets a large negative delta,
// losing it a large positive one; otherwise compare penalties (both
// infeasible) or scalarized objectives (both feasible).
if (new_feasible && !old_feasible) {
my_cand.delta = -1e20f;
} else if (!new_feasible && old_feasible) {
my_cand.delta = 1e20f;
} else if (!new_feasible && !old_feasible) {
my_cand.delta = local_sol.penalty - s_sol->penalty;
} else {
my_cand.delta = new_scalar - old_scalar;
}
my_cand.new_penalty = local_sol.penalty;
}
s_cands[tid] = my_cand;
__syncthreads();
// Step 3: parallel in-block reduction to find the candidate with the smallest delta
// NOTE(review): this halving loop only visits every candidate when
// num_threads is a power of two — confirm for any launch that does not
// use BLOCK_LEVEL_THREADS.
for (int stride = num_threads / 2; stride > 0; stride >>= 1) {
if (tid < stride) {
if (s_cands[tid + stride].delta < s_cands[tid].delta)
s_cands[tid] = s_cands[tid + stride];
}
__syncthreads();
}
// Step 4: thread 0 decides whether to accept
if (tid == 0) {
MultiStepCandidate& best = s_cands[0];
bool has_valid = (best.delta < 1e29f);
if (has_valid) {
bool improved = (best.delta < 0.0f);
bool accept;
if (improved) {
accept = true;
} else if (temp > 0.0f && s_sol->penalty <= 0.0f && best.new_penalty <= 0.0f) {
// Non-improving move between two feasible solutions: accept with
// probability exp(-delta / temp).
accept = curand_uniform(&rng) < expf(-best.delta / temp);
} else {
accept = false;
}
if (accept) {
// AOS statistics: K level + operator (sequence) level
if (s_aos) {
int ki = best.k_steps - 1;
if (ki >= 0 && ki < MAX_K) {
s_aos->k_usage[ki]++;
if (improved) s_aos->k_improvement[ki]++;
}
for (int step = 0; step < best.k_steps; step++) {
int si = best.seq_indices[step];
if (si >= 0 && si < seq_reg.count) {
s_aos->usage[si]++;
if (improved) s_aos->improvement[si]++;
}
}
}
// Signal: keep winner_tid as-is (accept)
} else {
s_cands[0].winner_tid = -1; // Signal: reject
}
} else {
s_cands[0].winner_tid = -1; // Signal: no valid candidate
}
// Temperature decays once per generation.
temp *= alpha;
}
// Barrier: makes thread 0's accept/reject signal visible to the block.
__syncthreads();
// Step 5: Winner thread writes local_sol to s_sol
int winner = s_cands[0].winner_tid;
if (winner >= 0 && tid == winner) {
*s_sol = local_sol;
}
// Barrier: the updated solution must be visible before the next generation copies it.
__syncthreads();
}
// Write the Solution back to global memory
cooperative_store_sol(pop[bid], *s_sol, tid, num_threads);
// Write the AOS statistics back to global memory
if (d_aos_stats && tid == 0) {
d_aos_stats[bid] = *s_aos;
}
// Save the RNG state
rng_states[rng_idx] = rng;
}
// ============================================================
// Kernel 2b: block-level crossover
// ============================================================
// Simplified version: thread 0 runs the crossover logic while the remaining
// threads only take part in the cooperative load/store.
// A later "Phase 3" is planned to implement multi-threaded cooperative crossover.
//
// Block `blockIdx.x` owns pop[blockIdx.x]. With probability `crossover_rate`
// thread 0 picks a mate by binary tournament over the whole population,
// builds a child, evaluates it and keeps it only if it beats the current
// solution.
//
// Parameters:
//   prob            problem instance; copied per thread, optionally staged into shared memory
//   pop, pop_size   population in global memory; blocks with blockIdx.x >= pop_size exit
//   encoding        Permutation → ops::perm_ox_crossover, Binary → ops::uniform_crossover;
//                   any other encoding performs no crossover
//   dim1            forwarded to the crossover operators
//   oc              objective configuration used by is_better
//   rng_states      RNG states; this kernel uses the entry at blockIdx.x * blockDim.x
//   crossover_rate  probability of attempting crossover for this block
//   prob_smem_size  bytes of shared memory reserved for problem data (0 = do not stage)
//   total_elements  element count for permutation crossover; when <= 0 the
//                   length of row 0 of the current solution is used instead
//
// Dynamic shared memory required: sizeof(Sol) + prob_smem_size.
//
// NOTE(review): pop[c1], pop[c2] and the mate are read from global memory
// while other blocks of the same launch may be writing their own pop[bid]
// back; nothing in this kernel orders those accesses across blocks.
template<typename Problem, typename Sol>
__global__ void crossover_block_kernel(Problem prob, Sol* pop, int pop_size,
                                       EncodingType encoding, int dim1,
                                       ObjConfig oc,
                                       curandState* rng_states,
                                       float crossover_rate,
                                       size_t prob_smem_size,
                                       int total_elements = 0) {
    extern __shared__ char smem[];
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    int K = blockDim.x;
    // The whole block exits together, so the barriers below stay well-formed.
    if (bid >= pop_size) return;

    // Shared memory layout: Sol + Problem data
    Sol* s_sol = reinterpret_cast<Sol*>(smem);
    char* prob_smem_ptr = smem + sizeof(Sol);
    Problem lp = prob;
    if (prob_smem_size > 0) {
        lp.load_shared(prob_smem_ptr, tid, K);
    }
    cooperative_load_sol(*s_sol, pop[bid], tid, K);
    __syncthreads();

    // Thread 0 runs the crossover logic
    if (tid == 0) {
        int rng_idx = bid * K;
        curandState rng = rng_states[rng_idx];
        // Always consume the draw so the RNG stream does not depend on pop_size.
        const bool attempt = curand_uniform(&rng) < crossover_rate;
        // The tournament needs two distinct individuals. With fewer than two,
        // rand_int would be asked for a value in an empty range and c2 could
        // index past the end of pop, so crossover is skipped.
        if (attempt && pop_size >= 2) {
            int c1 = rand_int(&rng, pop_size);
            int c2 = rand_int(&rng, pop_size - 1);
            if (c2 >= c1) c2++;  // shift past c1 so that c2 != c1
            int mate_idx = is_better(pop[c1], pop[c2], oc) ? c1 : c2;
            if (mate_idx != bid) {
                const Sol& mate = pop[mate_idx];
                Sol child;
                bool did_crossover = false;
                if (encoding == EncodingType::Permutation) {
                    int te = total_elements;
                    if (te <= 0) te = s_sol->dim2_sizes[0];
                    ops::perm_ox_crossover(child, *s_sol, mate, dim1, te, &rng);
                    did_crossover = true;
                } else if (encoding == EncodingType::Binary) {
                    ops::uniform_crossover(child, *s_sol, mate, dim1, &rng);
                    did_crossover = true;
                }
                if (did_crossover) {
                    lp.evaluate(child);
                    // Replace the parent only when the child is strictly better.
                    if (is_better(child, *s_sol, oc)) {
                        *s_sol = child;
                    }
                }
            }
        }
        rng_states[rng_idx] = rng;
    }
    __syncthreads();

    // Write back (the solution may have been replaced by the child)
    cooperative_store_sol(pop[bid], *s_sol, tid, K);
}
// ============================================================
// Kernel 3: migration between islands (kept unchanged, single-thread kernel)
// ============================================================
// Helper: scans the island_size entries of `pop` starting at `base` and
// returns the index of the worst one according to is_better. On ties the
// earliest index is kept.
template<typename Sol>
__device__ inline int find_worst_in_island(const Sol* pop, int base, int island_size,
                                           const ObjConfig& oc) {
    const int end = base + island_size;
    int worst_idx = base;
    int cursor = base + 1;
    while (cursor < end) {
        // The current worst is replaced whenever it beats the entry under the cursor.
        if (is_better(pop[worst_idx], pop[cursor], oc)) {
            worst_idx = cursor;
        }
        ++cursor;
    }
    return worst_idx;
}
// Island migration, executed by a single thread (thread 0 of block 0); every
// other thread returns immediately.
//
// The population is treated as pop_size / island_size consecutive islands of
// island_size solutions each. The best solution of every island is located
// first, then:
//   TopN / Hybrid: island champions are ranked best-first; the i-th ranked
//                  champion is offered to island (i + round) % num_islands.
//   Ring / Hybrid: each island's champion is offered to the next island.
// An offered solution overwrites the destination island's worst member only
// if it is better.
//
// Parameters:
//   pop, pop_size  population in global memory
//   island_size    number of solutions per island
//   oc             objective configuration used by is_better
//   strategy       migration strategy
//   d_params       supplies migrate_round (the rotation used by TopN)
//
// The per-island scratch arrays hold at most MAX_ISLANDS entries. The kernel
// is a no-op for configurations it cannot handle safely (non-positive
// island_size, zero islands, or more than MAX_ISLANDS islands) instead of
// dividing by zero or overrunning those arrays.
template<typename Sol>
__global__ void migrate_kernel(Sol* pop, int pop_size, int island_size,
                               ObjConfig oc,
                               MigrateStrategy strategy,
                               const EvolveParams* d_params) {
    if (threadIdx.x != 0 || blockIdx.x != 0) return;

    constexpr int MAX_ISLANDS = 64;
    if (island_size <= 0) return;  // would divide by zero below
    const int num_islands = pop_size / island_size;
    if (num_islands <= 0 || num_islands > MAX_ISLANDS) return;

    const int round = d_params->migrate_round;

    // Best individual of each island, captured before any migration happens.
    int candidates[MAX_ISLANDS];
    for (int isle = 0; isle < num_islands; isle++) {
        int base = isle * island_size;
        int best = base;
        for (int i = base + 1; i < base + island_size; i++)
            if (is_better(pop[i], pop[best], oc)) best = i;
        candidates[isle] = best;
    }

    if (strategy == MigrateStrategy::TopN || strategy == MigrateStrategy::Hybrid) {
        // Selection-sort the island champions from best to worst.
        int topn[MAX_ISLANDS];
        bool selected[MAX_ISLANDS] = {};
        for (int t = 0; t < num_islands; t++) {
            int best_c = -1;
            for (int c = 0; c < num_islands; c++) {
                if (selected[c]) continue;
                if (best_c < 0 || is_better(pop[candidates[c]], pop[candidates[best_c]], oc))
                    best_c = c;
            }
            topn[t] = candidates[best_c];
            selected[best_c] = true;
        }
        // Offer the i-th ranked champion to an island rotated by `round`.
        for (int i = 0; i < num_islands; i++) {
            int dst_isle = (i + round) % num_islands;
            int dst_base = dst_isle * island_size;
            int worst = find_worst_in_island(pop, dst_base, island_size, oc);
            if (is_better(pop[topn[i]], pop[worst], oc))
                pop[worst] = pop[topn[i]];
        }
    }

    if (strategy == MigrateStrategy::Ring || strategy == MigrateStrategy::Hybrid) {
        // Offer each island's champion to its successor on the ring.
        for (int isle = 0; isle < num_islands; isle++) {
            int dst_isle = (isle + 1) % num_islands;
            int dst_base = dst_isle * island_size;
            int worst = find_worst_in_island(pop, dst_base, island_size, oc);
            int src = candidates[isle];
            if (is_better(pop[src], pop[worst], oc))
                pop[worst] = pop[src];
        }
    }
}
// ============================================================
// Kernel 4: elite injection (kept unchanged)
// ============================================================
// Executed by a single thread (thread 0 of block 0); all other threads exit.
//   1. Finds the best member of the population and copies it into
//      *global_best if it beats the stored one.
//   2. Finds the worst member of the population and overwrites it with
//      *global_best if the global best is better.
//
// Parameters:
//   pop, pop_size  population in global memory
//   global_best    best solution recorded so far (read and updated)
//   oc             objective configuration used by is_better
template<typename Sol>
__global__ void elite_inject_kernel(Sol* pop, int pop_size,
                                    Sol* global_best, ObjConfig oc) {
    const bool is_leader = (threadIdx.x == 0 && blockIdx.x == 0);
    if (!is_leader) return;

    // Pass 1: population champion, used to refresh the global best.
    int champion = 0;
    for (int k = 1; k < pop_size; ++k) {
        if (is_better(pop[k], pop[champion], oc)) {
            champion = k;
        }
    }
    if (is_better(pop[champion], *global_best, oc)) {
        *global_best = pop[champion];
    }

    // Pass 2: population laggard, replaced by the global best when that helps.
    int laggard = 0;
    for (int k = 1; k < pop_size; ++k) {
        if (is_better(pop[laggard], pop[k], oc)) {
            laggard = k;
        }
    }
    if (is_better(*global_best, pop[laggard], oc)) {
        pop[laggard] = *global_best;
    }
}
// ============================================================
// solve<Problem>: main loop (block-level architecture)
// ============================================================
// Optional host-side hook type: a plain function pointer that receives the
// sequence registry by mutable reference so the caller can adjust it.
using RegistryCallback = void(*)(SeqRegistry&);
template<typename Problem>
SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
const typename Problem::Sol* init_solutions = nullptr,
int num_init_solutions = 0,
RegistryCallback custom_registry_fn = nullptr) {
using Sol = typename Problem::Sol;
ProblemConfig pcfg = prob.config();
SolveResult<Sol> result;
bool use_sa = cfg.sa_temp_init > 0.0f;
bool use_crossover = cfg.crossover_rate > 0.0f;
bool use_aos = cfg.use_aos;
bool use_time_limit = cfg.time_limit_sec > 0.0f;
bool use_stagnation = cfg.stagnation_limit > 0;
// Block 级参数
const int block_threads = BLOCK_LEVEL_THREADS; // 128 线程/block
// --- 0. Shared memory 计算(需要在 pop_size 确定之前完成,用于 occupancy 查询)---
size_t prob_smem = prob.shared_mem_bytes();
// v3.1: 归约工作区为 MultiStepCandidate含 K 步 moves + seq_indices
size_t total_smem = sizeof(Sol) + prob_smem + sizeof(MultiStepCandidate) * block_threads;
if (use_aos) total_smem += sizeof(AOSStats);
// 查询 GPU 硬件属性
cudaDeviceProp prop;
int device;
CUDA_CHECK(cudaGetDevice(&device));
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
// 尝试扩展 shared memory 上限V100: 96KB, A100: 164KB 等)
size_t max_smem = (size_t)prop.sharedMemPerBlock;
if (total_smem > 48 * 1024) {
cudaError_t err1 = cudaFuncSetAttribute(
evolve_block_kernel<Problem, Sol>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
(int)total_smem);
cudaError_t err2 = cudaFuncSetAttribute(
crossover_block_kernel<Problem, Sol>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
(int)total_smem);
if (err1 == cudaSuccess && err2 == cudaSuccess) {
max_smem = total_smem;
}
}
// 检查 shared memory 上限
bool smem_overflow = false;
if (total_smem > max_smem) {
smem_overflow = (prob_smem > 0);
prob_smem = 0;
total_smem = sizeof(Sol) + sizeof(MultiStepCandidate) * block_threads;
if (use_aos) total_smem += sizeof(AOSStats);
}
// --- 0b. 确定 pop_size自动或用户指定---
int pop_size = cfg.pop_size;
bool auto_pop = (pop_size <= 0);
if (auto_pop) {
// 查询 occupancy每个 SM 能同时运行多少个 block
int max_blocks_per_sm = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
evolve_block_kernel<Problem, Sol>,
block_threads,
total_smem);
int full_capacity = max_blocks_per_sm * prop.multiProcessorCount;
if (prob_smem > 0) {
// 问题数据在 shared memory → 无 L2 cache 压力,打满 SM
pop_size = full_capacity;
} else {
// 问题数据在 global memory → 根据 L2 cache 容量估算合理并发度
//
// 模型pop = L2_size / working_set_bytes
// 所有 block 访问同一份只读数据L2/ws 反映 cache 能支撑的并发度
//
// SM 下限策略L2/ws >= sm_min/2 时拉升到 sm_min允许一定 cache 压力换取种群多样性)
// ch150: L2/ws=70, sm_min=128 → 70 >= 64 → 拉升到 128 ✓(多样性优先)
// pcb442: L2/ws=8, sm_min=128 → 8 < 64 → 不拉升 ✓(避免 thrashing
size_t ws = prob.working_set_bytes();
if (ws > 0) {
int l2_pop = (int)((size_t)prop.l2CacheSize / ws);
pop_size = (l2_pop < full_capacity) ? l2_pop : full_capacity;
} else {
pop_size = full_capacity / 4;
}
int sm_min = 1;
while (sm_min < prop.multiProcessorCount) sm_min *= 2;
if (pop_size < sm_min) {
bool l2_can_afford = (ws == 0) ||
((size_t)prop.l2CacheSize / ws >= (size_t)sm_min / 2);
if (l2_can_afford) pop_size = sm_min;
}
}
// 向下取整到 2 的幂warp 对齐、归约友好、islands 整除)
{
int p = 1;
while (p * 2 <= pop_size) p *= 2;
pop_size = p;
}
// 绝对下限32保证至少 1 岛 × 32 解的最小可用规模)
if (pop_size < 32) pop_size = 32;
}
// 自适应岛屿数量num_islands=0 时启用)
int num_islands = cfg.num_islands;
if (num_islands == 0) {
// 策略:每岛至少 32 个个体,最多 8 岛
// pop < 64 → 1 岛(纯 HC
// 64-127 → 2 岛
// 128-255 → 4 岛
// 256-511 → 8 岛
// >= 512 → 8 岛
if (pop_size < 64) {
num_islands = 1;
} else if (pop_size < 128) {
num_islands = 2;
} else if (pop_size < 256) {
num_islands = 4;
} else {
num_islands = 8;
}
}
bool use_islands = num_islands > 1;
int island_size = use_islands ? pop_size / num_islands : pop_size;
if (cfg.verbose) {
const char* enc_name = pcfg.encoding == EncodingType::Permutation ? "Perm"
: pcfg.encoding == EncodingType::Binary ? "Bin" : "Int";
const char* strat_name =
cfg.migrate_strategy == MigrateStrategy::Ring ? "Ring" :
cfg.migrate_strategy == MigrateStrategy::TopN ? "TopN" : "Hybrid";
printf("\n[GenSolver v2.0 Block] %s%s [%d][%d] pop=%d%s gen=%d blk=%d",
enc_name, pcfg.row_mode == RowMode::Partition ? "/Part" : "",
pcfg.dim1, pcfg.row_mode == RowMode::Partition ? pcfg.total_elements : pcfg.dim2_default,
pop_size, auto_pop ? "(auto)" : "",
cfg.max_gen, block_threads);
if (auto_pop) {
size_t ws = prob.working_set_bytes();
if (prob_smem > 0) {
printf("\n [AUTO] GPU=%s SM=%d strategy=full(smem) → pop=%d",
prop.name, prop.multiProcessorCount, pop_size);
} else {
printf("\n [AUTO] GPU=%s SM=%d L2=%dKB ws=%zuKB → pop=%d",
prop.name, prop.multiProcessorCount,
prop.l2CacheSize / 1024, ws / 1024, pop_size);
}
}
if (smem_overflow) {
printf("\n [WARN] Shared memory overflow, problem data stays in global memory");
}
if (use_islands) {
if (cfg.num_islands == 0) {
printf(" isl=%dx%d/%s(auto)", num_islands, island_size, strat_name);
} else {
printf(" isl=%dx%d/%s", num_islands, island_size, strat_name);
}
}
if (use_sa) printf(" SA=%.0f/%.4f", cfg.sa_temp_init, cfg.sa_alpha);
if (use_crossover) printf(" CX=%.0f%%", cfg.crossover_rate * 100.0f);
if (use_aos) printf(" AOS");
if (use_time_limit) printf(" T=%.1fs", cfg.time_limit_sec);
if (use_stagnation) printf(" stag=%d", cfg.stagnation_limit);
if (num_init_solutions > 0) printf(" init=%d", num_init_solutions);
if (cfg.use_cuda_graph) printf(" GRAPH");
printf(" seed=%u\n", cfg.seed);
}
// --- 1. 分配 ---
// crossover 栈需求thread 0 在 local memory 中构造 child
if (use_crossover) {
size_t ox_arrays = Sol::DIM1 * Sol::DIM2 * sizeof(bool)
+ 512 * sizeof(bool)
+ 512 * sizeof(int);
size_t need = sizeof(Sol) + ox_arrays + 512;
if (need > 1024) cudaDeviceSetLimit(cudaLimitStackSize, need);
}
ObjConfig oc = make_obj_config(pcfg);
// --- 1b. 采样择优初始化 ---
int oversample = cfg.init_oversample;
if (oversample < 1) oversample = 1;
int candidate_size = pop_size * oversample;
bool do_oversample = (oversample > 1);
Population<Sol> pop;
if (do_oversample) {
// 生成 K × pop_size 个候选解
Population<Sol> candidates;
candidates.allocate(candidate_size, block_threads);
candidates.init_rng(cfg.seed, 256);
candidates.init_population(pcfg, 256);
// 启发式初始解注入(替换候选池尾部)
if (pcfg.encoding == EncodingType::Permutation) {
HeuristicMatrix heur_mats[8];
int num_mats = prob.heuristic_matrices(heur_mats, 8);
if (num_mats > 0) {
bool is_partition = (pcfg.row_mode == RowMode::Partition);
auto heur_sols = heuristic_init::build_from_matrices<Sol>(
heur_mats, num_mats, pcfg.dim1, pcfg.dim2_default, pcfg.encoding,
is_partition, pcfg.total_elements);
int inject = (int)heur_sols.size();
if (inject > candidate_size / 8) inject = candidate_size / 8;
if (inject > 0) {
CUDA_CHECK(cudaMemcpy(
candidates.d_solutions + candidate_size - inject,
heur_sols.data(), sizeof(Sol) * inject,
cudaMemcpyHostToDevice));
if (cfg.verbose) {
printf(" [INIT] injected %d heuristic solutions into candidate pool\n", inject);
}
}
}
}
// GPU 上评估所有候选
{
size_t eval_smem = prob.shared_mem_bytes();
if (eval_smem > 48 * 1024) {
cudaFuncSetAttribute(evaluate_kernel<Problem, Sol>,
cudaFuncAttributeMaxDynamicSharedMemorySize, (int)eval_smem);
}
int eval_grid = calc_grid_size(candidate_size, block_threads);
evaluate_kernel<<<eval_grid, block_threads, eval_smem>>>(
prob, candidates.d_solutions, candidate_size, eval_smem);
CUDA_CHECK(cudaDeviceSynchronize());
}
// 下载所有候选解到 host
Sol* h_candidates = new Sol[candidate_size];
CUDA_CHECK(cudaMemcpy(h_candidates, candidates.d_solutions,
sizeof(Sol) * candidate_size, cudaMemcpyDeviceToHost));
// 构建候选信息
std::vector<init_sel::CandidateInfo> cand_info(candidate_size);
for (int i = 0; i < candidate_size; i++) {
cand_info[i].idx = i;
cand_info[i].penalty = h_candidates[i].penalty;
cand_info[i].rank = 0;
cand_info[i].crowding = 0.0f;
cand_info[i].selected = false;
for (int m = 0; m < oc.num_obj; m++) {
cand_info[i].objs[m] = normalize_obj(
h_candidates[i].objectives[m], oc.dirs[m]);
}
}
// 计算目标重要性
float importance[MAX_OBJ];
compute_importance(oc, importance);
// 纯随机保底名额
int num_random = (int)(pop_size * cfg.init_random_ratio);
if (num_random < 1) num_random = 1;
if (num_random > pop_size / 2) num_random = pop_size / 2;
// 选择
std::vector<int> selected;
if (oc.num_obj == 1) {
selected = init_sel::top_n_select(cand_info, pop_size, num_random);
} else {
selected = init_sel::nsga2_select(cand_info, oc.num_obj, importance,
pop_size, num_random);
}
// 分配最终种群
pop.allocate(pop_size, block_threads);
// 复用候选的 RNG 状态(取前 pop_size 份)
// 重新初始化 RNG 更安全(候选的 RNG 状态已被使用过)
pop.init_rng(cfg.seed + 1, 256);
// 上传选中的解到种群前部
int num_selected = (int)selected.size();
for (int i = 0; i < num_selected; i++) {
CUDA_CHECK(cudaMemcpy(pop.d_solutions + i,
candidates.d_solutions + selected[i],
sizeof(Sol), cudaMemcpyDeviceToDevice));
}
// 剩余位置(纯随机保底):从候选中随机选未被选中的
// 简单做法:直接用候选中排在后面的未选中解
if (num_selected < pop_size) {
int fill_idx = num_selected;
for (int i = 0; i < candidate_size && fill_idx < pop_size; i++) {
if (!cand_info[i].selected) {
CUDA_CHECK(cudaMemcpy(pop.d_solutions + fill_idx,
candidates.d_solutions + i,
sizeof(Sol), cudaMemcpyDeviceToDevice));
fill_idx++;
}
}
}
if (cfg.verbose) {
// 统计选中解的平均质量 vs 全部候选的平均质量
float sel_avg = 0.0f, all_avg = 0.0f;
for (int i = 0; i < candidate_size; i++) all_avg += cand_info[i].objs[0];
all_avg /= candidate_size;
for (int i = 0; i < num_selected; i++) sel_avg += cand_info[selected[i]].objs[0];
if (num_selected > 0) sel_avg /= num_selected;
const char* method = (oc.num_obj > 1) ? "NSGA-II" : "top-N";
printf(" [INIT] oversample=%dx → %d candidates, %s select %d + %d random",
oversample, candidate_size, method, num_selected,
pop_size - num_selected);
printf(" (obj0 avg: %.1f → %.1f, %.1f%% better)\n",
all_avg, sel_avg,
all_avg != 0.0f ? (1.0f - sel_avg / all_avg) * 100.0f : 0.0f);
}
delete[] h_candidates;
// candidates 析构自动释放 GPU 内存
} else {
// oversample=1纯随机和之前一样
pop.allocate(pop_size, block_threads);
pop.init_rng(cfg.seed, 256);
pop.init_population(pcfg, 256);
}
// --- 1c. 注入用户提供的初始解 ---
// 策略:校验合法性 → 合法解替换种群尾部(保留 oversample 选出的好解在前部)
if (init_solutions && num_init_solutions > 0) {
int max_inject = pop_size / 16; // 最多占种群 ~6%(保留多样性)
if (max_inject < 1) max_inject = 1;
if (max_inject > 16) max_inject = 16; // 绝对上限
int want = num_init_solutions;
if (want > max_inject) want = max_inject;
int injected = 0;
for (int i = 0; i < want; i++) {
const Sol& s = init_solutions[i];
bool valid = true;
// 基本维度检查
for (int r = 0; r < pcfg.dim1 && valid; r++) {
if (s.dim2_sizes[r] < 0 || s.dim2_sizes[r] > Sol::DIM2) {
valid = false; break;
}
}
// 编码特定检查
if (valid && pcfg.encoding == EncodingType::Permutation) {
if (pcfg.row_mode == RowMode::Partition) {
// 分区模式:跨行元素不重复,总数 = total_elements
bool seen[512] = {};
int total = 0;
for (int r = 0; r < pcfg.dim1 && valid; r++) {
for (int c = 0; c < s.dim2_sizes[r] && valid; c++) {
int v = s.data[r][c];
if (v < 0 || v >= pcfg.total_elements) { valid = false; break; }
if (v < 512 && seen[v]) { valid = false; break; }
if (v < 512) seen[v] = true;
total++;
}
}
if (valid && total != pcfg.total_elements) valid = false;
} else if (pcfg.perm_repeat_count > 1) {
// 多重集排列:每行中每个值 [0, N) 恰好出现 repeat_count 次
int R = pcfg.perm_repeat_count;
int N = pcfg.dim2_default / R;
for (int r = 0; r < pcfg.dim1 && valid; r++) {
if (s.dim2_sizes[r] != pcfg.dim2_default) { valid = false; break; }
int cnt[512] = {};
for (int c = 0; c < s.dim2_sizes[r] && valid; c++) {
int v = s.data[r][c];
if (v < 0 || v >= N) { valid = false; break; }
if (v < 512) cnt[v]++;
}
if (valid) {
for (int v = 0; v < N && v < 512 && valid; v++)
if (cnt[v] != R) valid = false;
}
}
} else {
// 标准排列:每行元素 [0, dim2_default) 不重复
for (int r = 0; r < pcfg.dim1 && valid; r++) {
if (s.dim2_sizes[r] != pcfg.dim2_default) { valid = false; break; }
bool seen[512] = {};
for (int c = 0; c < s.dim2_sizes[r] && valid; c++) {
int v = s.data[r][c];
if (v < 0 || v >= pcfg.dim2_default) { valid = false; break; }
if (v < 512 && seen[v]) { valid = false; break; }
if (v < 512) seen[v] = true;
}
}
}
} else if (valid && pcfg.encoding == EncodingType::Binary) {
for (int r = 0; r < pcfg.dim1 && valid; r++) {
for (int c = 0; c < s.dim2_sizes[r] && valid; c++) {
if (s.data[r][c] != 0 && s.data[r][c] != 1) { valid = false; break; }
}
}
}
if (valid) {
// 注入到种群尾部(从后往前填,保留前部的 oversample 好解)
int target_idx = pop_size - 1 - injected;
CUDA_CHECK(cudaMemcpy(pop.d_solutions + target_idx, &s,
sizeof(Sol), cudaMemcpyHostToDevice));
injected++;
} else if (cfg.verbose) {
printf(" [INIT] user solution #%d invalid, skipped\n", i);
}
}
if (cfg.verbose && injected > 0) {
printf(" [INIT] injected %d/%d user solutions (tail of population)\n",
injected, num_init_solutions);
}
}
// v3.0: 构建序列注册表(替代旧的 d_op_weights
ProblemProfile profile = classify_problem(pcfg);
SeqRegistry seq_reg = build_seq_registry(profile);
if (custom_registry_fn) {
custom_registry_fn(seq_reg);
}
// v3.1: K 步配置(多步执行)
KStepConfig kstep = build_kstep_config();
if (cfg.verbose) {
const char* scale_names[] = {"Small", "Medium", "Large"};
const char* struct_names[] = {"SingleSeq", "MultiFixed", "MultiPartition"};
printf(" [PROFILE] scale=%s structure=%s\n",
scale_names[(int)profile.scale], struct_names[(int)profile.structure]);
printf(" [SEQ] %d sequences registered:", seq_reg.count);
for (int i = 0; i < seq_reg.count; i++)
printf(" %d(%.2f)", seq_reg.ids[i], seq_reg.weights[i]);
printf("\n");
printf(" [K-STEP] K weights: K1=%.2f K2=%.2f K3=%.2f\n",
kstep.weights[0], kstep.weights[1], kstep.weights[2]);
}
int* d_best_idx;
CUDA_CHECK(cudaMalloc(&d_best_idx, sizeof(int)));
Sol* d_global_best = nullptr;
if (use_sa) {
CUDA_CHECK(cudaMalloc(&d_global_best, sizeof(Sol)));
}
// AOS: 分配全局内存统计缓冲区(序列级粒度)
AOSStats* d_aos_stats = nullptr;
AOSStats* h_aos_stats = nullptr;
if (use_aos) {
CUDA_CHECK(cudaMalloc(&d_aos_stats, sizeof(AOSStats) * pop_size));
h_aos_stats = new AOSStats[pop_size];
}
// --- 关系矩阵G/O用于 SEQ_LNS_GUIDED_REBUILD ---
// 仅 Permutation 编码 + 有 GUIDED_REBUILD 序列时启用
bool use_relation_matrix = false;
RelationMatrix rel_mat = {};
int rel_N = 0;
if (pcfg.encoding == EncodingType::Permutation) {
for (int i = 0; i < seq_reg.count; i++) {
if (seq_reg.ids[i] == seq::SEQ_LNS_GUIDED_REBUILD) {
use_relation_matrix = true;
break;
}
}
}
if (use_relation_matrix) {
// N = dim2_default排列中的元素数
rel_N = pcfg.dim2_default;
if (rel_N > 0) {
rel_mat = relation_matrix_create(rel_N, 0.95f);
// 让用户提供先验知识初始化 G/O可选默认不做任何事
prob.init_relation_matrix(rel_mat.h_G, rel_mat.h_O, rel_N);
relation_matrix_upload(rel_mat);
} else {
use_relation_matrix = false;
}
}
// grid = pop_size每个 block 处理一个解)
int grid = pop_size;
// --- 2. 初始评估 ---
// 采样择优路径中已经评估过候选,但最终种群可能包含随机解,需要重新评估
{
size_t eval_smem = prob.shared_mem_bytes();
if (eval_smem > 48 * 1024) {
cudaFuncSetAttribute(evaluate_kernel<Problem, Sol>,
cudaFuncAttributeMaxDynamicSharedMemorySize, (int)eval_smem);
}
int eval_grid = calc_grid_size(pop_size, block_threads);
evaluate_kernel<<<eval_grid, block_threads, eval_smem>>>(
prob, pop.d_solutions, pop_size, eval_smem);
}
if (use_sa) {
find_best_kernel<<<1, 1>>>(pop.d_solutions, pop_size, oc, d_best_idx);
CUDA_CHECK(cudaDeviceSynchronize());
int idx; CUDA_CHECK(cudaMemcpy(&idx, d_best_idx, sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(d_global_best, pop.d_solutions + idx, sizeof(Sol), cudaMemcpyDeviceToDevice));
}
// --- 3. 主循环 ---
// batch 大小决定了 AOS/关系矩阵/收敛检测的更新频率
// 需要平衡:太小 → 同步开销大,太大 → 反应迟钝
int batch;
if (use_islands)
batch = cfg.migrate_interval;
else if (cfg.verbose)
batch = cfg.print_every;
else
batch = cfg.max_gen;
// 需要定期更新的功能:强制 batch ≤ 200
if (use_relation_matrix || use_aos || use_time_limit || use_stagnation) {
if (batch > 200) batch = 200;
}
int gen_done = 0;
int migrate_round = 0;
StopReason stop_reason = StopReason::MaxGen;
// 收敛检测状态
float prev_best_scalar = 1e30f;
int stagnation_count = 0;
// --- EvolveParams: 可变参数device memory---
EvolveParams h_params;
h_params.temp_start = 0.0f;
h_params.gens_per_batch = batch;
h_params.seq_reg = seq_reg;
h_params.kstep = kstep;
h_params.migrate_round = 0;
h_params.oc = oc;
EvolveParams* d_params = nullptr;
CUDA_CHECK(cudaMalloc(&d_params, sizeof(EvolveParams)));
CUDA_CHECK(cudaMemcpy(d_params, &h_params, sizeof(EvolveParams), cudaMemcpyHostToDevice));
// --- CUDA Graph ---
const bool use_graph = cfg.use_cuda_graph;
cudaGraph_t graph = nullptr;
cudaGraphExec_t graph_exec = nullptr;
cudaStream_t stream = nullptr;
if (use_graph) {
CUDA_CHECK(cudaStreamCreate(&stream));
}
// lambda: 在 stream 上发射一个 batch 的 GPU kernel 序列
auto launch_batch_kernels = [&](cudaStream_t s) {
evolve_block_kernel<<<grid, block_threads, total_smem, s>>>(
prob, pop.d_solutions, pop_size,
pcfg.encoding, pcfg.dim1,
oc, pop.d_rng_states,
cfg.sa_alpha, prob_smem,
d_aos_stats,
use_relation_matrix ? rel_mat.d_G : nullptr,
use_relation_matrix ? rel_mat.d_O : nullptr,
rel_N,
pcfg.value_lower_bound, pcfg.value_upper_bound,
d_params);
if (use_crossover) {
crossover_block_kernel<<<grid, block_threads, total_smem, s>>>(
prob, pop.d_solutions, pop_size,
pcfg.encoding, pcfg.dim1,
oc, pop.d_rng_states,
cfg.crossover_rate, prob_smem,
pcfg.row_mode == RowMode::Partition ? pcfg.total_elements : pcfg.dim2_default);
}
if (use_islands) {
migrate_kernel<<<1, 1, 0, s>>>(pop.d_solutions, pop_size,
island_size, oc,
cfg.migrate_strategy, d_params);
}
if (use_sa) {
elite_inject_kernel<<<1, 1, 0, s>>>(pop.d_solutions, pop_size,
d_global_best, oc);
}
};
// 捕获 CUDA Graph首次
if (use_graph) {
CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
launch_batch_kernels(stream);
CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
// 5-arg form: compatible with CUDA 10+; 3-arg form requires CUDA 12+
#if CUDART_VERSION >= 12000
CUDA_CHECK(cudaGraphInstantiate(&graph_exec, graph, 0));
#else
CUDA_CHECK(cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
#endif
if (cfg.verbose) printf(" [CUDA Graph] captured and instantiated\n");
}
cudaEvent_t t_start, t_stop;
CUDA_CHECK(cudaEventCreate(&t_start));
CUDA_CHECK(cudaEventCreate(&t_stop));
CUDA_CHECK(cudaEventRecord(t_start));
// 时间感知 AOS窗口累积器
int win_seq_usage[MAX_SEQ] = {};
int win_seq_improve[MAX_SEQ] = {};
int win_k_usage[MAX_K] = {};
int win_k_improve[MAX_K] = {};
int batch_count = 0;
const int aos_interval = (cfg.aos_update_interval > 0) ? cfg.aos_update_interval : 1;
// v4.0: 约束导向 + 分层搜索
const bool use_constraint_directed = cfg.use_constraint_directed && use_aos;
const bool use_phased_search = cfg.use_phased_search && use_aos;
float base_max_w[MAX_SEQ];
for (int i = 0; i < seq_reg.count; i++) base_max_w[i] = seq_reg.max_w[i];
if (cfg.verbose && (use_constraint_directed || use_phased_search)) {
printf(" [P2] constraint_directed=%s phased_search=%s\n",
use_constraint_directed ? "ON" : "OFF",
use_phased_search ? "ON" : "OFF");
if (use_phased_search)
printf(" [P2] phases: explore=[0,%.0f%%) transition=[%.0f%%,%.0f%%) refine=[%.0f%%,100%%]\n",
cfg.phase_explore_end * 100, cfg.phase_explore_end * 100,
cfg.phase_refine_start * 100, cfg.phase_refine_start * 100);
}
while (gen_done < cfg.max_gen) {
int gens = batch;
if (gen_done + gens > cfg.max_gen) gens = cfg.max_gen - gen_done;
float temp = use_sa ? cfg.sa_temp_init * powf(cfg.sa_alpha, (float)gen_done) : 0.0f;
// 更新 device 端可变参数
h_params.temp_start = temp;
h_params.gens_per_batch = gens;
h_params.seq_reg = seq_reg;
h_params.kstep = kstep;
h_params.migrate_round = migrate_round;
CUDA_CHECK(cudaMemcpy(d_params, &h_params, sizeof(EvolveParams), cudaMemcpyHostToDevice));
// 发射 GPU kernel 序列
if (use_graph) {
CUDA_CHECK(cudaGraphLaunch(graph_exec, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
} else {
launch_batch_kernels(nullptr);
}
gen_done += gens;
if (use_islands) migrate_round++;
batch_count++;
// AOS: 两层权重更新EMA+ 停滞检测
if (use_aos && (batch_count % aos_interval == 0)) {
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemcpy(h_aos_stats, d_aos_stats,
sizeof(AOSStats) * pop_size,
cudaMemcpyDeviceToHost));
// --- 聚合当前 batch 的统计到窗口累积器 ---
for (int b = 0; b < pop_size; b++) {
for (int i = 0; i < seq_reg.count; i++) {
win_seq_usage[i] += h_aos_stats[b].usage[i];
win_seq_improve[i] += h_aos_stats[b].improvement[i];
}
for (int i = 0; i < MAX_K; i++) {
win_k_usage[i] += h_aos_stats[b].k_usage[i];
win_k_improve[i] += h_aos_stats[b].k_improvement[i];
}
}
constexpr float AOS_ALPHA = 0.6f;
// --- v4.0: 约束导向 — 计算种群约束违反率 ---
float penalty_ratio = 0.0f;
if (use_constraint_directed) {
Sol* h_pop_snap = new Sol[pop_size];
CUDA_CHECK(cudaMemcpy(h_pop_snap, pop.d_solutions,
sizeof(Sol) * pop_size, cudaMemcpyDeviceToHost));
int infeasible = 0;
for (int b = 0; b < pop_size; b++) {
if (h_pop_snap[b].penalty > 0.0f) infeasible++;
}
penalty_ratio = (float)infeasible / (float)pop_size;
delete[] h_pop_snap;
}
// --- v4.0: 分层搜索 — 计算当前阶段的 floor/cap 调整 ---
float phase_floor_mult = 1.0f;
float phase_cap_mult = 1.0f;
if (use_phased_search) {
float progress;
if (use_time_limit && cfg.time_limit_sec > 0.0f) {
float elapsed_ms = 0.0f;
CUDA_CHECK(cudaEventRecord(t_stop));
CUDA_CHECK(cudaEventSynchronize(t_stop));
CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, t_start, t_stop));
progress = elapsed_ms / (cfg.time_limit_sec * 1000.0f);
if (progress > 1.0f) progress = 1.0f;
} else {
progress = (float)gen_done / (float)cfg.max_gen;
}
if (progress < cfg.phase_explore_end) {
phase_floor_mult = 1.5f; // 探索期:抬高 floor → 更均匀
phase_cap_mult = 0.7f; // 探索期:压低 cap → 防止过早集中
} else if (progress >= cfg.phase_refine_start) {
phase_floor_mult = 0.5f; // 精细期:降低 floor → 允许弱算子退出
phase_cap_mult = 1.5f; // 精细期:抬高 cap → 集中利用强算子
}
}
// --- 第二层算子权重更新EMA ---
{
float new_w[MAX_SEQ];
float sum = 0.0f;
for (int i = 0; i < seq_reg.count; i++) {
float signal = (win_seq_usage[i] > 0)
? (float)win_seq_improve[i] / (float)win_seq_usage[i]
: 0.0f;
new_w[i] = AOS_ALPHA * seq_reg.weights[i]
+ (1.0f - AOS_ALPHA) * (signal + AOS_WEIGHT_FLOOR);
sum += new_w[i];
}
if (sum > 0.0f) {
for (int i = 0; i < seq_reg.count; i++)
seq_reg.weights[i] = new_w[i] / sum;
}
float uniform = 1.0f / seq_reg.count;
float base_floor = cfg.aos_weight_floor / seq_reg.count;
if (base_floor < uniform * 0.5f) base_floor = uniform * 0.5f;
float floor_val = base_floor * phase_floor_mult;
float global_cap = cfg.aos_weight_cap * phase_cap_mult;
// --- v4.0: 约束导向 — boost 跨行/行级算子权重 + 放宽 cap ---
if (use_constraint_directed && penalty_ratio > 0.1f) {
float boost = 1.0f + (penalty_ratio - 0.1f) / 0.9f
* (cfg.constraint_boost_max - 1.0f);
for (int i = 0; i < seq_reg.count; i++) {
if (seq_reg.categories[i] == SeqCategory::CrossRow ||
seq_reg.categories[i] == SeqCategory::RowLevel) {
seq_reg.weights[i] *= boost;
float orig = (base_max_w[i] > 0.0f) ? base_max_w[i] : AOS_WEIGHT_CAP;
seq_reg.max_w[i] = orig * boost;
}
}
float wsum = 0.0f;
for (int i = 0; i < seq_reg.count; i++) wsum += seq_reg.weights[i];
if (wsum > 0.0f)
for (int i = 0; i < seq_reg.count; i++) seq_reg.weights[i] /= wsum;
} else if (use_constraint_directed) {
for (int i = 0; i < seq_reg.count; i++)
seq_reg.max_w[i] = base_max_w[i];
}
bool need_renorm = false;
for (int i = 0; i < seq_reg.count; i++) {
float cap_val = (seq_reg.max_w[i] > 0.0f) ? seq_reg.max_w[i] : global_cap;
if (seq_reg.weights[i] < floor_val) { seq_reg.weights[i] = floor_val; need_renorm = true; }
if (seq_reg.weights[i] > cap_val) { seq_reg.weights[i] = cap_val; need_renorm = true; }
}
if (need_renorm) {
sum = 0.0f;
for (int i = 0; i < seq_reg.count; i++) sum += seq_reg.weights[i];
if (sum > 0.0f) for (int i = 0; i < seq_reg.count; i++) seq_reg.weights[i] /= sum;
}
}
// --- 第一层K 步数权重更新EMA ---
{
float new_w[MAX_K];
float sum = 0.0f;
for (int i = 0; i < MAX_K; i++) {
float rate = (win_k_usage[i] > 0)
? (float)win_k_improve[i] / (float)win_k_usage[i]
: 0.0f;
new_w[i] = AOS_ALPHA * kstep.weights[i]
+ (1.0f - AOS_ALPHA) * (rate + AOS_WEIGHT_FLOOR);
sum += new_w[i];
}
if (sum > 0.0f) {
for (int i = 0; i < MAX_K; i++)
kstep.weights[i] = new_w[i] / sum;
}
float floor_val = cfg.aos_weight_floor;
float cap_val = 0.95f;
bool need_renorm = false;
for (int i = 0; i < MAX_K; i++) {
if (kstep.weights[i] < floor_val) { kstep.weights[i] = floor_val; need_renorm = true; }
if (kstep.weights[i] > cap_val) { kstep.weights[i] = cap_val; need_renorm = true; }
}
if (need_renorm) {
sum = 0.0f;
for (int i = 0; i < MAX_K; i++) sum += kstep.weights[i];
if (sum > 0.0f) for (int i = 0; i < MAX_K; i++) kstep.weights[i] /= sum;
}
}
// --- Debug: 前 5 个 batch 打印统计 ---
if (cfg.verbose && gen_done <= batch * 5) {
fprintf(stderr, " [AOS batch g=%d] usage:", gen_done);
for (int i = 0; i < seq_reg.count; i++) fprintf(stderr, " %d", win_seq_usage[i]);
fprintf(stderr, " | improve:");
for (int i = 0; i < seq_reg.count; i++) fprintf(stderr, " %d", win_seq_improve[i]);
fprintf(stderr, " | w:");
for (int i = 0; i < seq_reg.count; i++) fprintf(stderr, " %.3f", seq_reg.weights[i]);
fprintf(stderr, " | K: %.2f/%.2f/%.2f stag=%d",
kstep.weights[0], kstep.weights[1], kstep.weights[2], kstep.stagnation_count);
if (use_constraint_directed)
fprintf(stderr, " | pen=%.1f%%", penalty_ratio * 100.0f);
if (use_phased_search)
fprintf(stderr, " | phase_f=%.2f phase_c=%.2f", phase_floor_mult, phase_cap_mult);
fprintf(stderr, "\n");
}
// --- 停滞检测 ---
{
int total_improve_all = 0;
for (int i = 0; i < seq_reg.count; i++)
total_improve_all += win_seq_improve[i];
if (total_improve_all == 0) {
kstep.stagnation_count++;
} else {
kstep.stagnation_count = 0;
}
if (kstep.stagnation_count >= kstep.stagnation_limit) {
kstep.weights[0] = 0.80f;
kstep.weights[1] = 0.15f;
kstep.weights[2] = 0.05f;
kstep.stagnation_count = 0;
}
}
// --- 清零窗口累积器 ---
memset(win_seq_usage, 0, sizeof(win_seq_usage));
memset(win_seq_improve, 0, sizeof(win_seq_improve));
memset(win_k_usage, 0, sizeof(win_k_usage));
memset(win_k_improve, 0, sizeof(win_k_improve));
}
// --- 关系矩阵更新(每个 batch 间隙,从种群 top-K 解统计)---
// 多个好解贡献 G/O 信号,加速矩阵信息积累
if (use_relation_matrix) {
if (!use_aos) {
CUDA_CHECK(cudaDeviceSynchronize());
}
// 下载整个种群的目标值,找 top-K
constexpr int REL_TOP_K = 4;
int top_indices[REL_TOP_K];
{
// 简单方法:下载所有解的 scalar 目标host 端排序取 top-K
float* h_scores = new float[pop_size];
Sol* h_pop_ptr = new Sol[pop_size];
CUDA_CHECK(cudaMemcpy(h_pop_ptr, pop.d_solutions,
sizeof(Sol) * pop_size, cudaMemcpyDeviceToHost));
for (int b = 0; b < pop_size; b++) {
h_scores[b] = scalar_objective(h_pop_ptr[b], oc);
if (h_pop_ptr[b].penalty > 0.0f) h_scores[b] = 1e30f;
}
// 找 top-K 最小值
for (int k = 0; k < REL_TOP_K && k < pop_size; k++) {
int mi = 0;
for (int b = 1; b < pop_size; b++) {
if (h_scores[b] < h_scores[mi]) mi = b;
}
top_indices[k] = mi;
h_scores[mi] = 1e30f; // 标记已选
}
// 从 top-K 解更新 G/O
int actual_k = (pop_size < REL_TOP_K) ? pop_size : REL_TOP_K;
for (int k = 0; k < actual_k; k++) {
relation_matrix_update(rel_mat, h_pop_ptr[top_indices[k]], pcfg.dim1);
}
delete[] h_scores;
delete[] h_pop_ptr;
}
relation_matrix_upload(rel_mat);
}
// 交叉 / 迁移 / 精英注入 已在 launch_batch_kernels 中统一发射
// --- 时间限制检查 ---
if (use_time_limit) {
CUDA_CHECK(cudaEventRecord(t_stop));
CUDA_CHECK(cudaEventSynchronize(t_stop));
float ms_so_far = 0;
CUDA_CHECK(cudaEventElapsedTime(&ms_so_far, t_start, t_stop));
if (ms_so_far >= cfg.time_limit_sec * 1000.0f) {
stop_reason = StopReason::TimeLimit;
if (cfg.verbose) printf(" [STOP] time limit %.1fs reached at gen %d\n",
cfg.time_limit_sec, gen_done);
break;
}
}
// --- 收敛检测 + reheat ---
if (use_stagnation) {
find_best_kernel<<<1, 1>>>(pop.d_solutions, pop_size, oc, d_best_idx);
CUDA_CHECK(cudaDeviceSynchronize());
int bi; CUDA_CHECK(cudaMemcpy(&bi, d_best_idx, sizeof(int), cudaMemcpyDeviceToHost));
Sol cur_best = pop.download_solution(bi);
float cur_scalar = scalar_objective(cur_best, oc);
if (cur_best.penalty > 0.0f) cur_scalar = 1e30f;
constexpr float IMPROVE_EPS = 1e-6f;
if (prev_best_scalar - cur_scalar > IMPROVE_EPS) {
prev_best_scalar = cur_scalar;
stagnation_count = 0;
} else {
stagnation_count++;
}
if (stagnation_count >= cfg.stagnation_limit) {
if (use_sa && cfg.reheat_ratio > 0.0f) {
// Reheat: bring the temperature back to reheat_ratio times the initial temperature.
// Implemented through gen_done, because temp = sa_temp_init * sa_alpha^gen_done.
float target_temp = cfg.sa_temp_init * cfg.reheat_ratio;
int reheat_gen = (int)(logf(target_temp / cfg.sa_temp_init) / logf(cfg.sa_alpha));
if (reheat_gen < 0) reheat_gen = 0;
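// No separate temperature offset is stored: the next batch recomputes temp from
// gen_done, so assigning gen_done below is what actually changes the temperature.
// Note that this also moves the loop's gen_done < max_gen termination check.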
if (cfg.verbose) {
float cur_temp = cfg.sa_temp_init * powf(cfg.sa_alpha, (float)gen_done);
printf(" [REHEAT] stagnation=%d at gen %d, temp %.4f → %.4f\n",
cfg.stagnation_limit, gen_done, cur_temp, target_temp);
}
// Set gen_done to the generation whose temperature equals target_temp,
// clamped to no less than half of the generations completed so far.
int min_gen = gen_done / 2;
if (reheat_gen < min_gen) reheat_gen = min_gen;
gen_done = reheat_gen;
stagnation_count = 0;
} else {
// 无 SA 时,收敛检测触发 → 提前终止
stop_reason = StopReason::Stagnation;
if (cfg.verbose) printf(" [STOP] stagnation=%d at gen %d, no SA to reheat\n",
cfg.stagnation_limit, gen_done);
break;
}
}
}
// 打印进度
if (cfg.verbose && gen_done % cfg.print_every == 0) {
if (!use_stagnation) {
find_best_kernel<<<1, 1>>>(pop.d_solutions, pop_size, oc, d_best_idx);
CUDA_CHECK(cudaDeviceSynchronize());
}
int idx; CUDA_CHECK(cudaMemcpy(&idx, d_best_idx, sizeof(int), cudaMemcpyDeviceToHost));
Sol best = pop.download_solution(idx);
printf(" [%5d]", gen_done);
for (int i = 0; i < pcfg.num_objectives; i++)
printf(" %.1f", best.objectives[i]);
if (best.penalty > 0.0f) printf(" P=%.1f", best.penalty);
printf("\n");
}
}
CUDA_CHECK(cudaEventRecord(t_stop));
CUDA_CHECK(cudaEventSynchronize(t_stop));
float elapsed_ms = 0;
CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, t_start, t_stop));
// --- 4. 最终结果 ---
Sol best;
if (use_sa) {
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemcpy(&best, d_global_best, sizeof(Sol), cudaMemcpyDeviceToHost));
} else {
find_best_kernel<<<1, 1>>>(pop.d_solutions, pop_size, oc, d_best_idx);
CUDA_CHECK(cudaDeviceSynchronize());
int h_best_idx;
CUDA_CHECK(cudaMemcpy(&h_best_idx, d_best_idx, sizeof(int), cudaMemcpyDeviceToHost));
best = pop.download_solution(h_best_idx);
}
if (cfg.verbose) {
const char* reason_str = stop_reason == StopReason::TimeLimit ? " [time]" :
stop_reason == StopReason::Stagnation ? " [stag]" : "";
printf(" Result:");
for (int i = 0; i < pcfg.num_objectives; i++)
printf(" obj%d=%.2f", i, best.objectives[i]);
if (best.penalty > 0.0f) printf(" INFEASIBLE(%.2f)", best.penalty);
printf(" %.0fms %dgen%s\n", elapsed_ms, gen_done, reason_str);
}
if (cfg.verbose) {
for (int r = 0; r < pcfg.dim1; r++) {
printf(" row[%d]:", r);
int show = best.dim2_sizes[r] < 20 ? best.dim2_sizes[r] : 20;
for (int c = 0; c < show; c++) printf(" %d", best.data[r][c]);
if (best.dim2_sizes[r] > 20) printf(" ...(%d)", best.dim2_sizes[r]);
printf("\n");
}
}
// AOS: 打印最终两层权重
if (use_aos && cfg.verbose) {
printf(" AOS K-step weights: K1=%.3f K2=%.3f K3=%.3f\n",
kstep.weights[0], kstep.weights[1], kstep.weights[2]);
printf(" AOS seq weights:");
for (int i = 0; i < seq_reg.count; i++)
printf(" [%d]=%.3f", seq_reg.ids[i], seq_reg.weights[i]);
printf("\n");
}
// 填充返回值
result.best_solution = best;
result.elapsed_ms = elapsed_ms;
result.generations = gen_done;
result.stop_reason = stop_reason;
CUDA_CHECK(cudaFree(d_best_idx));
if (d_global_best) CUDA_CHECK(cudaFree(d_global_best));
if (d_aos_stats) CUDA_CHECK(cudaFree(d_aos_stats));
if (h_aos_stats) delete[] h_aos_stats;
if (use_relation_matrix) relation_matrix_destroy(rel_mat);
CUDA_CHECK(cudaFree(d_params));
if (graph_exec) CUDA_CHECK(cudaGraphExecDestroy(graph_exec));
if (graph) CUDA_CHECK(cudaGraphDestroy(graph));
if (stream) CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(cudaEventDestroy(t_start));
CUDA_CHECK(cudaEventDestroy(t_stop));
return result;
}