mirror of
https://github.com/L-yang-yang/cugenopt.git
synced 2026-04-25 12:16:21 +02:00
Initial commit: cuGenOpt GPU optimization solver
This commit is contained in:
commit
fc5a0ff4af
117 changed files with 25545 additions and 0 deletions
90
python/cugenopt/include/core/cuda_utils.cuh
Normal file
90
python/cugenopt/include/core/cuda_utils.cuh
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* cuda_utils.cuh - CUDA 工具集
|
||||
*
|
||||
* 职责:错误检查、设备信息、随机数工具
|
||||
* 规则:所有 CUDA API 调用都必须用 CUDA_CHECK 包裹
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <curand_kernel.h>
|
||||
|
||||
// ============================================================
|
||||
// 错误检查
|
||||
// ============================================================
|
||||
|
||||
// Wrap a CUDA runtime API call: on failure, print the location and error
// string to stderr and terminate the process.
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t cuda_check_err_ = (call);                       \
        if (cudaSuccess != cuda_check_err_) {                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",            \
                    __FILE__, __LINE__,                             \
                    cudaGetErrorString(cuda_check_err_));           \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while(0)
|
||||
|
||||
// Check after a kernel launch. Note: cudaGetLastError() reports launch
// configuration errors immediately; errors from asynchronous kernel
// execution only surface at the next synchronizing call.
#define CUDA_CHECK_LAST()                                           \
    do {                                                            \
        cudaError_t last_err_ = cudaGetLastError();                 \
        if (cudaSuccess != last_err_) {                             \
            fprintf(stderr, "CUDA kernel error at %s:%d: %s\n",     \
                    __FILE__, __LINE__,                             \
                    cudaGetErrorString(last_err_));                 \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while(0)
|
||||
|
||||
// ============================================================
|
||||
// 设备信息
|
||||
// ============================================================
|
||||
|
||||
// Query the currently active CUDA device and print a short hardware summary.
inline void print_device_info() {
    int dev_id = 0;
    CUDA_CHECK(cudaGetDevice(&dev_id));
    cudaDeviceProp props;
    CUDA_CHECK(cudaGetDeviceProperties(&props, dev_id));

    printf("GPU: %s\n", props.name);
    printf(" SM count: %d\n", props.multiProcessorCount);
    printf(" Max threads/SM: %d\n", props.maxThreadsPerMultiProcessor);
    printf(" Shared mem/blk: %zu KB\n", props.sharedMemPerBlock / 1024);
    printf(" Global mem: %.1f GB\n", props.totalGlobalMem / 1e9);
    printf(" Compute cap: %d.%d\n", props.major, props.minor);
}
|
||||
|
||||
// ============================================================
|
||||
// 随机数工具 (Device 端)
|
||||
// ============================================================
|
||||
|
||||
// Initialize one curand state per thread index: shared seed, sequence = tid.
__global__ void init_curand_kernel(curandState* states, unsigned long long seed, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    curand_init(seed, idx, 0, &states[idx]);
}
|
||||
|
||||
// Device-side: uniform random integer in [0, bound).
// Fix: guard bound <= 0 (returns 0) to avoid undefined modulo-by-zero /
// negative-modulus behavior. NOTE(review): plain modulo keeps a slight bias
// for bounds that do not evenly divide 2^32 — acceptable for heuristics.
__device__ inline int rand_int(curandState* state, int bound) {
    if (bound <= 0) return 0;
    return (int)(curand(state) % (unsigned int)bound);
}
|
||||
|
||||
// Device-side Fisher-Yates shuffle: random permutation of arr[0..n-1].
__device__ inline void shuffle(int* arr, int n, curandState* state) {
    for (int i = n - 1; i > 0; --i) {
        const int j = rand_int(state, i + 1);
        const int t = arr[j];
        arr[j] = arr[i];
        arr[i] = t;
    }
}
|
||||
|
||||
// ============================================================
|
||||
// Kernel 启动参数计算
|
||||
// ============================================================
|
||||
|
||||
// Ceiling division for non-negative operands.
// Fix: written as a / b + (a % b != 0) to avoid the intermediate overflow
// the classic (a + b - 1) / b form hits when a is close to INT_MAX.
inline int div_ceil(int a, int b) { return a / b + (a % b != 0); }
|
||||
|
||||
// Compute the number of thread blocks needed to cover n work items,
// i.e. ceil(n / block_size).
inline int calc_grid_size(int n, int block_size = 256) {
    return div_ceil(n, block_size);
}
|
||||
141
python/cugenopt/include/core/gpu_cache.cuh
Normal file
141
python/cugenopt/include/core/gpu_cache.cuh
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
/**
|
||||
* gpu_cache.cuh - GPU 全局内存哈希表(通用缓存组件)
|
||||
*
|
||||
* 设计:
|
||||
* - 开放寻址,固定容量(power of 2),线性探测
|
||||
* - key = uint64_t(由 Problem 自行计算 hash)
|
||||
* - value = float(单个指标值)
|
||||
* - 无锁:允许 race condition(缓存语义,偶尔脏读可接受)
|
||||
* - 自带命中/未命中原子计数器
|
||||
*
|
||||
* 用法:
|
||||
* GpuCache cache = GpuCache::allocate(65536); // host
|
||||
* // ... pass cache as Problem member to kernels ...
|
||||
* cache.print_stats(); // host
|
||||
* cache.destroy(); // host
|
||||
*
|
||||
* 参考:scute 项目 LRUCache(key = metric_type + content_hash)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "cuda_utils.cuh"
|
||||
#include <cstdint>
|
||||
|
||||
// ============================================================
|
||||
// 常量
|
||||
// ============================================================
|
||||
|
||||
static constexpr uint64_t CACHE_EMPTY_KEY = 0xFFFFFFFFFFFFFFFFULL;
|
||||
static constexpr int CACHE_MAX_PROBE = 8; // 最大线性探测步数
|
||||
|
||||
// ============================================================
|
||||
// GpuCache 结构体(POD,可安全拷贝到 kernel)
|
||||
// ============================================================
|
||||
|
||||
// GPU global-memory hash table (generic cache component).
//
// Design:
//  - open addressing, fixed power-of-two capacity, linear probing
//  - key = uint64_t (the Problem computes its own hash)
//  - value = float (a single metric value)
//  - lock-free: races are tolerated (cache semantics; an occasional dirty
//    read is acceptable)
//  - built-in atomic hit/miss counters
//
// Usage:
//   GpuCache cache = GpuCache::allocate(65536); // host
//   // ... pass cache by value as a Problem member to kernels ...
//   cache.print_stats();                        // host
//   cache.destroy();                            // host
struct GpuCache {
    uint64_t* keys;          // device global memory, `capacity` slots
    float* values;           // device global memory, `capacity` slots
    unsigned int* d_hits;    // atomic hit counter (device)
    unsigned int* d_misses;  // atomic miss counter (device)
    int capacity;            // always a power of two
    int mask;                // = capacity - 1

    // ---- Host-side operations ----

    // Allocate a cache with at least `cap` slots.
    // Fix: the capacity is now rounded up to the next power of two, so the
    // mask-based indexing in cache_lookup/cache_insert remains correct even
    // when the caller passes a non-power-of-two size (previously the
    // documented precondition was silently unenforced).
    static GpuCache allocate(int cap = 65536) {
        int pow2 = 1;
        while (pow2 < cap) pow2 <<= 1;
        GpuCache c;
        c.capacity = pow2;
        c.mask = pow2 - 1;
        CUDA_CHECK(cudaMalloc(&c.keys, sizeof(uint64_t) * pow2));
        CUDA_CHECK(cudaMalloc(&c.values, sizeof(float) * pow2));
        CUDA_CHECK(cudaMalloc(&c.d_hits, sizeof(unsigned int)));
        CUDA_CHECK(cudaMalloc(&c.d_misses, sizeof(unsigned int)));
        c.clear();
        return c;
    }

    // A cache that is never consulted: all pointers null, is_enabled() false.
    static GpuCache disabled() {
        GpuCache c;
        c.keys = nullptr; c.values = nullptr;
        c.d_hits = nullptr; c.d_misses = nullptr;
        c.capacity = 0; c.mask = 0;
        return c;
    }

    bool is_enabled() const { return keys != nullptr; }

    // Mark every slot empty (0xFF byte fill == CACHE_EMPTY_KEY) and zero the
    // counters. Fix: safe no-op on a disabled cache (previously would have
    // called cudaMemset on null pointers).
    void clear() {
        if (!keys) return;
        CUDA_CHECK(cudaMemset(keys, 0xFF, sizeof(uint64_t) * capacity));
        CUDA_CHECK(cudaMemset(d_hits, 0, sizeof(unsigned int)));
        CUDA_CHECK(cudaMemset(d_misses, 0, sizeof(unsigned int)));
    }

    // Release device memory; idempotent.
    void destroy() {
        if (keys) cudaFree(keys);
        if (values) cudaFree(values);
        if (d_hits) cudaFree(d_hits);
        if (d_misses) cudaFree(d_misses);
        keys = nullptr; values = nullptr;
        d_hits = nullptr; d_misses = nullptr;
    }

    // Download the counters and print a hit-rate / footprint summary.
    void print_stats() const {
        if (!keys) { printf(" Cache: disabled\n"); return; }
        unsigned int h = 0, m = 0;
        CUDA_CHECK(cudaMemcpy(&h, d_hits, sizeof(unsigned int), cudaMemcpyDeviceToHost));
        CUDA_CHECK(cudaMemcpy(&m, d_misses, sizeof(unsigned int), cudaMemcpyDeviceToHost));
        unsigned int total = h + m;
        float rate = total > 0 ? (float)h / total * 100.0f : 0.0f;
        printf(" Cache: %u lookups | %u hits + %u misses | hit rate = %.1f%%\n",
               total, h, m, rate);
        printf(" Cache: capacity = %d entries (%.1f KB)\n",
               capacity, capacity * (sizeof(uint64_t) + sizeof(float)) / 1024.0f);
    }
};
|
||||
|
||||
// ============================================================
|
||||
// Device 函数:哈希 / 查找 / 插入
|
||||
// ============================================================
|
||||
|
||||
/// FNV-1a hash over an ordered int sequence (e.g. the customer IDs of a route).
__device__ inline uint64_t route_hash(const int* data, int len) {
    const uint64_t kFnvPrime = 1099511628211ULL;
    uint64_t h = 14695981039346656037ULL;  // FNV offset basis
    for (int i = 0; i < len; ++i) {
        h = (h ^ (uint64_t)(unsigned int)data[i]) * kFnvPrime;
    }
    // Remap so the result can never collide with the empty-slot sentinel.
    if (h == CACHE_EMPTY_KEY) h -= 1;
    return h;
}
|
||||
|
||||
/// Look up `key`; on a hit, writes the cached value into `out` and returns true.
__device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out) {
    const int start = (int)(key & (uint64_t)c.mask);
    for (int step = 0; step < CACHE_MAX_PROBE; ++step) {
        const int pos = (start + step) & c.mask;
        const uint64_t stored = c.keys[pos];
        if (stored == CACHE_EMPTY_KEY) return false;  // empty slot => definitely absent
        if (stored == key) {
            out = c.values[pos];
            return true;
        }
    }
    return false;  // probe budget exhausted
}
|
||||
|
||||
/// Insert a key/value pair; an existing identical key is overwritten. If every
/// probed slot holds a different key, the slot at the probe start is evicted.
__device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value) {
    const int start = (int)(key & (uint64_t)c.mask);  // already masked
    for (int step = 0; step < CACHE_MAX_PROBE; ++step) {
        const int pos = (start + step) & c.mask;
        const uint64_t stored = c.keys[pos];
        if (stored == CACHE_EMPTY_KEY || stored == key) {
            c.keys[pos] = key;
            c.values[pos] = value;
            return;
        }
    }
    // All probes occupied by other keys: evict the first probed slot.
    c.keys[start] = key;
    c.values[start] = value;
}
|
||||
121
python/cugenopt/include/core/init_heuristic.cuh
Normal file
121
python/cugenopt/include/core/init_heuristic.cuh
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
|
||||
namespace heuristic_init {
|
||||
|
||||
// Single-row permutation layout: fill every row of `sol` with the same
// ordering (`dim2` entries each) and zero penalty/objectives.
template<typename Sol>
static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
                                     int dim1, int dim2) {
    for (int row = 0; row < dim1; ++row) {
        sol.dim2_sizes[row] = dim2;
        for (int col = 0; col < dim2; ++col) {
            sol.data[row][col] = order[col];
        }
    }
    sol.penalty = 0.0f;
    for (int m = 0; m < MAX_OBJ; ++m) sol.objectives[m] = 0.0f;
}
|
||||
|
||||
// Partition mode: split the ordering evenly across dim1 rows with no element
// repeated; the first (total_elements % dim1) rows take one extra element.
template<typename Sol>
static void build_partition_from_order(Sol& sol, const std::vector<int>& order,
                                       int dim1, int total_elements) {
    const int base = total_elements / dim1;
    const int extra = total_elements % dim1;
    int next = 0;
    for (int row = 0; row < dim1; ++row) {
        const int count = base + (row < extra ? 1 : 0);
        sol.dim2_sizes[row] = count;
        for (int col = 0; col < count; ++col) {
            sol.data[row][col] = order[next++];
        }
    }
    sol.penalty = 0.0f;
    for (int m = 0; m < MAX_OBJ; ++m) sol.objectives[m] = 0.0f;
}
|
||||
|
||||
// Build heuristic initial solutions from pairwise matrices (e.g. distance
// matrices). For each matrix, four candidate orderings are produced:
// row-sum ascending/descending and column-sum ascending/descending.
//
// Permutation encoding only; returns an empty vector otherwise.
// In partition mode (e.g. VRPTW) the matrix may include a depot at index 0:
// when N > elem_count, indices 1..elem_count are sorted and then shifted
// down to 0-based customer IDs.
//
// Fix: the four previously copy-pasted sort blocks are deduplicated into a
// single four-pass loop (same output order: row asc, row desc, col asc,
// col desc).
template<typename Sol>
std::vector<Sol> build_from_matrices(const HeuristicMatrix* matrices, int num_matrices,
                                     int dim1, int dim2, EncodingType encoding,
                                     bool partition_mode = false, int total_elements = 0) {
    std::vector<Sol> results;
    if (encoding != EncodingType::Permutation) return results;
    int elem_count = partition_mode ? total_elements : dim2;
    if (num_matrices <= 0 || elem_count <= 0) return results;

    // Turn an element ordering into a fully initialized solution.
    auto make_sol = [&](const std::vector<int>& order) {
        Sol sol{};
        if (partition_mode)
            build_partition_from_order(sol, order, dim1, total_elements);
        else
            build_sorted_permutation(sol, order, dim1, dim2);
        return sol;
    };

    for (int m = 0; m < num_matrices; m++) {
        const float* mat = matrices[m].data;
        int N = matrices[m].N;
        if (!mat || N < elem_count) continue;

        // Row/column sums act as the sort keys.
        std::vector<float> row_sum(N, 0.0f);
        std::vector<float> col_sum(N, 0.0f);
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                row_sum[i] += mat[i * N + j];
                col_sum[j] += mat[i * N + j];
            }

        // Depot handling: with a depot occupying index 0 (partition mode and
        // N > elem_count), only customer indices 1..elem_count are sorted.
        const bool has_depot = partition_mode && N > elem_count;
        std::vector<int> idx;
        if (has_depot) {
            for (int i = 1; i <= elem_count; i++) idx.push_back(i);
        } else {
            idx.resize(elem_count);
            std::iota(idx.begin(), idx.end(), 0);
        }

        // Shift matrix indices back to 0-based customer IDs when a depot
        // occupied index 0.
        auto to_customer = [&](std::vector<int>& order) {
            if (has_depot)
                for (auto& v : order) v -= 1;
        };

        // Pass 0/1: sort by row_sum (asc/desc); pass 2/3: by col_sum (asc/desc).
        for (int pass = 0; pass < 4; pass++) {
            const std::vector<float>& key = (pass < 2) ? row_sum : col_sum;
            const bool ascending = (pass % 2 == 0);
            auto order = idx;
            std::sort(order.begin(), order.end(), [&](int a, int b) {
                return ascending ? key[a] < key[b] : key[a] > key[b];
            });
            to_customer(order);
            results.push_back(make_sol(order));
        }
    }
    return results;
}
|
||||
|
||||
} // namespace heuristic_init
|
||||
258
python/cugenopt/include/core/init_selection.cuh
Normal file
258
python/cugenopt/include/core/init_selection.cuh
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
/**
|
||||
* init_selection.cuh - 初始解采样择优 + NSGA-II 选择
|
||||
*
|
||||
* Host 端逻辑,在 solver 初始化阶段调用一次。
|
||||
* 从 K × pop_size 个候选解中选出 pop_size 个作为初始种群。
|
||||
*
|
||||
* 选择策略:
|
||||
* 1. 核心目标预留名额(按 importance 分配)
|
||||
* 2. NSGA-II 选择(非支配排序 + 加权拥挤度)
|
||||
* 3. 纯随机保底(多样性)
|
||||
*
|
||||
* 单目标时自动退化为 top-N 排序,无需分支。
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
|
||||
namespace init_sel {
|
||||
|
||||
// ============================================================
|
||||
// 候选解的目标信息(从 GPU 下载后在 host 端使用)
|
||||
// ============================================================
|
||||
// Per-candidate objective information, downloaded from the GPU and used on
// the host side during initial-population selection.
struct CandidateInfo {
    int idx;              // original index into the candidate array
    float objs[MAX_OBJ];  // normalized objective values (lower is better)
    float penalty;        // constraint violation; <= 0 is treated as feasible
    int rank;             // non-dominated sorting level (0 = Pareto front)
    float crowding;       // crowding distance
    bool selected;        // whether this candidate has already been selected
};
|
||||
|
||||
// ============================================================
|
||||
// 非支配排序(Fast Non-dominated Sort)
|
||||
// ============================================================
|
||||
// 复杂度:O(M × N²),M = 目标数,N = 候选数
|
||||
// 对初始化场景(N ≤ 几千,M ≤ 4)完全可接受
|
||||
|
||||
// Fast non-dominated sort. Complexity O(M * N^2) with M objectives and
// N candidates — entirely acceptable for the initialization phase
// (N up to a few thousand, M <= 4). Fills `fronts` with index lists, front 0
// first, and writes each candidate's rank.
inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
                                   int num_obj,
                                   std::vector<std::vector<int>>& fronts) {
    const int n = (int)cands.size();
    std::vector<int> dominated_by(n, 0);             // how many solutions dominate i
    std::vector<std::vector<int>> dominates_set(n);  // which solutions i dominates

    // a dominates b: feasible beats infeasible; between two infeasible
    // solutions the lower penalty wins; between two feasible solutions,
    // standard Pareto dominance (<= everywhere, < somewhere).
    auto dominates = [&](int a, int b) -> bool {
        const CandidateInfo& ca = cands[a];
        const CandidateInfo& cb = cands[b];
        const bool a_infeasible = ca.penalty > 0.0f;
        const bool b_infeasible = cb.penalty > 0.0f;
        if (!a_infeasible && b_infeasible) return true;
        if (a_infeasible && !b_infeasible) return false;
        if (a_infeasible && b_infeasible) return ca.penalty < cb.penalty;

        bool strictly_better_somewhere = false;
        for (int m = 0; m < num_obj; ++m) {
            if (ca.objs[m] > cb.objs[m]) return false;
            if (ca.objs[m] < cb.objs[m]) strictly_better_somewhere = true;
        }
        return strictly_better_somewhere;
    };

    // Build the pairwise domination structure.
    for (int i = 0; i < n; ++i) {
        for (int j = i + 1; j < n; ++j) {
            if (dominates(i, j)) {
                dominates_set[i].push_back(j);
                ++dominated_by[j];
            } else if (dominates(j, i)) {
                dominates_set[j].push_back(i);
                ++dominated_by[i];
            }
        }
    }

    // Front 0: nobody dominates these.
    fronts.clear();
    std::vector<int> frontier;
    for (int i = 0; i < n; ++i) {
        if (dominated_by[i] == 0) {
            cands[i].rank = 0;
            frontier.push_back(i);
        }
    }

    // Peel off successive fronts.
    for (int level = 0; !frontier.empty(); ++level) {
        fronts.push_back(frontier);
        std::vector<int> next;
        for (int i : frontier) {
            for (int j : dominates_set[i]) {
                if (--dominated_by[j] == 0) {
                    cands[j].rank = level + 1;
                    next.push_back(j);
                }
            }
        }
        frontier = std::move(next);
    }
}
||||
|
||||
// ============================================================
|
||||
// 加权拥挤度距离
|
||||
// ============================================================
|
||||
// 标准拥挤度 + importance 加权:核心目标维度上的间距贡献更大
|
||||
|
||||
// Weighted crowding distance: the standard NSGA-II crowding measure, with
// each objective's gap contribution scaled by its importance weight so that
// spacing on core objectives counts for more.
inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
                                       const std::vector<int>& front,
                                       int num_obj,
                                       const float* importance) {
    const int n = (int)front.size();
    if (n <= 2) {
        // Tiny front: everyone is a boundary solution -> effectively infinite.
        for (int i : front) cands[i].crowding = 1e18f;
        return;
    }

    for (int i : front) cands[i].crowding = 0.0f;

    std::vector<int> order(front.begin(), front.end());
    for (int m = 0; m < num_obj; ++m) {
        // Sort the front by objective m.
        std::sort(order.begin(), order.end(),
                  [&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });

        const float range = cands[order[n - 1]].objs[m] - cands[order[0]].objs[m];
        if (range < 1e-12f) continue;  // objective has no spread here

        // Boundary solutions get effectively infinite distance.
        cands[order[0]].crowding += 1e18f;
        cands[order[n - 1]].crowding += 1e18f;

        // Interior solutions: neighbor gap, importance-weighted.
        const float w = importance[m];
        for (int k = 1; k + 1 < n; ++k) {
            const float gap = cands[order[k + 1]].objs[m] - cands[order[k - 1]].objs[m];
            cands[order[k]].crowding += w * (gap / range);
        }
    }
}
|
||||
|
||||
// ============================================================
|
||||
// 主选择函数:从 N 个候选中选出 target 个
|
||||
// ============================================================
|
||||
// 返回被选中的候选索引
|
||||
|
||||
// Select (target - num_reserved_random) candidate indices in two phases:
//  1) per-objective reserved seats: the top candidates on each objective,
//     quota proportional to importance[m] * 30%;
//  2) NSGA-II fill of the remaining seats (non-dominated sort, then
//     weighted-crowding truncation of the last front that fits partially).
// Returns the chosen indices; marks each chosen candidate's `selected` flag.
inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
                                     int num_obj,
                                     const float* importance,
                                     int target,
                                     int num_reserved_random) {
    const int budget = target - num_reserved_random;  // seats this routine fills
    const float reserve_ratio = 0.3f;  // 30% importance-weighted reserve; rest NSGA-II

    std::vector<int> chosen;
    chosen.reserve(target);

    // --- Phase 1: reserved seats per objective ---
    const int n = (int)cands.size();
    for (int m = 0; m < num_obj; ++m) {
        int quota = (int)(budget * importance[m] * reserve_ratio);
        if (quota < 1 && num_obj > 1) quota = 1;  // at least one seat per objective

        // Rank all candidates by objective m (lower is better).
        std::vector<int> ranked(n);
        for (int i = 0; i < n; ++i) ranked[i] = i;
        std::sort(ranked.begin(), ranked.end(),
                  [&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });

        int taken = 0;
        for (int i = 0; i < n && taken < quota; ++i) {
            const int cand = ranked[i];
            if (cands[cand].selected) continue;
            cands[cand].selected = true;
            chosen.push_back(cand);
            ++taken;
        }
    }

    // --- Phase 2: NSGA-II selection for the remaining seats ---
    int remaining = budget - (int)chosen.size();
    if (remaining > 0) {
        std::vector<std::vector<int>> fronts;
        fast_nondominated_sort(cands, num_obj, fronts);

        for (auto& front : fronts) {
            if (remaining <= 0) break;

            // Skip candidates already taken in phase 1.
            std::vector<int> available;
            for (int i : front) {
                if (!cands[i].selected) available.push_back(i);
            }

            if ((int)available.size() <= remaining) {
                // The whole front fits.
                for (int i : available) {
                    cands[i].selected = true;
                    chosen.push_back(i);
                    --remaining;
                }
            } else {
                // Truncate the front: keep the most spread-out solutions.
                weighted_crowding_distance(cands, available, num_obj, importance);
                std::sort(available.begin(), available.end(),
                          [&](int a, int b) { return cands[a].crowding > cands[b].crowding; });
                for (int i = 0; i < remaining; ++i) {
                    cands[available[i]].selected = true;
                    chosen.push_back(available[i]);
                }
                remaining = 0;
            }
        }
    }

    return chosen;
}
|
||||
|
||||
// ============================================================
|
||||
// 单目标快速路径:直接按标量排序取 top
|
||||
// ============================================================
|
||||
// Single-objective fast path: rank candidates (feasible before infeasible,
// infeasible by penalty, feasible by objs[0]) and take the top
// (target - num_reserved_random). Marks chosen candidates as selected.
inline std::vector<int> top_n_select(std::vector<CandidateInfo>& cands,
                                     int target,
                                     int num_reserved_random) {
    const int want = target - num_reserved_random;

    std::vector<int> order(cands.size());
    for (size_t i = 0; i < order.size(); ++i) order[i] = (int)i;

    auto better = [&](int a, int b) {
        const bool a_infeasible = cands[a].penalty > 0.0f;
        const bool b_infeasible = cands[b].penalty > 0.0f;
        if (!a_infeasible && b_infeasible) return true;
        if (a_infeasible && !b_infeasible) return false;
        if (a_infeasible && b_infeasible) return cands[a].penalty < cands[b].penalty;
        return cands[a].objs[0] < cands[b].objs[0];  // already normalized, lower = better
    };
    std::sort(order.begin(), order.end(), better);

    std::vector<int> chosen;
    chosen.reserve(want);
    for (int i = 0; i < want && i < (int)order.size(); ++i) {
        chosen.push_back(order[i]);
        cands[order[i]].selected = true;
    }
    return chosen;
}
|
||||
|
||||
} // namespace init_sel
|
||||
1229
python/cugenopt/include/core/operators.cuh
Normal file
1229
python/cugenopt/include/core/operators.cuh
Normal file
File diff suppressed because it is too large
Load diff
212
python/cugenopt/include/core/population.cuh
Normal file
212
python/cugenopt/include/core/population.cuh
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
/**
|
||||
* population.cuh - 种群管理
|
||||
*
|
||||
* v2.0: Block 级架构
|
||||
* - RNG 数组大小 = pop_size * block_size(每个 block 内每个线程独立 RNG)
|
||||
* - 初始化 kernel 保持 1-thread-per-solution(初始化只做一次,不需要并行)
|
||||
* - find_best_kernel 保持单线程(种群规模不大)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
// ============================================================
|
||||
// Device 端 Kernel(模板化)
|
||||
// ============================================================
|
||||
|
||||
// One thread per solution: fill each row with the identity permutation
// [0, dim2_default), then Fisher-Yates shuffle it in place.
template<typename Sol>
__global__ void init_permutation_kernel(Sol* pop, int pop_size,
                                        int dim1, int dim2_default,
                                        curandState* rng_states) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;

    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    for (int row = 0; row < dim1; ++row) {
        sol.dim2_sizes[row] = dim2_default;
        for (int col = 0; col < dim2_default; ++col) sol.data[row][col] = col;
        shuffle(sol.data[row], dim2_default, rng);
    }
    sol.penalty = 0.0f;
}
|
||||
|
||||
// One thread per solution: fill each row with uniform random bits (0 or 1).
template<typename Sol>
__global__ void init_binary_kernel(Sol* pop, int pop_size,
                                   int dim1, int dim2_default,
                                   curandState* rng_states) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;

    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    for (int row = 0; row < dim1; ++row) {
        sol.dim2_sizes[row] = dim2_default;
        for (int col = 0; col < dim2_default; ++col) {
            sol.data[row][col] = curand(rng) % 2;
        }
    }
    sol.penalty = 0.0f;
}
|
||||
|
||||
// One thread per solution: fill each row with uniform random integers in
// [lb, ub].
// Fix: guard against an inverted bound pair (ub < lb), which previously
// produced range <= 0 and an undefined modulo; degenerate bounds now yield
// the constant lb.
template<typename Sol>
__global__ void init_integer_kernel(Sol* pop, int pop_size,
                                    int dim1, int dim2_default,
                                    int lb, int ub,
                                    curandState* rng_states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;
    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    int range = ub - lb + 1;
    if (range < 1) range = 1;  // degenerate bounds -> constant lb
    for (int r = 0; r < dim1; r++) {
        sol.dim2_sizes[r] = dim2_default;
        for (int c = 0; c < dim2_default; c++)
            sol.data[r][c] = lb + (int)(curand(rng) % (unsigned int)range);
    }
    sol.penalty = 0.0f;
}
|
||||
|
||||
// ============================================================
|
||||
// 多重集排列初始化 — 每个值 [0, N) 重复 R 次,总长度 N*R
|
||||
// ============================================================
|
||||
// 用于 JSP 工序排列编码:N=num_jobs, R=num_ops,值 j 出现 R 次表示工件 j
|
||||
|
||||
// One thread per solution: each row contains every value in [0, num_values)
// repeated repeat_count times (total length num_values * repeat_count),
// then shuffled. Used for the JSP operation-permutation encoding.
template<typename Sol>
__global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
                                          int dim1, int num_values, int repeat_count,
                                          curandState* rng_states) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;

    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    const int total = num_values * repeat_count;
    for (int row = 0; row < dim1; ++row) {
        sol.dim2_sizes[row] = total;
        int pos = 0;
        for (int v = 0; v < num_values; ++v)
            for (int rep = 0; rep < repeat_count; ++rep)
                sol.data[row][pos++] = v;
        shuffle(sol.data[row], total, rng);
    }
    sol.penalty = 0.0f;
}
|
||||
|
||||
// ============================================================
|
||||
// 分区初始化 — 元素 {0..total_elements-1} 不重复分配到 dim1 行
|
||||
// ============================================================
|
||||
|
||||
// One thread per solution: partition-mode initialization. Elements
// {0..total_elements-1} are assigned to dim1 rows without repetition by
// shuffling one full permutation and slicing it.
//
// Implementation note: row 0 doubles as scratch space — the whole shuffled
// permutation is built in sol.data[0], then rows r > 0 copy their slice out
// of it. Row 0's own slice is already in place, so it is skipped; its extra
// trailing entries are ignored because dim2_sizes[0] limits the row length.
// The statement order (fill row 0, shuffle, then slice) is load-bearing.
template<typename Sol>
__global__ void init_partition_kernel(Sol* pop, int pop_size,
                                      int dim1, int total_elements,
                                      curandState* rng_states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;
    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];

    // Build a random permutation of all elements in row 0 (scratch).
    for (int i = 0; i < total_elements; i++) sol.data[0][i] = i;
    shuffle(sol.data[0], total_elements, rng);

    // Slice the permutation across rows; earlier rows take the remainder.
    int idx = 0;
    for (int r = 0; r < dim1; r++) {
        int count = total_elements / dim1;
        if (r < total_elements % dim1) count++;
        sol.dim2_sizes[r] = count;
        if (r > 0) {
            // Row 0's slice is already in place; only copy for r > 0.
            for (int c = 0; c < count; c++)
                sol.data[r][c] = sol.data[0][idx + c];
        }
        idx += count;
    }

    sol.penalty = 0.0f;
}
|
||||
|
||||
// Single-threaded linear scan for the best solution index (the population is
// small, so a parallel reduction is not worth it here).
template<typename Sol>
__global__ void find_best_kernel(const Sol* pop, int pop_size,
                                 ObjConfig oc, int* best_idx) {
    if (blockIdx.x != 0 || threadIdx.x != 0) return;
    int champion = 0;
    for (int i = 1; i < pop_size; i++) {
        if (is_better(pop[i], pop[champion], oc)) champion = i;
    }
    *best_idx = champion;
}
|
||||
|
||||
// ============================================================
|
||||
// Host 端 RAII 类(模板化)
|
||||
// ============================================================
|
||||
|
||||
// Host-side RAII owner of the device population and its RNG states.
//
// Block-level architecture: the RNG array holds pop_size * block_size states
// (one per thread per block); initialization kernels remain
// 1-thread-per-solution since they run only once.
//
// Fixes:
//  - move *assignment* was previously unavailable (only the move constructor
//    was defined), so a Population could not be reassigned; added.
//  - allocate() now releases any previous buffers instead of leaking them
//    when called a second time.
template<typename Sol>
class Population {
public:
    Sol* d_solutions = nullptr;
    curandState* d_rng_states = nullptr;  // size = pop_size * block_size
    int size = 0;                         // number of solutions
    int rng_count = 0;                    // total RNG states

    Population() = default;

    // block_size: threads per block under the block-level architecture;
    // every thread of every block gets an independent RNG state.
    void allocate(int pop_size, int block_size = 128) {
        release();  // avoid leaking a prior allocation
        size = pop_size;
        rng_count = pop_size * block_size;
        CUDA_CHECK(cudaMalloc(&d_solutions, sizeof(Sol) * size));
        CUDA_CHECK(cudaMalloc(&d_rng_states, sizeof(curandState) * rng_count));
    }

    // Seed all RNG states (one curand_init per state).
    void init_rng(unsigned seed, int block_size = 256) {
        int grid = calc_grid_size(rng_count, block_size);
        init_curand_kernel<<<grid, block_size>>>(d_rng_states, seed, rng_count);
        CUDA_CHECK_LAST();
    }

    // Launch the encoding-appropriate initialization kernel.
    void init_population(const ProblemConfig& cfg, int block_size = 256) {
        int grid = calc_grid_size(size, block_size);

        if (cfg.row_mode == RowMode::Partition) {
            init_partition_kernel<<<grid, block_size>>>(
                d_solutions, size, cfg.dim1, cfg.total_elements, d_rng_states);
        } else if (cfg.encoding == EncodingType::Permutation && cfg.perm_repeat_count > 1) {
            // Multiset permutation (e.g. JSP): each value repeats perm_repeat_count times.
            int num_values = cfg.dim2_default / cfg.perm_repeat_count;
            init_multiset_perm_kernel<<<grid, block_size>>>(
                d_solutions, size, cfg.dim1, num_values, cfg.perm_repeat_count, d_rng_states);
        } else {
            switch (cfg.encoding) {
                case EncodingType::Permutation:
                    init_permutation_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default, d_rng_states);
                    break;
                case EncodingType::Binary:
                    init_binary_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default, d_rng_states);
                    break;
                case EncodingType::Integer:
                    init_integer_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default,
                        cfg.value_lower_bound, cfg.value_upper_bound,
                        d_rng_states);
                    break;
            }
        }
        CUDA_CHECK_LAST();
    }

    // Copy one solution back to the host (blocking).
    Sol download_solution(int idx) const {
        Sol h_sol;
        CUDA_CHECK(cudaMemcpy(&h_sol, d_solutions + idx, sizeof(Sol), cudaMemcpyDeviceToHost));
        return h_sol;
    }

    ~Population() { release(); }

    Population(const Population&) = delete;
    Population& operator=(const Population&) = delete;

    Population(Population&& o) noexcept { steal_from(o); }

    // Move assignment (previously missing): free our buffers, then take o's.
    Population& operator=(Population&& o) noexcept {
        if (this != &o) {
            release();
            steal_from(o);
        }
        return *this;
    }

private:
    // Free device buffers and reset to the empty state. Idempotent.
    void release() {
        if (d_solutions) cudaFree(d_solutions);
        if (d_rng_states) cudaFree(d_rng_states);
        d_solutions = nullptr; d_rng_states = nullptr;
        size = 0; rng_count = 0;
    }

    // Take ownership of o's buffers, leaving o empty.
    void steal_from(Population& o) {
        d_solutions = o.d_solutions; d_rng_states = o.d_rng_states;
        size = o.size; rng_count = o.rng_count;
        o.d_solutions = nullptr; o.d_rng_states = nullptr;
        o.size = 0; o.rng_count = 0;
    }
};
|
||||
125
python/cugenopt/include/core/relation_matrix.cuh
Normal file
125
python/cugenopt/include/core/relation_matrix.cuh
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
/**
|
||||
* relation_matrix.cuh - G/O 关系矩阵管理
|
||||
*
|
||||
* G[i][j]: 分组倾向(元素 i 和 j 应在同一行的倾向,对称)
|
||||
* O[i][j]: 排序倾向(元素 i 应排在 j 前面的倾向,不对称)
|
||||
*
|
||||
* 更新来源:历史最优解统计
|
||||
* 每当 host 端获取到当前 best 解,扫描所有元素对关系:
|
||||
* - 同行 → G[i][j] 增强
|
||||
* - i 在 j 前 → O[i][j] 增强
|
||||
* 使用 EMA 衰减:M[i][j] = α * M[i][j] + (1-α) * signal
|
||||
*
|
||||
* 生命周期:
|
||||
* 1. relation_matrix_create(N) — 分配 host/device 内存,初始化为 0
|
||||
* 2. relation_matrix_update(rm, sol, dim1) — 从一个解更新 G/O(host 端)
|
||||
* 3. relation_matrix_upload(rm) — 上传 h_G/h_O 到 d_G/d_O
|
||||
* 4. relation_matrix_destroy(rm) — 释放内存
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include <cstring>
|
||||
|
||||
// ============================================================
|
||||
// 创建 / 销毁
|
||||
// ============================================================
|
||||
|
||||
// Allocate host and device storage for the N x N G/O relation matrices and
// zero-initialize everything.
// Fix: the element count is now computed in size_t ((size_t)N * N) for the
// host allocations too — the previous `new float[N * N]` used int arithmetic
// and overflowed for large N, while `bytes` was already size_t.
inline RelationMatrix relation_matrix_create(int N, float decay = 0.95f) {
    RelationMatrix rm;
    rm.N = N;
    rm.decay = decay;
    rm.update_count = 0;

    const size_t cells = (size_t)N * N;
    const size_t bytes = cells * sizeof(float);

    rm.h_G = new float[cells];
    rm.h_O = new float[cells];
    memset(rm.h_G, 0, bytes);
    memset(rm.h_O, 0, bytes);

    CUDA_CHECK(cudaMalloc(&rm.d_G, bytes));
    CUDA_CHECK(cudaMalloc(&rm.d_O, bytes));
    CUDA_CHECK(cudaMemset(rm.d_G, 0, bytes));
    CUDA_CHECK(cudaMemset(rm.d_O, 0, bytes));

    return rm;
}
|
||||
|
||||
// Release host and device storage and reset the struct to an empty state.
inline void relation_matrix_destroy(RelationMatrix& rm) {
    delete[] rm.h_G;
    delete[] rm.h_O;
    CUDA_CHECK(cudaFree(rm.d_G));
    CUDA_CHECK(cudaFree(rm.d_O));
    rm.h_G = nullptr;
    rm.h_O = nullptr;
    rm.d_G = nullptr;
    rm.d_O = nullptr;
    rm.N = 0;
}
|
||||
|
||||
// ============================================================
|
||||
// 从一个解更新 G/O(host 端)
|
||||
// ============================================================
|
||||
// sol: 当前最优解(已下载到 host)
|
||||
// dim1: 实际使用的行数
|
||||
//
|
||||
// 逻辑:
|
||||
// 对 sol 中每对元素 (val_a, val_b):
|
||||
// 如果在同一行 → G[val_a][val_b] 增强
|
||||
// 如果 val_a 在 val_b 前面 → O[val_a][val_b] 增强
|
||||
//
|
||||
// 注意:元素值 val 必须在 [0, N) 范围内才有意义
|
||||
// 对于 partition 编码(VRP),元素值就是客户编号
|
||||
// 对于单行排列(TSP),元素值就是城市编号
|
||||
|
||||
// Update the G/O matrices from one (host-resident) solution, typically the
// current best. EMA semantics: decay every entry by `decay`, then reinforce
// each observed pair relation by (1 - decay) * signal, and clamp to [0, 1].
//
// For each element pair (a, b) in the solution:
//   same row           -> G[a][b] and G[b][a] reinforced (symmetric)
//   a appears before b -> O[a][b] reinforced (asymmetric)
// Element values outside [0, N) are skipped; for partition encodings (VRP)
// the values are customer IDs, for single-row permutations (TSP) city IDs.
template<typename Sol>
void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
    const int N = rm.N;
    const float alpha = rm.decay;
    const float bump = (1.0f - alpha) * 1.0f;  // signal strength = 1

    // Decay every existing entry.
    const int cells = N * N;
    for (int i = 0; i < cells; ++i) {
        rm.h_G[i] *= alpha;
        rm.h_O[i] *= alpha;
    }

    // Reinforce the pair relations observed in the solution.
    for (int r = 0; r < dim1; ++r) {
        const int sz = sol.dim2_sizes[r];
        for (int c1 = 0; c1 < sz; ++c1) {
            const int a = sol.data[r][c1];
            if (a < 0 || a >= N) continue;

            for (int c2 = c1 + 1; c2 < sz; ++c2) {
                const int b = sol.data[r][c2];
                if (b < 0 || b >= N) continue;

                rm.h_G[a * N + b] += bump;  // same row: grouping (symmetric)
                rm.h_G[b * N + a] += bump;
                rm.h_O[a * N + b] += bump;  // a precedes b: ordering
            }
        }
    }

    // Clamp to [0, 1].
    for (int i = 0; i < cells; ++i) {
        if (rm.h_G[i] > 1.0f) rm.h_G[i] = 1.0f;
        if (rm.h_O[i] > 1.0f) rm.h_O[i] = 1.0f;
    }

    rm.update_count++;
}
|
||||
|
||||
// ============================================================
// Upload to the GPU
// ============================================================

// Copy the host-side G/O matrices into their device mirrors.
inline void relation_matrix_upload(const RelationMatrix& rm) {
    const size_t bytes = sizeof(float) * (size_t)rm.N * rm.N;
    CUDA_CHECK(cudaMemcpy(rm.d_G, rm.h_G, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(rm.d_O, rm.h_O, bytes, cudaMemcpyHostToDevice));
}
|
||||
1530
python/cugenopt/include/core/solver.cuh
Normal file
1530
python/cugenopt/include/core/solver.cuh
Normal file
File diff suppressed because it is too large
Load diff
721
python/cugenopt/include/core/types.cuh
Normal file
721
python/cugenopt/include/core/types.cuh
Normal file
|
|
@ -0,0 +1,721 @@
|
|||
/**
 * types.cuh - core type definitions
 *
 * Contains: encoding types, the Solution template, ProblemConfig/SolverConfig,
 * SeqRegistry (per-sequence AOS weights), KStepConfig (multi-step execution),
 * RelationMatrix (G/O relation matrices), ProblemBase (CRTP base class)
 */

#pragma once

// ============================================================
// Compile-time constants
// ============================================================
constexpr int MAX_OBJ = 4;   // at most 4 objectives (16 bytes; not worth templating)
constexpr int MAX_SEQ = 32;  // max sequence count (~16 built-in + <=8 custom, with headroom)
constexpr int MAX_K = 3;     // max step count for multi-step execution (K=1,2,3)
// AOS weight bounds (after normalization)
constexpr float AOS_WEIGHT_FLOOR = 0.05f;  // minimum weight floor (guarantees exploration)
constexpr float AOS_WEIGHT_CAP = 0.35f;    // maximum weight cap (prevents winner-takes-all)

// ============================================================
// Enum types
// ============================================================

enum class EncodingType {
    Permutation, // permutation: elements are unique
    Binary,      // 0-1: flip is the primary operator
    Integer      // bounded integers
};

enum class RowMode {
    Single,    // dim1=1, single row (TSP/QAP/Knapsack — most problems)
    Fixed,     // dim1>1, fixed equal-length rows (JSP-Int/Schedule; SPLIT/MERGE forbidden)
    Partition  // dim1>1, elements partitioned across rows, variable row length (CVRP/VRPTW)
};

enum class ObjDir {
    Minimize,
    Maximize
};

// Multi-objective comparison mode
enum class CompareMode {
    Weighted,      // weighted sum: sum(weight[i] * obj[i]), smaller is better
    Lexicographic  // lexicographic: compare objectives by priority, earlier ones win
};

enum class MigrateStrategy {
    Ring,   // ring: each island's best -> neighbor's worst (slow spread, high diversity)
    TopN,   // global Top-N round-robin distribution (fast spread, strong convergence)
    Hybrid  // both: Top-N replaces the worst + Ring replaces the second worst
};
|
||||
|
||||
|
||||
// ============================================================
// SeqID — unified OperationSequence numbering
// ============================================================
// Each SeqID denotes one concrete search operation (atomic or multi-step).
// AOS weight tracking granularity = SeqID (every sequence has its own weight).
//
// Naming rule: SEQ_{encoding}_{operation}
// Row-level operations shared across encodings use common ids.

namespace seq {

// --- Permutation, in-row (element level) ---
constexpr int SEQ_PERM_SWAP = 0;     // swap two positions
constexpr int SEQ_PERM_REVERSE = 1;  // 2-opt (reverse a segment)
constexpr int SEQ_PERM_INSERT = 2;   // insert (move an element to a new position)
constexpr int SEQ_PERM_3OPT = 3;     // 3-opt (cut 3 edges, reconnect)

// --- Permutation, in-row (segment level) ---
constexpr int SEQ_PERM_OR_OPT = 4;   // or-opt (move k consecutive elements)

// --- Permutation, in-row (composite) ---
constexpr int SEQ_PERM_DOUBLE_SWAP = 30; // two consecutive swaps (same row)
constexpr int SEQ_PERM_TRIPLE_SWAP = 31; // three consecutive swaps (same row)

// --- Permutation, cross-row (element level) ---
constexpr int SEQ_PERM_CROSS_RELOCATE = 5; // move one element to another row
constexpr int SEQ_PERM_CROSS_SWAP = 6;     // swap one element between rows

// --- Permutation, cross-row (segment level) ---
constexpr int SEQ_PERM_SEG_RELOCATE = 7;   // move a segment to another row
constexpr int SEQ_PERM_SEG_SWAP = 8;       // swap segments between rows (2-opt*)
constexpr int SEQ_PERM_CROSS_EXCHANGE = 9; // exchange segments (order preserving)

// --- Binary, in-row (element level) ---
constexpr int SEQ_BIN_FLIP = 0; // flip one bit
constexpr int SEQ_BIN_SWAP = 1; // swap two bits

// --- Binary, in-row (segment level) ---
constexpr int SEQ_BIN_SEG_FLIP = 2; // flip k consecutive bits
constexpr int SEQ_BIN_K_FLIP = 3;   // flip k random bits simultaneously

// --- Binary, cross-row ---
constexpr int SEQ_BIN_CROSS_SWAP = 4;     // swap one bit between two rows
constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5; // swap one segment between two rows

// --- Shared: row level (encoding independent) ---
constexpr int SEQ_ROW_SWAP = 10;    // swap two rows
constexpr int SEQ_ROW_REVERSE = 11; // reverse the row order
constexpr int SEQ_ROW_SPLIT = 12;   // split one row into two
constexpr int SEQ_ROW_MERGE = 13;   // merge two rows

// --- Special ---
constexpr int SEQ_PERTURBATION = 14; // perturbation (multi-step, irreversible)

// --- Integer, in-row (element level) ---
constexpr int SEQ_INT_RANDOM_RESET = 0; // reset one position to a random value in [lb, ub]
constexpr int SEQ_INT_DELTA = 1;        // one position +-k (clamped to [lb, ub])
constexpr int SEQ_INT_SWAP = 2;         // swap the values of two positions

// --- Integer, in-row (segment level) ---
constexpr int SEQ_INT_SEG_RESET = 3; // reset k consecutive positions
constexpr int SEQ_INT_K_DELTA = 4;   // k random positions, each +-1

// --- Integer, cross-row ---
constexpr int SEQ_INT_CROSS_SWAP = 5; // swap one position between two rows

// --- LNS (large neighborhood search) ---
constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20; // shuffle a contiguous segment
constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21; // shuffle scattered random positions
constexpr int SEQ_LNS_GUIDED_REBUILD = 22;  // relation-matrix guided rebuild

} // namespace seq
|
||||
|
||||
// ============================================================
// RelationMatrix — G/O relation matrices (GPU global memory)
// ============================================================
// G[i][j]: grouping tendency of elements i and j (symmetric; larger = more
//          likely to end up in the same group)
// O[i][j]: tendency of element i to precede element j (asymmetric)
// Stored as a flat [N * N] array, row-major.
// Dense storage for small N (<200); sparsification deferred to P2.
//
// Updated: on the host, between batches.
// Consumed: in kernels, by SEQ_LNS_GUIDED_REBUILD.

struct RelationMatrix {
    float* d_G;       // G matrix on the GPU [N * N]
    float* d_O;       // O matrix on the GPU [N * N]
    float* h_G;       // G matrix on the host [N * N] (updated, then uploaded)
    float* h_O;       // O matrix on the host [N * N]
    int N;            // total number of elements
    float decay;      // EMA decay factor alpha (default 0.95)
    int update_count; // number of updates so far (cold-start detection)
};

// ============================================================
// SeqRegistry — registry of sequences available at runtime
// ============================================================
// Determined automatically from EncodingType and dim1,
// then shipped to the GPU for sample_sequence().

enum class SeqCategory : int {
    InRow = 0,    // in-row operators (swap, reverse, insert, ...)
    CrossRow = 1, // cross-row operators (cross_relocate, cross_swap, seg_relocate, ...)
    RowLevel = 2, // row-level operators (row_swap, row_reverse, split, merge)
    LNS = 3,      // large neighborhood search
};

struct SeqRegistry {
    int ids[MAX_SEQ];                // SeqIDs of the available sequences
    int count;                       // number of available sequences
    float weights[MAX_SEQ];          // current weight per sequence (normalized for sampling)
    float max_w[MAX_SEQ];            // per-sequence weight cap (0 = unset, use the global cap)
    SeqCategory categories[MAX_SEQ]; // category per sequence (for constraint-directed search)
};
|
||||
|
||||
// ============================================================
// KStepConfig — step-count selection for multi-step execution
// ============================================================
// K=1: single step (the current behavior); K=2/3: execute several
// sequences in a row before evaluating.
// This is the first layer of the two-layer weight scheme.
//
// Adaptive policy:
//   - initially K=1 has a very large weight (conservative), K>1 small
//   - K>1 yields improvements -> increase that K's weight
//   - prolonged stagnation -> reset/boost K>1 weights (escape local optima)

struct KStepConfig {
    float weights[MAX_K];  // sampling weights for K=1,2,3 (normalized)
    int stagnation_count;  // consecutive batches without improvement (reset trigger)
    int stagnation_limit;  // threshold that triggers the reset (default 5 batches)
};
|
||||
|
||||
// Build the default K-step configuration: K=1 dominates initially
// (conservative), K>1 gets a small exploration share.
// FIX: removed the stray semicolon after the function body (it formed an
// empty declaration and triggers warnings under -pedantic).
inline KStepConfig build_kstep_config() {
    KStepConfig kc;
    kc.weights[0] = 0.80f; // K=1: dominant at the start
    kc.weights[1] = 0.15f; // K=2: light exploration
    kc.weights[2] = 0.05f; // K=3: minimal exploration
    kc.stagnation_count = 0;
    kc.stagnation_limit = 5;
    return kc;
}
|
||||
|
||||
// ============================================================
// ProblemProfile — problem profile inferred from structural features
// ============================================================
// First layer: purely structural inference (no problem semantics), used to
// drive operator registration and initial weights.
// A future second layer may add finer-grained profiling (multiple
// attributes, heavy constraints, ...).

enum class ScaleClass { Small, Medium, Large };
enum class StructClass { SingleSeq, MultiFixed, MultiPartition };

struct ProblemProfile {
    EncodingType encoding;  // encoding copied from ProblemConfig
    ScaleClass scale;       // derived from the row length
    StructClass structure;  // derived from dim1 / row_mode
    float cross_row_prob;   // probability of cross-row moves
};

// classify_problem() is defined after ProblemConfig

// ============================================================
// Weight presets — driven by ScaleClass
// ============================================================

struct WeightPreset {
    float w_cubic;     // initial weight for cubic-cost operators (3-opt)
    float w_quadratic; // initial weight for quadratic-cost operators (or-opt)
    float w_lns;       // initial weight for LNS operators
    float lns_cap;     // weight cap applied to LNS operators
};
|
||||
|
||||
// Weight preset per problem scale: larger instances get the expensive
// (cubic/quadratic) operators scaled down, while the LNS share stays tiny.
// Small doubles as the fallback for any unexpected enum value.
inline WeightPreset get_weight_preset(ScaleClass scale) {
    if (scale == ScaleClass::Medium) {
        return { 0.30f, 0.70f, 0.004f, 0.01f };
    }
    if (scale == ScaleClass::Large) {
        return { 0.05f, 0.30f, 0.001f, 0.01f };
    }
    return { 0.50f, 0.80f, 0.006f, 0.01f };
}
|
||||
|
||||
// classify_problem() 和 build_seq_registry() 定义在 ProblemConfig 之后
|
||||
|
||||
// ============================================================
// Solution<D1, D2> — templated representation of a solution
// ============================================================
// D1: max row count (TSP=1, VRP<=16, Schedule<=8)
// D2: max columns per row (TSP<=64, knapsack<=32)
// Each Problem picks the smallest sufficient D1/D2 so the compiler
// generates a compact struct.

template<int D1, int D2>
struct Solution {
    static constexpr int DIM1 = D1;  // compile-time row-count bound
    static constexpr int DIM2 = D2;  // compile-time column-count bound
    int data[D1][D2];                // D1*D2*4 bytes
    int dim2_sizes[D1];              // D1*4 bytes (actual length of each row)
    float objectives[MAX_OBJ];       // 16 bytes (fixed)
    float penalty;                   // 4 bytes
};

// ============================================================
// ProblemConfig — runtime problem metadata
// ============================================================

struct ProblemConfig {
    EncodingType encoding;
    int dim1;            // rows actually used (<= D1)
    int dim2_default;    // columns actually used (<= D2)
    int num_objectives;
    ObjDir obj_dirs[MAX_OBJ];
    float obj_weights[MAX_OBJ]; // weights in Weighted mode
    // Multi-objective comparison
    CompareMode compare_mode = CompareMode::Weighted;
    int obj_priority[MAX_OBJ] = {0, 1, 2, 3}; // comparison order (indices) in Lexicographic mode
    float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f}; // lexicographic tolerance: |diff| <= tol counts as equal
    int value_lower_bound;
    int value_upper_bound;
    // v3.4: unified row mode
    RowMode row_mode = RowMode::Single; // row mode (Single/Fixed/Partition)
    float cross_row_prob = 0.0f;        // probability of cross-row moves (0 = in-row operations only)
    int total_elements = 0;             // total element count in Partition mode
    int perm_repeat_count = 1;          // repeats of each value in a permutation (1 = plain permutation, >1 = multiset permutation)
};

// ============================================================
// SolverConfig — solver parameters
// ============================================================

struct SolverConfig {
    int pop_size = 0;  // population size (0 = auto-match maximum GPU parallelism)
    int max_gen = 1000;
    float mutation_rate = 0.1f;
    unsigned seed = 42;
    bool verbose = true;
    int print_every = 100;
    // Island-model parameters
    int num_islands = 1;        // 0 = adaptive, 1 = pure hill climbing (no islands), >1 = island model
    int migrate_interval = 100; // migrate every this many generations
    MigrateStrategy migrate_strategy = MigrateStrategy::Hybrid;
    // Simulated-annealing parameters
    float sa_temp_init = 0.0f; // initial temperature (0 = SA disabled, pure hill climbing)
    float sa_alpha = 0.998f;   // cooling rate (multiplied in every generation)
    // v1.0: crossover parameters
    float crossover_rate = 0.1f; // per-generation probability of crossover (vs mutation)
    // v2.0: adaptive operator selection
    bool use_aos = false;                      // enable AOS (update operator weights between batches)
    float aos_weight_floor = AOS_WEIGHT_FLOOR; // floor, overridable at runtime
    float aos_weight_cap = AOS_WEIGHT_CAP;     // cap, overridable at runtime
    // v2.1: initial-solution strategy
    int init_oversample = 4;        // oversampling factor (1 = no pick-best sampling, i.e. purely random)
    float init_random_ratio = 0.3f; // share of purely random solutions (diversity floor)
    // v3.0: engineering usability
    float time_limit_sec = 0.0f; // time limit in seconds (0 = unlimited, run to max_gen)
    int stagnation_limit = 0;    // convergence check: reheat after this many batches without improvement (0 = disabled)
    float reheat_ratio = 0.5f;   // fraction of the initial temperature restored on reheat
    // v3.5: CUDA Graph
    bool use_cuda_graph = false; // enable CUDA Graph (reduces kernel-launch overhead)
    // v3.6: AOS update frequency
    int aos_update_interval = 10; // update AOS weights every this many batches (fewer synchronizing cudaMemcpy calls)
    // v4.0: constraint-directed + phased search
    bool use_constraint_directed = false; // adjust cross-row operator weights dynamically from the penalty ratio
    bool use_phased_search = false;       // phased search (adjust the global floor/cap by progress)
    // Phased-search parameters: three-phase thresholds
    float phase_explore_end = 0.30f;  // end of the exploration phase (progress fraction)
    float phase_refine_start = 0.70f; // start of the refinement phase (progress fraction)
    // Constraint-directed parameters
    float constraint_boost_max = 2.5f; // max multiplier applied to the cross-row operator cap under high constraint pressure
};
|
||||
|
||||
// ============================================================
// classify_problem — derive a ProblemProfile from a ProblemConfig
// ============================================================

// Purely structural inference: scale from the default row length,
// structure from the row count / row mode; cross_row_prob copied through.
inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
    ProblemProfile p;
    p.encoding = pcfg.encoding;

    const int len = pcfg.dim2_default;
    p.scale = (len <= 100) ? ScaleClass::Small
            : (len <= 250) ? ScaleClass::Medium
                           : ScaleClass::Large;

    if (pcfg.dim1 <= 1) {
        p.structure = StructClass::SingleSeq;
    } else if (pcfg.row_mode == RowMode::Partition) {
        p.structure = StructClass::MultiPartition;
    } else {
        p.structure = StructClass::MultiFixed;
    }

    p.cross_row_prob = pcfg.cross_row_prob;
    return p;
}
|
||||
|
||||
// ============================================================
// build_seq_registry — operator registration driven by ProblemProfile
// ============================================================

inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
    SeqRegistry reg;
    reg.count = 0;
    // Clear every slot first.
    for (int i = 0; i < MAX_SEQ; i++) {
        reg.ids[i] = -1; reg.weights[i] = 0.0f;
        reg.max_w[i] = 0.0f; reg.categories[i] = SeqCategory::InRow;
    }

    // Append one sequence: id, initial weight, category, optional weight cap.
    auto add = [&](int id, float w, SeqCategory cat, float cap = 0.0f) {
        if (reg.count >= MAX_SEQ) return;
        reg.ids[reg.count] = id;
        reg.weights[reg.count] = w;
        reg.max_w[reg.count] = cap;
        reg.categories[reg.count] = cat;
        reg.count++;
    };

    WeightPreset wp = get_weight_preset(prof.scale);
    bool multi_row = (prof.structure != StructClass::SingleSeq);
    float cr = prof.cross_row_prob;

    if (prof.encoding == EncodingType::Permutation) {
        // Cheap in-row moves are always registered.
        add(seq::SEQ_PERM_SWAP, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_REVERSE, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_INSERT, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_DOUBLE_SWAP, 0.5f, SeqCategory::InRow);
        add(seq::SEQ_PERM_TRIPLE_SWAP, 0.3f, SeqCategory::InRow);

        // Expensive moves get scale-dependent weights from the preset.
        add(seq::SEQ_PERM_3OPT, wp.w_cubic, SeqCategory::InRow);
        add(seq::SEQ_PERM_OR_OPT, wp.w_quadratic, SeqCategory::InRow);

        // Cross-row moves require multiple rows and cr > 0;
        // their weights scale with cr.
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_PERM_CROSS_RELOCATE, 0.6f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_CROSS_SWAP, 0.6f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_SEG_RELOCATE, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_SEG_SWAP, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_CROSS_EXCHANGE, 0.4f * cr, SeqCategory::CrossRow);
        }
        if (multi_row) {
            add(seq::SEQ_ROW_SWAP, 0.3f, SeqCategory::RowLevel);
            add(seq::SEQ_ROW_REVERSE, 0.2f, SeqCategory::RowLevel);
            // SPLIT/MERGE change row lengths -> Partition structure only.
            if (prof.structure == StructClass::MultiPartition) {
                add(seq::SEQ_ROW_SPLIT, 0.2f, SeqCategory::RowLevel);
                add(seq::SEQ_ROW_MERGE, 0.2f, SeqCategory::RowLevel);
            }
        }
        // LNS operators carry an explicit cap so they stay rare.
        add(seq::SEQ_LNS_SEGMENT_SHUFFLE, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
        add(seq::SEQ_LNS_SCATTER_SHUFFLE, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
        add(seq::SEQ_LNS_GUIDED_REBUILD, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
    }
    else if (prof.encoding == EncodingType::Binary) {
        add(seq::SEQ_BIN_FLIP, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_BIN_SWAP, 0.8f, SeqCategory::InRow);
        add(seq::SEQ_BIN_SEG_FLIP, 0.6f, SeqCategory::InRow);
        add(seq::SEQ_BIN_K_FLIP, 0.6f, SeqCategory::InRow);
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_BIN_CROSS_SWAP, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_BIN_SEG_CROSS_SWAP, 0.4f * cr, SeqCategory::CrossRow);
        }
        if (multi_row) {
            add(seq::SEQ_ROW_SWAP, 0.3f, SeqCategory::RowLevel);
            add(seq::SEQ_ROW_REVERSE, 0.2f, SeqCategory::RowLevel);
            if (prof.structure == StructClass::MultiPartition) {
                add(seq::SEQ_ROW_SPLIT, 0.2f, SeqCategory::RowLevel);
                add(seq::SEQ_ROW_MERGE, 0.2f, SeqCategory::RowLevel);
            }
        }
    }
    else if (prof.encoding == EncodingType::Integer) {
        add(seq::SEQ_INT_RANDOM_RESET, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_INT_DELTA, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_INT_SWAP, 0.8f, SeqCategory::InRow);
        add(seq::SEQ_INT_SEG_RESET, 0.6f, SeqCategory::InRow);
        add(seq::SEQ_INT_K_DELTA, 0.6f, SeqCategory::InRow);
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_INT_CROSS_SWAP, 0.5f * cr, SeqCategory::CrossRow);
        }
        if (multi_row) {
            add(seq::SEQ_ROW_SWAP, 0.3f, SeqCategory::RowLevel);
            add(seq::SEQ_ROW_REVERSE, 0.2f, SeqCategory::RowLevel);
            if (prof.structure == StructClass::MultiPartition) {
                add(seq::SEQ_ROW_SPLIT, 0.2f, SeqCategory::RowLevel);
                add(seq::SEQ_ROW_MERGE, 0.2f, SeqCategory::RowLevel);
            }
        }
    }

    // Normalize the initial weights so they sum to 1.
    float sum = 0.0f;
    for (int i = 0; i < reg.count; i++) sum += reg.weights[i];
    if (sum > 0.0f) {
        for (int i = 0; i < reg.count; i++) reg.weights[i] /= sum;
    }
    return reg;
}
|
||||
|
||||
// ============================================================
// ObjConfig — objective-comparison config shipped to the GPU (compact)
// ============================================================

struct ObjConfig {
    int num_obj;
    CompareMode mode;
    ObjDir dirs[MAX_OBJ];     // direction of each objective
    float weights[MAX_OBJ];   // weights in Weighted mode
    int priority[MAX_OBJ];    // comparison order in Lexicographic mode
    float tolerance[MAX_OBJ]; // tolerance in Lexicographic mode
};
|
||||
|
||||
// Build the compact GPU-side ObjConfig from a ProblemConfig (CPU side).
// All MAX_OBJ slots are copied, including unused trailing ones.
inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
    ObjConfig oc;
    oc.num_obj = pcfg.num_objectives;
    oc.mode = pcfg.compare_mode;
    for (int k = 0; k < MAX_OBJ; k++) {
        oc.dirs[k]      = pcfg.obj_dirs[k];
        oc.weights[k]   = pcfg.obj_weights[k];
        oc.priority[k]  = pcfg.obj_priority[k];
        oc.tolerance[k] = pcfg.obj_tolerance[k];
    }
    return oc;
}
|
||||
|
||||
// ============================================================
// SolveResult — return value of solve()
// ============================================================

enum class StopReason { MaxGen, TimeLimit, Stagnation };

template<typename Sol>
struct SolveResult {
    Sol best_solution;
    float elapsed_ms = 0.0f;                     // wall-clock time spent solving
    int generations = 0;                         // generations actually executed
    StopReason stop_reason = StopReason::MaxGen; // why the solver stopped
};
|
||||
|
||||
// ============================================================
// Objective-importance mapping — unifies Weighted / Lexicographic
// ============================================================
// Used when selecting initial solutions (weighted NSGA-II crowding +
// reserved slots for the core objective).
//   Weighted:      importance[i] = weight[i] / sum(weights)
//   Lexicographic: importance[i] = 0.5^rank[i] / sum(0.5^rank)
//     -> first priority ~57%, second ~29%, third ~14%

inline void compute_importance(const ObjConfig& oc, float* importance) {
    float total = 0.0f;
    for (int i = 0; i < oc.num_obj; i++) {
        float w;
        if (oc.mode == CompareMode::Weighted) {
            w = oc.weights[i];
        } else {
            w = 1.0f;
            for (int r = 0; r < oc.priority[i]; r++) w *= 0.5f; // 0.5^rank
        }
        importance[i] = w;
        total += w;
    }
    if (total > 0.0f) {
        for (int i = 0; i < oc.num_obj; i++)
            importance[i] /= total;
    }
}
|
||||
|
||||
// ============================================================
// Comparison utilities — Weighted / Lexicographic support
// ============================================================

// Map an objective value to "smaller is better": Maximize objectives
// are negated, Minimize objectives pass through.
__device__ __host__ inline float normalize_obj(float val, ObjDir dir) {
    if (dir == ObjDir::Maximize) return -val;
    return val;
}
|
||||
|
||||
// Core comparison: is a better than b?
template<typename Sol>
__device__ inline bool is_better(const Sol& a, const Sol& b,
                                 const ObjConfig& oc) {
    // Penalty takes precedence: a feasible solution always beats an
    // infeasible one; two infeasible solutions compare by penalty only
    // (their objectives are ignored).
    if (a.penalty <= 0.0f && b.penalty > 0.0f) return true;
    if (a.penalty > 0.0f && b.penalty <= 0.0f) return false;
    if (a.penalty > 0.0f && b.penalty > 0.0f) return a.penalty < b.penalty;

    if (oc.mode == CompareMode::Weighted) {
        // Weighted sum; normalize_obj folds in the direction
        // (Maximize objectives are negated), smaller sum wins.
        float sum_a = 0.0f, sum_b = 0.0f;
        for (int i = 0; i < oc.num_obj; i++) {
            float na = normalize_obj(a.objectives[i], oc.dirs[i]);
            float nb = normalize_obj(b.objectives[i], oc.dirs[i]);
            sum_a += oc.weights[i] * na;
            sum_b += oc.weights[i] * nb;
        }
        return sum_a < sum_b;
    } else {
        // Lexicographic: compare objectives one by one in priority order.
        for (int p = 0; p < oc.num_obj; p++) {
            int idx = oc.priority[p];
            float va = normalize_obj(a.objectives[idx], oc.dirs[idx]);
            float vb = normalize_obj(b.objectives[idx], oc.dirs[idx]);
            float diff = va - vb;
            if (diff < -oc.tolerance[idx]) return true;  // a clearly better
            if (diff > oc.tolerance[idx]) return false;  // b clearly better
            // Within tolerance -> treated as equal, move to the next objective.
        }
        return false; // equal (within tolerance) on every objective
    }
}
|
||||
|
||||
// Scalarization (used for the SA acceptance probability): returns a
// "smaller is better" scalar for a whole solution.
template<typename Sol>
__device__ __host__ inline float scalar_objective(const Sol& sol,
                                                  const ObjConfig& oc) {
    if (oc.mode != CompareMode::Weighted) {
        // Lexicographic: SA uses the first-priority objective as the scalar.
        const int first = oc.priority[0];
        return normalize_obj(sol.objectives[first], oc.dirs[first]);
    }
    float acc = 0.0f;
    for (int i = 0; i < oc.num_obj; i++)
        acc += oc.weights[i] * normalize_obj(sol.objectives[i], oc.dirs[i]);
    return acc;
}
|
||||
|
||||
// Lightweight comparison operating directly on float[] objective arrays
// (avoids copying an entire Sol). True iff new_objs beats old_objs.
__device__ inline bool obj_is_better(const float* new_objs, const float* old_objs,
                                     const ObjConfig& oc) {
    if (oc.mode != CompareMode::Weighted) {
        // Lexicographic: walk objectives in priority order; differences
        // within the per-objective tolerance count as ties.
        for (int p = 0; p < oc.num_obj; p++) {
            const int i = oc.priority[p];
            const float va = normalize_obj(new_objs[i], oc.dirs[i]);
            const float vb = normalize_obj(old_objs[i], oc.dirs[i]);
            const float diff = va - vb;
            if (diff < -oc.tolerance[i]) return true;  // clearly better
            if (diff > oc.tolerance[i]) return false;  // clearly worse
        }
        return false; // tied on every objective
    }
    float score_new = 0.0f;
    float score_old = 0.0f;
    for (int i = 0; i < oc.num_obj; i++) {
        score_new += oc.weights[i] * normalize_obj(new_objs[i], oc.dirs[i]);
        score_old += oc.weights[i] * normalize_obj(old_objs[i], oc.dirs[i]);
    }
    return score_new < score_old;
}
|
||||
|
||||
// Lightweight scalarization on a raw float[] objective array;
// smaller result is better.
__device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig& oc) {
    if (oc.mode != CompareMode::Weighted) {
        // Lexicographic: use the first-priority objective.
        const int first = oc.priority[0];
        return normalize_obj(objs[first], oc.dirs[first]);
    }
    float acc = 0.0f;
    for (int i = 0; i < oc.num_obj; i++)
        acc += oc.weights[i] * normalize_obj(objs[i], oc.dirs[i]);
    return acc;
}
|
||||
|
||||
// ============================================================
// AOSStats — adaptive-operator-selection statistics (one per block)
// ============================================================
// v3.0: granularity changed from 3 tiers to MAX_SEQ sequences.
// Records usage and improvement counts per sequence; aggregated by the
// host after each batch to update the SeqRegistry weights.

struct AOSStats {
    // Operator-layer statistics (second layer)
    int usage[MAX_SEQ];       // uses per sequence
    int improvement[MAX_SEQ]; // improvements per sequence (delta < 0 and accepted)
    // K-step-layer statistics (first layer)
    int k_usage[MAX_K];       // uses for K=1,2,3
    int k_improvement[MAX_K]; // improvements for K=1,2,3
};

// ============================================================
// ObjDef — definition of a single objective (compile-time constant)
// ============================================================

struct ObjDef {
    ObjDir dir;      // optimization direction
    float weight;    // weight in Weighted mode
    float tolerance; // tolerance in Lexicographic mode
};

// ============================================================
// HeuristicMatrix — data matrix used for heuristic initial solutions
// ============================================================

struct HeuristicMatrix {
    const float* data; // host-side N*N matrix
    int N;             // dimension
};
|
||||
|
||||
// ============================================================
// ProblemBase<Derived, D1, D2> — CRTP base class
//
// Users derive from this and provide:
//   static constexpr ObjDef OBJ_DEFS[] = {...};       — objective metadata
//   __device__ float compute_obj(int idx, ...) const; — objective dispatch
//   __device__ float compute_penalty(...) const;
//
// Convention: OBJ_DEFS and compute_obj are written next to each other,
// case N matching OBJ_DEFS[N]. NUM_OBJ is derived from sizeof(OBJ_DEFS)
// automatically, so there is nothing to keep in sync by hand.
//
// The base class provides:
//   evaluate(sol)        — walks the objective list calling compute_obj
//   fill_obj_config(cfg) — fills ProblemConfig from OBJ_DEFS
//   obj_config()         — produces an ObjConfig directly
// ============================================================

template<typename Derived, int D1_, int D2_>
struct ProblemBase {
    static constexpr int D1 = D1_;
    static constexpr int D2 = D2_;
    using Sol = Solution<D1, D2>;

    // NUM_OBJ is derived automatically from the OBJ_DEFS array.
    static constexpr int NUM_OBJ = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);

    // Automatic evaluation: compute every objective, then the penalty.
    __device__ void evaluate(Sol& sol) const {
        const auto& self = static_cast<const Derived&>(*this);
        constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
        for (int i = 0; i < n; i++)
            sol.objectives[i] = self.compute_obj(i, sol);
        sol.penalty = self.compute_penalty(sol);
    }

    // Fill the objective-related fields of a ProblemConfig from OBJ_DEFS.
    void fill_obj_config(ProblemConfig& cfg) const {
        constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
        cfg.num_objectives = n;
        for (int i = 0; i < n; i++) {
            cfg.obj_dirs[i] = Derived::OBJ_DEFS[i].dir;
            cfg.obj_weights[i] = Derived::OBJ_DEFS[i].weight;
            cfg.obj_tolerance[i] = Derived::OBJ_DEFS[i].tolerance;
            cfg.obj_priority[i] = i; // list order is the priority order
        }
    }

    // Produce an ObjConfig directly (used by the solver).
    ObjConfig obj_config() const {
        ProblemConfig pcfg;
        fill_obj_config(pcfg);
        return make_obj_config(pcfg);
    }

    // Per-block hot working-set size in global memory (bytes), used by
    // auto pop_size to estimate L2-cache pressure.
    // Default = shared_mem_bytes() (when the data lives in smem the gmem
    // working set is 0 and does not matter). Subclasses override it to
    // return the actual data size (e.g. n*n*sizeof(float) for a distance
    // matrix) when shared_mem_bytes() returns 0 (data does not fit in smem).
    size_t working_set_bytes() const {
        return static_cast<const Derived&>(*this).shared_mem_bytes();
    }

    // Optional: initialize the G/O relation matrices (prior knowledge for
    // GUIDED_REBUILD).
    //   G[i*N+j]: grouping tendency of i and j (symmetric, [0,1])
    //   O[i*N+j]: tendency of i to precede j (asymmetric, [0,1])
    // Default: nothing (all zeros); the search accumulates values from
    // good historical solutions via EMA. Override example: short distance
    // -> high G and O.
    void init_relation_matrix(float* h_G, float* h_O, int N) const {
        (void)h_G; (void)h_O; (void)N; // default: keep everything zero
    }

    // Optional: expose host-side data matrices for heuristic initial
    // solution construction. Default returns 0 (none provided); subclasses
    // override, fill `out`, and return the actual count.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        (void)out; (void)max_count;
        return 0;
    }
};
|
||||
114
python/cugenopt/include/problems/assignment.cuh
Normal file
114
python/cugenopt/include/problems/assignment.cuh
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
/**
|
||||
* assignment.cuh - 指派问题
|
||||
*
|
||||
* 继承 ProblemBase,使用 ObjDef 目标注册机制
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
|
||||
const float* d_cost;
|
||||
const float* h_cost; // host 端成本矩阵(用于 init_relation_matrix)
|
||||
int n;
|
||||
|
||||
// ---- 目标计算 ----
|
||||
__device__ float calc_total_cost(const Sol& sol) const {
|
||||
float total = 0.0f;
|
||||
const int* assign = sol.data[0];
|
||||
int size = sol.dim2_sizes[0];
|
||||
for (int i = 0; i < size; i++)
|
||||
total += d_cost[i * n + assign[i]];
|
||||
return total;
|
||||
}
|
||||
|
||||
// ---- 目标定义(OBJ_DEFS 与 compute_obj 必须一一对应)----
|
||||
static constexpr ObjDef OBJ_DEFS[] = {
|
||||
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
|
||||
};
|
||||
__device__ float compute_obj(int idx, const Sol& sol) const {
|
||||
switch (idx) {
|
||||
case 0: return calc_total_cost(sol); // OBJ_DEFS[0]
|
||||
default: return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ float compute_penalty(const Sol& sol) const {
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
ProblemConfig config() const {
|
||||
ProblemConfig cfg;
|
||||
cfg.encoding = EncodingType::Permutation;
|
||||
cfg.dim1 = 1; cfg.dim2_default = n;
|
||||
fill_obj_config(cfg);
|
||||
return cfg;
|
||||
}
|
||||
|
||||
// ---- shared memory 接口 ----
|
||||
static constexpr size_t SMEM_LIMIT = 48 * 1024;
|
||||
|
||||
size_t shared_mem_bytes() const {
|
||||
size_t need = (size_t)n * n * sizeof(float);
|
||||
return need <= SMEM_LIMIT ? need : 0;
|
||||
}
|
||||
|
||||
size_t working_set_bytes() const {
|
||||
return (size_t)n * n * sizeof(float);
|
||||
}
|
||||
|
||||
__device__ void load_shared(char* smem, int tid, int bsz) {
|
||||
float* sc = reinterpret_cast<float*>(smem);
|
||||
int total = n * n;
|
||||
for (int i = tid; i < total; i += bsz) sc[i] = d_cost[i];
|
||||
d_cost = sc;
|
||||
}
|
||||
|
||||
// Cost-based prior for the relation matrices:
//   G: tasks whose cost columns are similar across agents are treated as good
//      swap candidates — cosine similarity scaled by 0.2.
//   O: the same similarity with a smaller weight (0.05).
// NOTE(review): the original note said O should favor "j at a cheap position
// ahead of k", but the code writes the same similarity into both matrices —
// confirm intent with the author.
void init_relation_matrix(float* G, float* O, int N) const {
    if (!h_cost || N != n) return;  // no host data or size mismatch: leave G/O untouched
    // Normalize by the global max cost so the dot products stay well scaled.
    float max_c = 0.0f;
    for (int i = 0; i < N * N; i++)
        if (h_cost[i] > max_c) max_c = h_cost[i];
    if (max_c <= 0.0f) return;  // degenerate (non-positive) cost matrix

    // For each ordered task pair (j, k): cosine similarity of their cost
    // columns (every agent's cost for that task). Host-side O(N^3).
    for (int j = 0; j < N; j++)
        for (int k = 0; k < N; k++) {
            if (j == k) continue;  // diagonal left as-is — presumably pre-initialized by the caller
            float dot = 0.0f, nj = 0.0f, nk = 0.0f;
            for (int i = 0; i < N; i++) {
                float cj = h_cost[i * N + j] / max_c;
                float ck = h_cost[i * N + k] / max_c;
                dot += cj * ck;
                nj += cj * cj;
                nk += ck * ck;
            }
            float denom = sqrtf(nj) * sqrtf(nk);
            float sim = (denom > 1e-6f) ? dot / denom : 0.0f;  // guard near-zero columns
            G[j * N + k] = sim * 0.2f;
            O[j * N + k] = sim * 0.05f;
        }
}
|
||||
|
||||
// Factory: uploads the host cost matrix to the device. Keeps a borrowed
// pointer to hc, so hc must outlive the returned problem instance.
static AssignmentProblem create(const float* hc, int n) {
    AssignmentProblem prob;
    prob.n = n;
    prob.h_cost = hc;
    const size_t bytes = sizeof(float) * n * n;
    float* dc = nullptr;
    CUDA_CHECK(cudaMalloc(&dc, bytes));
    CUDA_CHECK(cudaMemcpy(dc, hc, bytes, cudaMemcpyHostToDevice));
    prob.d_cost = dc;
    return prob;
}
|
||||
|
||||
// Release device memory and drop the borrowed host pointer; safe to call
// more than once. cudaFree is wrapped in CUDA_CHECK per the project rule
// that every CUDA API call must be checked (was previously a bare call).
void destroy() {
    if (d_cost) {
        CUDA_CHECK(cudaFree(const_cast<float*>(d_cost)));
        d_cost = nullptr;
    }
    h_cost = nullptr;  // borrowed, not owned — just dropped
}
|
||||
};
|
||||
97
python/cugenopt/include/problems/bin_packing.cuh
Normal file
97
python/cugenopt/include/problems/bin_packing.cuh
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
/**
|
||||
* bin_packing.cuh - 一维装箱问题(Integer 编码 + 约束)
|
||||
*
|
||||
* N 个物品,每个重量 w[i],装入最多 B 个箱子,每个箱子容量 C。
|
||||
* 决策变量:data[0][i] ∈ [0, B-1],表示物品 i 放入的箱子编号。
|
||||
* 目标:最小化使用的箱子数。
|
||||
* 约束:每个箱子总重不超过 C,超出部分作为 penalty。
|
||||
*
|
||||
* 验证实例:8 物品 weights=[7,5,3,4,6,2,8,1], C=10, 最优=4 箱
|
||||
* 箱0={7,3}=10, 箱1={5,4,1}=10, 箱2={6,2}=8, 箱3={8}=8
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
struct BinPackingProblem : ProblemBase<BinPackingProblem, 1, 64> {
    // Capacity of the fixed-size per-thread accumulators below. Previously
    // `used[32]` / `load[32]` were indexed with any b < max_bins, overrunning
    // the local arrays whenever max_bins > 32; every bin index is now clamped.
    static constexpr int MAX_TRACKED_BINS = 32;

    const float* d_weights;  // item weights [n] (device pointer)
    int n;                   // number of items
    int max_bins;            // maximum bin count B (should be <= MAX_TRACKED_BINS)
    float capacity;          // per-bin capacity C

    // Objective 0: number of distinct bins referenced by the assignment.
    __device__ float calc_bins_used(const Sol& sol) const {
        bool used[MAX_TRACKED_BINS] = {};
        const int limit = max_bins < MAX_TRACKED_BINS ? max_bins : MAX_TRACKED_BINS;
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int b = sol.data[0][i];
            if (b >= 0 && b < limit) used[b] = true;  // out-of-range genes are ignored
        }
        int count = 0;
        for (int b = 0; b < limit; b++)
            if (used[b]) count++;
        return (float)count;
    }

    // One minimized objective (bins used). OBJ_DEFS entries and compute_obj
    // cases must stay one-to-one.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_bins_used(sol);
            default: return 0.0f;
        }
    }

    // Penalty: capacity overflow summed over bins, scaled by 10 per unit.
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        float load[MAX_TRACKED_BINS] = {};
        const int limit = max_bins < MAX_TRACKED_BINS ? max_bins : MAX_TRACKED_BINS;
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int b = sol.data[0][i];
            if (b >= 0 && b < limit)
                load[b] += d_weights[i];
        }
        for (int b = 0; b < limit; b++) {
            float over = load[b] - capacity;
            if (over > 0.0f) penalty += over * 10.0f;  // 10.0f: penalty weight per overflow unit
        }
        return penalty;
    }

    // Integer encoding: one row of n genes, each a bin id in [0, max_bins-1].
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = 1; cfg.dim2_default = n;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = max_bins - 1;
        fill_obj_config(cfg);
        return cfg;
    }

    // Dynamic shared memory needed to stage the weight vector.
    size_t shared_mem_bytes() const {
        return (size_t)n * sizeof(float);
    }

    // Cooperatively copy the weights into shared memory and repoint d_weights.
    // NOTE(review): assumes the framework barriers after load_shared before
    // any thread reads d_weights — confirm at the call site.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sw = reinterpret_cast<float*>(smem);
        for (int i = tid; i < n; i += bsz) sw[i] = d_weights[i];
        d_weights = sw;
    }

    // Factory: uploads h_weights to the device.
    static BinPackingProblem create(const float* h_weights, int n,
                                    int max_bins, float capacity) {
        BinPackingProblem prob;
        prob.n = n; prob.max_bins = max_bins; prob.capacity = capacity;
        float* dw;
        CUDA_CHECK(cudaMalloc(&dw, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(dw, h_weights, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_weights = dw;
        return prob;
    }

    // Release device memory; idempotent. cudaFree goes through CUDA_CHECK per
    // the project rule that every CUDA API call is checked.
    void destroy() {
        if (d_weights) CUDA_CHECK(cudaFree(const_cast<float*>(d_weights)));
        d_weights = nullptr;
    }
};
|
||||
79
python/cugenopt/include/problems/graph_color.cuh
Normal file
79
python/cugenopt/include/problems/graph_color.cuh
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
/**
|
||||
* graph_color.cuh - 图着色问题(Integer 编码)
|
||||
*
|
||||
* N 个节点的图,用 k 种颜色着色。
|
||||
* 决策变量:data[0][i] ∈ [0, k-1],表示节点 i 的颜色。
|
||||
* 目标:最小化冲突边数(相邻节点同色的边数)。
|
||||
*
|
||||
* 验证实例:Petersen 图(10 节点 15 边,色数=3,最优冲突=0)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
struct GraphColorProblem : ProblemBase<GraphColorProblem, 1, 64> {
    const int* d_adj;  // adjacency matrix [N*N] (1 = edge, 0 = none), device pointer
    int n;             // node count
    int k;             // number of colors

    // Objective 0: number of edges whose endpoints share a color.
    // Each undirected edge is visited once (j > i); O(size^2).
    __device__ float calc_conflicts(const Sol& sol) const {
        int conflicts = 0;
        int size = sol.dim2_sizes[0];  // expected to equal n (d_adj is indexed with n)
        for (int i = 0; i < size; i++)
            for (int j = i + 1; j < size; j++)
                if (d_adj[i * n + j] && sol.data[0][i] == sol.data[0][j])
                    conflicts++;
        return (float)conflicts;
    }

    // OBJ_DEFS entries and compute_obj cases must stay one-to-one.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_conflicts(sol);
            default: return 0.0f;
        }
    }

    // Unconstrained: any coloring is representable; quality lives in the objective.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    // Integer encoding: one row of n genes, each a color in [0, k-1].
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = 1; cfg.dim2_default = n;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = k - 1;
        fill_obj_config(cfg);
        return cfg;
    }

    // Dynamic shared memory needed to stage the adjacency matrix.
    size_t shared_mem_bytes() const {
        return (size_t)n * n * sizeof(int);
    }

    // Cooperatively copy the adjacency matrix into shared memory.
    // NOTE(review): relies on a framework barrier after load_shared — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int* sa = reinterpret_cast<int*>(smem);
        int total = n * n;
        for (int i = tid; i < total; i += bsz) sa[i] = d_adj[i];
        d_adj = sa;
    }

    // Factory: uploads the host adjacency matrix to the device.
    static GraphColorProblem create(const int* h_adj, int n, int k) {
        GraphColorProblem prob;
        prob.n = n; prob.k = k;
        int* da;
        CUDA_CHECK(cudaMalloc(&da, sizeof(int) * n * n));
        CUDA_CHECK(cudaMemcpy(da, h_adj, sizeof(int) * n * n, cudaMemcpyHostToDevice));
        prob.d_adj = da;
        return prob;
    }

    // Release device memory; idempotent. cudaFree wrapped in CUDA_CHECK per
    // the project rule that all CUDA API calls are checked (was a bare call).
    void destroy() {
        if (d_adj) CUDA_CHECK(cudaFree(const_cast<int*>(d_adj)));
        d_adj = nullptr;
    }
};
|
||||
271
python/cugenopt/include/problems/jsp.cuh
Normal file
271
python/cugenopt/include/problems/jsp.cuh
Normal file
|
|
@ -0,0 +1,271 @@
|
|||
/**
|
||||
* jsp.cuh - 车间调度问题 (Job Shop Scheduling Problem)
|
||||
*
|
||||
* J 个工件,每个工件有 O 道工序,每道工序指定机器和耗时。
|
||||
*
|
||||
* === 编码方案 A:Integer 多行(时间表编码)===
|
||||
* JSPProblem: data[j][i] = 工件 j 的第 i 道工序的开始时间
|
||||
* dim1 = num_jobs, dim2_default = num_ops
|
||||
* row_mode = Fixed(禁止 ROW_SPLIT/ROW_MERGE)
|
||||
* 每行代表一个工件的固定工序序列,行长度不可变
|
||||
*
|
||||
* === 编码方案 B:Permutation 多重集(工序排列编码)===
|
||||
* JSPPermProblem: data[0][k] = 工件编号(0..J-1),长度 J*O
|
||||
* 值 j 出现 O 次。从左到右扫描,第 t 次遇到值 j 表示工件 j 的第 t 道工序。
|
||||
* dim1 = 1, dim2_default = J*O, perm_repeat_count = O
|
||||
* 标准 Permutation 算子(swap/reverse/insert)天然保持多重集结构
|
||||
*
|
||||
* 目标:Minimize makespan(所有工件完成时间的最大值)。
|
||||
* 约束:
|
||||
* (a) 工序顺序:同一工件的工序必须按序执行
|
||||
* (b) 机器冲突:同一机器同一时刻只能处理一个工序
|
||||
*
|
||||
* 验证实例:自定义 3 工件 3 机器 (3x3),最优 makespan = 12
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
// ============================================================
|
||||
// 编码方案 A:Integer 多行(时间表编码)
|
||||
// ============================================================
|
||||
|
||||
struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
    const int* d_machine;    // machine required by each operation [J*O] (device)
    const float* d_duration; // duration of each operation [J*O] (device)
    int num_jobs;            // J
    int num_ops;             // operations per job O
    int num_machines;        // M
    int time_horizon;        // upper bound on start times

    // Makespan: latest completion among the last operation of every job.
    // data[j][i] is the (integer) start time of job j's i-th operation.
    __device__ float calc_makespan(const Sol& sol) const {
        float makespan = 0.0f;
        for (int j = 0; j < num_jobs; j++) {
            int last = num_ops - 1;
            float end = (float)sol.data[j][last] + d_duration[j * num_ops + last];
            if (end > makespan) makespan = end;
        }
        return makespan;
    }

    // OBJ_DEFS entries and compute_obj cases must stay one-to-one.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_makespan(sol);
            default: return 0.0f;
        }
    }

    // Penalty = 10 * (total precedence violation + total machine overlap):
    // (a) within a job, op i must not start before op i-1 finishes;
    // (b) two ops on the same machine must not overlap in time.
    // The machine check walks all operation pairs: O((J*O)^2).
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;

        // (a) precedence within each job
        for (int j = 0; j < num_jobs; j++) {
            for (int i = 1; i < num_ops; i++) {
                float prev_end = (float)sol.data[j][i-1] + d_duration[j * num_ops + (i-1)];
                float curr_start = (float)sol.data[j][i];
                if (curr_start < prev_end)
                    penalty += (prev_end - curr_start) * 10.0f;
            }
        }

        // (b) pairwise overlap on shared machines
        int total = num_jobs * num_ops;
        for (int a = 0; a < total; a++) {
            int ja = a / num_ops, ia = a % num_ops;
            int m_a = d_machine[a];
            float s_a = (float)sol.data[ja][ia];
            float e_a = s_a + d_duration[a];
            for (int b = a + 1; b < total; b++) {
                if (d_machine[b] != m_a) continue;
                int jb = b / num_ops, ib = b % num_ops;
                float s_b = (float)sol.data[jb][ib];
                float e_b = s_b + d_duration[b];
                float overlap = fminf(e_a, e_b) - fmaxf(s_a, s_b);
                if (overlap > 0.0f)
                    penalty += overlap * 10.0f;
            }
        }

        return penalty;
    }

    // Integer encoding: num_jobs rows of num_ops start times in
    // [0, time_horizon-1]; rows are fixed (ROW_SPLIT/ROW_MERGE forbidden).
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = num_jobs;
        cfg.dim2_default = num_ops;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = time_horizon - 1;
        cfg.row_mode = RowMode::Fixed;
        fill_obj_config(cfg);
        return cfg;
    }

    // Shared memory for the machine table and duration table, back to back.
    size_t shared_mem_bytes() const {
        int total = num_jobs * num_ops;
        return (size_t)total * (sizeof(int) + sizeof(float));
    }

    // Stage both tables into shared memory; layout: [machines | durations].
    // NOTE(review): assumes a framework barrier after load_shared — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int total = num_jobs * num_ops;
        int* sm = reinterpret_cast<int*>(smem);
        for (int i = tid; i < total; i += bsz) sm[i] = d_machine[i];
        d_machine = sm;

        float* sd = reinterpret_cast<float*>(sm + total);
        for (int i = tid; i < total; i += bsz) sd[i] = d_duration[i];
        d_duration = sd;
    }

    // Factory: uploads the machine and duration tables to the device.
    static JSPProblem create(const int* h_machine, const float* h_duration,
                             int num_jobs, int num_ops, int num_machines,
                             int time_horizon) {
        JSPProblem prob;
        prob.num_jobs = num_jobs;
        prob.num_ops = num_ops;
        prob.num_machines = num_machines;
        prob.time_horizon = time_horizon;

        int total = num_jobs * num_ops;
        int* dm;
        CUDA_CHECK(cudaMalloc(&dm, sizeof(int) * total));
        CUDA_CHECK(cudaMemcpy(dm, h_machine, sizeof(int) * total, cudaMemcpyHostToDevice));
        prob.d_machine = dm;

        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * total));
        CUDA_CHECK(cudaMemcpy(dd, h_duration, sizeof(float) * total, cudaMemcpyHostToDevice));
        prob.d_duration = dd;

        return prob;
    }

    // Release device memory; idempotent. cudaFree goes through CUDA_CHECK per
    // the project rule that every CUDA API call is checked (was a bare call).
    void destroy() {
        if (d_machine) { CUDA_CHECK(cudaFree(const_cast<int*>(d_machine))); d_machine = nullptr; }
        if (d_duration) { CUDA_CHECK(cudaFree(const_cast<float*>(d_duration))); d_duration = nullptr; }
    }
};
|
||||
|
||||
// ============================================================
|
||||
// 编码方案 B:Permutation 多重集(工序排列编码)
|
||||
// ============================================================
|
||||
// data[0] 是长度 J*O 的排列,值域 [0, J),每个值出现 O 次
|
||||
// 从左到右扫描:第 t 次遇到值 j → 安排工件 j 的第 t 道工序
|
||||
// 贪心解码:每道工序安排在"最早可行时间"(满足工序顺序 + 机器空闲)
|
||||
|
||||
struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
    // Capacity of the fixed-size decoder state below. Previously the
    // job/machine arrays were hard-coded to 8 with no bounds check, so any
    // instance with more than 8 jobs or machines silently overran the stack
    // (D2 = 64 permits e.g. 16 jobs x 4 ops). Oversized instances now return
    // the infeasible sentinel instead of corrupting memory.
    static constexpr int MAX_JOBS = 8;
    static constexpr int MAX_MACHINES = 8;
    static constexpr float INFEASIBLE = 1e9f;  // sentinel for undecodable solutions

    const int* d_machine;    // machine of each operation [J*O] (device)
    const float* d_duration; // duration of each operation [J*O] (device)
    int num_jobs;
    int num_ops;
    int num_machines;

    // Greedy decode: scan the multiset permutation left to right; the t-th
    // occurrence of value j schedules job j's t-th operation at the earliest
    // time satisfying both job precedence and machine availability.
    // Returns the resulting makespan, or INFEASIBLE for malformed input.
    __device__ float decode_and_makespan(const Sol& sol) const {
        if (num_jobs > MAX_JOBS || num_machines > MAX_MACHINES) return INFEASIBLE;
        int total = num_jobs * num_ops;
        int size = sol.dim2_sizes[0];
        if (size < total) return INFEASIBLE;

        float job_avail[MAX_JOBS];       // earliest start of each job's next op
        float mach_avail[MAX_MACHINES];  // earliest idle time of each machine
        int job_next_op[MAX_JOBS];       // next unscheduled op index per job

        for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; }
        for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f;

        float makespan = 0.0f;
        for (int k = 0; k < total; k++) {
            int j = sol.data[0][k];
            if (j < 0 || j >= num_jobs) return INFEASIBLE;
            int op = job_next_op[j];
            if (op >= num_ops) continue;  // this job is fully scheduled

            int flat = j * num_ops + op;
            int m = d_machine[flat];
            if (m < 0 || m >= num_machines) return INFEASIBLE;  // corrupt machine table
            float dur = d_duration[flat];

            // earliest feasible start = max(job precedence, machine idle time)
            float start = fmaxf(job_avail[j], mach_avail[m]);
            float end = start + dur;

            job_avail[j] = end;
            mach_avail[m] = end;
            job_next_op[j] = op + 1;

            if (end > makespan) makespan = end;
        }

        return makespan;
    }

    // OBJ_DEFS entries and compute_obj cases must stay one-to-one.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return decode_and_makespan(sol);
            default: return 0.0f;
        }
    }

    // Greedy decoding produces only feasible schedules, so penalty is 0.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    // Permutation multiset encoding: one row of J*O job ids, each job id
    // repeated num_ops times. Standard permutation operators preserve it.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = num_jobs * num_ops;
        cfg.perm_repeat_count = num_ops;
        fill_obj_config(cfg);
        return cfg;
    }

    // Shared memory for the machine table and duration table, back to back.
    size_t shared_mem_bytes() const {
        int total = num_jobs * num_ops;
        return (size_t)total * (sizeof(int) + sizeof(float));
    }

    // Stage both tables into shared memory; layout: [machines | durations].
    // NOTE(review): assumes a framework barrier after load_shared — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int total = num_jobs * num_ops;
        int* sm = reinterpret_cast<int*>(smem);
        for (int i = tid; i < total; i += bsz) sm[i] = d_machine[i];
        d_machine = sm;

        float* sd = reinterpret_cast<float*>(sm + total);
        for (int i = tid; i < total; i += bsz) sd[i] = d_duration[i];
        d_duration = sd;
    }

    // Factory: uploads the machine and duration tables to the device.
    static JSPPermProblem create(const int* h_machine, const float* h_duration,
                                 int num_jobs, int num_ops, int num_machines) {
        JSPPermProblem prob;
        prob.num_jobs = num_jobs;
        prob.num_ops = num_ops;
        prob.num_machines = num_machines;

        int total = num_jobs * num_ops;
        int* dm;
        CUDA_CHECK(cudaMalloc(&dm, sizeof(int) * total));
        CUDA_CHECK(cudaMemcpy(dm, h_machine, sizeof(int) * total, cudaMemcpyHostToDevice));
        prob.d_machine = dm;

        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * total));
        CUDA_CHECK(cudaMemcpy(dd, h_duration, sizeof(float) * total, cudaMemcpyHostToDevice));
        prob.d_duration = dd;

        return prob;
    }

    // Release device memory; idempotent. cudaFree goes through CUDA_CHECK per
    // the project rule that every CUDA API call is checked (was a bare call).
    void destroy() {
        if (d_machine) { CUDA_CHECK(cudaFree(const_cast<int*>(d_machine))); d_machine = nullptr; }
        if (d_duration) { CUDA_CHECK(cudaFree(const_cast<float*>(d_duration))); d_duration = nullptr; }
    }
};
|
||||
88
python/cugenopt/include/problems/knapsack.cuh
Normal file
88
python/cugenopt/include/problems/knapsack.cuh
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
/**
|
||||
* knapsack.cuh - 0-1 背包问题
|
||||
*
|
||||
* 继承 ProblemBase,使用 ObjDef 目标注册机制
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
    // Problem data (d_weights are item weights, not objective weights).
    const float* d_weights;  // item weights [n] (device)
    const float* d_values;   // item values  [n] (device)
    float capacity;          // knapsack capacity
    int n;                   // item count

    // Objective 0 (maximized): total value of the selected items.
    __device__ float calc_total_value(const Sol& sol) const {
        float tv = 0.0f;
        const int* sel = sol.data[0];
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            if (sel[i]) tv += d_values[i];
        return tv;
    }

    // OBJ_DEFS entries and compute_obj cases must stay one-to-one.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Maximize, 1.0f, 0.0f}, // case 0: calc_total_value
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_value(sol); // OBJ_DEFS[0]
            default: return 0.0f;
        }
    }

    // Penalty: selected weight in excess of capacity (0 when feasible).
    __device__ float compute_penalty(const Sol& sol) const {
        float tw = 0.0f;
        const int* sel = sol.data[0];
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            if (sel[i]) tw += d_weights[i];
        float over = tw - capacity;
        return (over > 0.0f) ? over : 0.0f;
    }

    // Binary encoding: one row of n 0/1 selection genes.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Binary;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // ---- shared memory interface ----
    // Weights and values staged back to back: [w0..w(n-1) | v0..v(n-1)].
    size_t shared_mem_bytes() const {
        return 2 * (size_t)n * sizeof(float);
    }

    // NOTE(review): assumes a framework barrier after load_shared — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sw = reinterpret_cast<float*>(smem);
        float* sv = sw + n;
        for (int i = tid; i < n; i += bsz) { sw[i] = d_weights[i]; sv[i] = d_values[i]; }
        d_weights = sw;
        d_values = sv;
    }

    // Factory: uploads weights and values to the device.
    static KnapsackProblem create(const float* hw, const float* hv, int n, float cap) {
        KnapsackProblem prob;
        prob.n = n; prob.capacity = cap;
        float *dw, *dv;
        CUDA_CHECK(cudaMalloc(&dw, sizeof(float)*n));
        CUDA_CHECK(cudaMalloc(&dv, sizeof(float)*n));
        CUDA_CHECK(cudaMemcpy(dw, hw, sizeof(float)*n, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dv, hv, sizeof(float)*n, cudaMemcpyHostToDevice));
        prob.d_weights = dw; prob.d_values = dv;
        return prob;
    }

    // Release device memory; idempotent. cudaFree calls go through CUDA_CHECK
    // per the project rule that every CUDA API call is checked (were bare).
    void destroy() {
        if (d_weights) CUDA_CHECK(cudaFree(const_cast<float*>(d_weights)));
        if (d_values) CUDA_CHECK(cudaFree(const_cast<float*>(d_values)));
        d_weights = nullptr; d_values = nullptr;
    }
};
|
||||
83
python/cugenopt/include/problems/load_balance.cuh
Normal file
83
python/cugenopt/include/problems/load_balance.cuh
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
* load_balance.cuh - 离散负载均衡问题(Integer 编码验证)
|
||||
*
|
||||
* N 个任务分配到 M 台机器,每个任务有一个处理时间 p[i]。
|
||||
* 决策变量:data[0][i] ∈ [0, M-1],表示任务 i 分配到哪台机器。
|
||||
* 目标:最小化 makespan(最大机器负载)。
|
||||
*
|
||||
* 已知 NP-hard(等价于 multiprocessor scheduling / load balancing)。
|
||||
* LPT(最长处理时间优先)贪心可得 4/3 近似。
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
    // Capacity of the fixed-size load accumulator below. Machine indices are
    // now clamped against it, so m > 32 can no longer overrun the local array
    // (previously the final max-scan read load[j] for all j < m unchecked).
    static constexpr int MAX_TRACKED_MACHINES = 32;

    const float* d_proc_time;  // task processing times [N] (device)
    int n;                     // task count
    int m;                     // machine count (should be <= MAX_TRACKED_MACHINES)

    // Makespan: maximum total load over all machines.
    __device__ float calc_makespan(const Sol& sol) const {
        float load[MAX_TRACKED_MACHINES] = {};
        const int limit = m < MAX_TRACKED_MACHINES ? m : MAX_TRACKED_MACHINES;
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int machine = sol.data[0][i];
            if (machine >= 0 && machine < limit)
                load[machine] += d_proc_time[i];
        }
        float max_load = 0.0f;
        for (int j = 0; j < limit; j++)
            if (load[j] > max_load) max_load = load[j];
        return max_load;
    }

    // OBJ_DEFS entries and compute_obj cases must stay one-to-one.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: makespan
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_makespan(sol);
            default: return 0.0f;
        }
    }

    // Unconstrained: every assignment of tasks to machines is feasible.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    // Integer encoding: one row of n genes, each a machine id in [0, m-1].
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = 1; cfg.dim2_default = n;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = m - 1;
        fill_obj_config(cfg);
        return cfg;
    }

    // Dynamic shared memory needed to stage the processing-time vector.
    size_t shared_mem_bytes() const {
        return (size_t)n * sizeof(float);
    }

    // NOTE(review): assumes a framework barrier after load_shared — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sp = reinterpret_cast<float*>(smem);
        for (int i = tid; i < n; i += bsz) sp[i] = d_proc_time[i];
        d_proc_time = sp;
    }

    // Factory: uploads the processing-time vector to the device.
    static LoadBalanceProblem create(const float* h_proc_time, int n, int m) {
        LoadBalanceProblem prob;
        prob.n = n; prob.m = m;
        float* dp;
        CUDA_CHECK(cudaMalloc(&dp, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(dp, h_proc_time, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_proc_time = dp;
        return prob;
    }

    // Release device memory; idempotent. cudaFree goes through CUDA_CHECK per
    // the project rule that every CUDA API call is checked (was a bare call).
    void destroy() {
        if (d_proc_time) CUDA_CHECK(cudaFree(const_cast<float*>(d_proc_time)));
        d_proc_time = nullptr;
    }
};
|
||||
84
python/cugenopt/include/problems/qap.cuh
Normal file
84
python/cugenopt/include/problems/qap.cuh
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
/**
|
||||
* qap.cuh - 二次分配问题 (Quadratic Assignment Problem)
|
||||
*
|
||||
* N 个设施分配到 N 个位置(排列编码)。
|
||||
* 决策变量:data[0][i] = 设施 i 分配到的位置。
|
||||
* 目标:Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
|
||||
*
|
||||
* 验证实例:自定义 5x5
|
||||
* flow: 设施间的物流量
|
||||
* dist: 位置间的距离
|
||||
* 已知最优 = 58
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
    const float* d_flow;  // flow matrix [N*N] (device)
    const float* d_dist;  // distance matrix [N*N] (device)
    int n;                // facilities == locations

    // Objective 0: sum over all ordered facility pairs of
    // flow[i][j] * dist[loc(i)][loc(j)], where loc(i) = sol.data[0][i]. O(n^2).
    __device__ float calc_cost(const Sol& sol) const {
        float cost = 0.0f;
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            for (int j = 0; j < size; j++)
                cost += d_flow[i * n + j] * d_dist[sol.data[0][i] * n + sol.data[0][j]];
        return cost;
    }

    // OBJ_DEFS entries and compute_obj cases must stay one-to-one.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_cost(sol);
            default: return 0.0f;
        }
    }

    // Permutation encoding: every solution is a valid assignment -> no penalty.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    // Permutation encoding: one row of n location indices.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // Shared memory for both matrices staged back to back: [flow | dist].
    size_t shared_mem_bytes() const {
        return 2 * (size_t)n * n * sizeof(float);
    }

    // NOTE(review): assumes a framework barrier after load_shared — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sf = reinterpret_cast<float*>(smem);
        float* sd = sf + n * n;
        int total = n * n;
        for (int i = tid; i < total; i += bsz) { sf[i] = d_flow[i]; sd[i] = d_dist[i]; }
        d_flow = sf;
        d_dist = sd;
    }

    // Factory: uploads flow and distance matrices to the device.
    static QAPProblem create(const float* h_flow, const float* h_dist, int n) {
        QAPProblem prob;
        prob.n = n;
        float *df, *dd;
        CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_flow = df; prob.d_dist = dd;
        return prob;
    }

    // Release device memory; idempotent. cudaFree calls go through CUDA_CHECK
    // per the project rule that every CUDA API call is checked (were bare).
    void destroy() {
        if (d_flow) CUDA_CHECK(cudaFree(const_cast<float*>(d_flow)));
        if (d_dist) CUDA_CHECK(cudaFree(const_cast<float*>(d_dist)));
        d_flow = nullptr; d_dist = nullptr;
    }
};
|
||||
101
python/cugenopt/include/problems/schedule.cuh
Normal file
101
python/cugenopt/include/problems/schedule.cuh
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
/**
|
||||
* schedule.cuh - 排班问题
|
||||
*
|
||||
* 继承 ProblemBase,使用 ObjDef 目标注册机制
|
||||
* 2 个目标:总成本(min)+ 不公平度(min,权重更高)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
    const float* d_cost;       // per (day, employee) assignment cost [days*emps] (device)
    int days, emps, required;  // schedule length, staff count, staff required per day

    // ---- Objective computation ----
    // Objective 0: total cost of every scheduled (day, employee) cell.
    __device__ float calc_total_cost(const Sol& sol) const {
        float total = 0.0f;
        for (int d = 0; d < days; d++)
            for (int e = 0; e < emps; e++)
                if (sol.data[d][e]) total += d_cost[d * emps + e];
        return total;
    }

    // Objective 1: unfairness = (max workdays) - (min workdays) over employees.
    __device__ float calc_unfairness(const Sol& sol) const {
        int workdays[D2];  // D2 = base-class column bound; expected >= emps
        for (int e = 0; e < emps; e++) workdays[e] = 0;
        for (int d = 0; d < days; d++)
            for (int e = 0; e < emps; e++)
                if (sol.data[d][e]) workdays[e]++;
        int max_w = 0, min_w = days;
        for (int e = 0; e < emps; e++) {
            if (workdays[e] > max_w) max_w = workdays[e];
            if (workdays[e] < min_w) min_w = workdays[e];
        }
        return (float)(max_w - min_w);
    }

    // ---- Objective definitions (entries and compute_obj cases one-to-one) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
        {ObjDir::Minimize, 5.0f, 0.0f}, // case 1: calc_unfairness (weighted higher)
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_cost(sol); // OBJ_DEFS[0]
            case 1: return calc_unfairness(sol); // OBJ_DEFS[1]
            default: return 0.0f;
        }
    }

    // Penalty: |staff on duty - required| summed over all days.
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        for (int d = 0; d < days; d++) {
            int count = 0;
            for (int e = 0; e < emps; e++)
                if (sol.data[d][e]) count++;
            int diff = count - required;
            penalty += (diff > 0) ? (float)diff : (float)(-diff);
        }
        return penalty;
    }

    // Binary encoding: days rows x emps columns, fixed row lengths.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Binary;
        cfg.dim1 = days; cfg.dim2_default = emps;
        cfg.row_mode = RowMode::Fixed;
        fill_obj_config(cfg);
        return cfg;
    }

    // evaluate_move is deliberately NOT overridden: base-class full re-evaluation.

    // ---- shared memory interface ----
    size_t shared_mem_bytes() const {
        return (size_t)days * emps * sizeof(float);
    }

    // NOTE(review): assumes a framework barrier after load_shared — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sc = reinterpret_cast<float*>(smem);
        int total = days * emps;
        for (int i = tid; i < total; i += bsz) sc[i] = d_cost[i];
        d_cost = sc;
    }

    // Factory: uploads the host cost matrix to the device.
    static ScheduleProblem create(const float* hc, int days, int emps, int req) {
        ScheduleProblem prob;
        prob.days = days; prob.emps = emps; prob.required = req;
        float* dc;
        CUDA_CHECK(cudaMalloc(&dc, sizeof(float)*days*emps));
        CUDA_CHECK(cudaMemcpy(dc, hc, sizeof(float)*days*emps, cudaMemcpyHostToDevice));
        prob.d_cost = dc;
        return prob;
    }

    // Release device memory; idempotent. cudaFree goes through CUDA_CHECK per
    // the project rule that every CUDA API call is checked (was a bare call).
    void destroy() {
        if (d_cost) { CUDA_CHECK(cudaFree(const_cast<float*>(d_cost))); d_cost = nullptr; }
    }
};
|
||||
110
python/cugenopt/include/problems/tsp.cuh
Normal file
110
python/cugenopt/include/problems/tsp.cuh
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
/**
|
||||
* tsp.cuh - TSP 问题定义
|
||||
*
|
||||
* 继承 ProblemBase,使用 ObjDef 目标注册机制
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
    // Problem data
    const float* d_dist;  // distance matrix [n*n] (device)
    const float* h_dist;  // host-side distance matrix (borrowed; used by init_relation_matrix)
    int n;                // city count

    // ---- Objective computation ----
    // Closed-tour length: consecutive leg distances, wrapping at the end.
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        const int* route = sol.data[0];
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            total += d_dist[route[i] * n + route[(i + 1) % size]];
        return total;
    }

    // ---- Objective definitions (entries and compute_obj cases one-to-one) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol); // OBJ_DEFS[0]
            default: return 0.0f;
        }
    }

    // TSP under permutation encoding is unconstrained.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    // ---- config (encoding/shape; objectives filled by the base class) ----
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // ---- shared memory interface ----
    static constexpr size_t SMEM_LIMIT = 48 * 1024;  // classic 48 KB/block cap

    // Request shared memory for the distance matrix, or 0 when it does not fit.
    size_t shared_mem_bytes() const {
        size_t need = (size_t)n * n * sizeof(float);
        return need <= SMEM_LIMIT ? need : 0;
    }

    // Actual hot-data size, independent of shared-memory placement.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }

    // NOTE(review): assumes a framework barrier after load_shared — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sd = reinterpret_cast<float*>(smem);
        int total = n * n;
        for (int i = tid; i < total; i += bsz)
            sd[i] = d_dist[i];
        d_dist = sd;
    }

    // Distance prior: closer city pairs get higher G/O scores.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;  // no host data or size mismatch: leave G/O untouched
        float max_d = 0.0f;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                if (h_dist[i * N + j] > max_d) max_d = h_dist[i * N + j];
        if (max_d <= 0.0f) return;  // degenerate matrix
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                if (i == j) continue;  // diagonal left as-is
                float proximity = 1.0f - h_dist[i * N + j] / max_d;
                G[i * N + j] = proximity * 0.3f;
                O[i * N + j] = proximity * 0.1f;
            }
    }

    // Expose the host distance matrix as a heuristic matrix (at most one).
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (max_count < 1 || !h_dist) return 0;
        out[0] = {h_dist, n};
        return 1;
    }

    // Factory: uploads the distance matrix. Keeps a borrowed host pointer,
    // so h_dist_ptr must outlive this problem instance.
    static TSPProblem create(const float* h_dist_ptr, int n) {
        TSPProblem prob;
        prob.n = n;
        prob.h_dist = h_dist_ptr;
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_dist = dd;
        return prob;
    }

    // Release device memory and drop the borrowed host pointer; idempotent.
    // cudaFree goes through CUDA_CHECK per the project rule that every CUDA
    // API call is checked (was a bare call).
    void destroy() {
        if (d_dist) { CUDA_CHECK(cudaFree(const_cast<float*>(d_dist))); d_dist = nullptr; }
        h_dist = nullptr;
    }
};
|
||||
107
python/cugenopt/include/problems/tsp_large.cuh
Normal file
107
python/cugenopt/include/problems/tsp_large.cuh
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
/**
|
||||
* tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市)
|
||||
*
|
||||
* 继承 ProblemBase,逻辑与 tsp.cuh 一致,仅 D2 上限不同
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
    const float* d_dist;  // device distance matrix [n*n]
    const float* h_dist;  // host distance matrix (kept for priors/heuristics)
    int n;                // number of cities

    // ---- objective ----
    // Length of the closed tour stored in row 0 of the solution.
    __device__ float calc_total_distance(const Sol& sol) const {
        const int* tour = sol.data[0];
        const int len = sol.dim2_sizes[0];
        float sum = 0.0f;
        for (int k = 0; k < len; k++) {
            const int from = tour[k];
            const int to = tour[(k + 1) % len];  // wrap back to the start
            sum += d_dist[from * n + to];
        }
        return sum;
    }

    // ---- objective registry (OBJ_DEFS[i] must match compute_obj case i) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},  // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        if (idx == 0) return calc_total_distance(sol);
        return 0.0f;
    }

    // Pure permutation problem — no constraints, hence no penalty.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    // Single-row permutation encoding of length n.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    static constexpr size_t SMEM_LIMIT = 48 * 1024;

    // Request the whole matrix in shared memory, or nothing if it won't fit.
    size_t shared_mem_bytes() const {
        const size_t need = working_set_bytes();
        return (need > SMEM_LIMIT) ? 0 : need;
    }

    // Actual size of the distance matrix (whether or not it fits in smem).
    size_t working_set_bytes() const {
        const size_t side = static_cast<size_t>(n);
        return side * side * sizeof(float);
    }

    // Block-stride cooperative copy into shared memory; repoints d_dist at the
    // shared copy (per-thread struct state).
    // NOTE(review): assumes the caller issues a block barrier before the first
    // read through d_dist — confirm at the call site.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* shared_dist = reinterpret_cast<float*>(smem);
        const int count = n * n;
        int i = tid;
        while (i < count) {
            shared_dist[i] = d_dist[i];
            i += bsz;
        }
        d_dist = shared_dist;
    }

    // Distance prior: nearby city pairs get higher G/O scores.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (h_dist == nullptr || N != n) return;
        float max_d = 0.0f;  // normalization factor
        for (int k = 0; k < N * N; k++)
            if (h_dist[k] > max_d) max_d = h_dist[k];
        if (max_d <= 0.0f) return;
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                const float p = 1.0f - h_dist[i * N + j] / max_d;
                G[i * N + j] = 0.3f * p;  // grouping prior, deliberately weak
                O[i * N + j] = 0.1f * p;  // symmetric ordering hint
            }
        }
    }

    // Single heuristic matrix: the host distance matrix. Returns 0 or 1.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (h_dist == nullptr || max_count < 1) return 0;
        out[0] = {h_dist, n};
        return 1;
    }

    // Factory: retain the host pointer and upload a device copy of the matrix.
    static TSPLargeProblem create(const float* h_dist_ptr, int n) {
        TSPLargeProblem prob;
        prob.n = n;
        prob.h_dist = h_dist_ptr;
        const size_t bytes = sizeof(float) * n * n;
        float* dev = nullptr;
        CUDA_CHECK(cudaMalloc(&dev, bytes));
        CUDA_CHECK(cudaMemcpy(dev, h_dist_ptr, bytes, cudaMemcpyHostToDevice));
        prob.d_dist = dev;
        return prob;
    }

    // Free the device matrix; the host pointer is not owned, only forgotten.
    void destroy() {
        if (d_dist) {
            cudaFree(const_cast<float*>(d_dist));
            d_dist = nullptr;
        }
        h_dist = nullptr;
    }
};
|
||||
99
python/cugenopt/include/problems/tsp_xlarge.cuh
Normal file
99
python/cugenopt/include/problems/tsp_xlarge.cuh
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
/**
|
||||
* tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市)
|
||||
*
|
||||
* 继承 ProblemBase,逻辑与 tsp_large.cuh 一致,D2=512
|
||||
* 注意:距离矩阵 512×512×4B = 1MB,远超 48KB shared memory
|
||||
* 因此 shared_mem_bytes() 返回 0,距离矩阵留在 global memory
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
    const float* d_dist;  // device distance matrix [n*n]
    const float* h_dist;  // host distance matrix (used by init_relation_matrix)
    int n;                // number of cities

    // Closed-tour length of the single permutation row.
    __device__ float calc_total_distance(const Sol& sol) const {
        const int* tour = sol.data[0];
        const int len = sol.dim2_sizes[0];
        float sum = 0.0f;
        for (int k = 0; k < len; k++)
            sum += d_dist[tour[k] * n + tour[(k + 1) % len]];
        return sum;
    }

    // Objective registry — OBJ_DEFS[i] must match compute_obj case i.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},  // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        if (idx == 0) return calc_total_distance(sol);
        return 0.0f;
    }

    // Unconstrained permutation problem — no penalty term.
    __device__ float compute_penalty(const Sol& sol) const { return 0.0f; }

    // Single-row permutation encoding of length n.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // A 512 x 512 float matrix is ~1 MB — far beyond the 48 KB shared-memory
    // budget, so the distance matrix always stays in global memory.
    size_t shared_mem_bytes() const { return 0; }
    __device__ void load_shared(char*, int, int) {}

    // Byte footprint of the distance matrix.
    size_t working_set_bytes() const {
        const size_t side = static_cast<size_t>(n);
        return side * side * sizeof(float);
    }

    // Distance prior: nearby city pairs get higher G/O scores.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (h_dist == nullptr || N != n) return;
        float max_d = 0.0f;  // normalization factor
        for (int k = 0; k < N * N; k++)
            if (h_dist[k] > max_d) max_d = h_dist[k];
        if (max_d <= 0.0f) return;

        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                const float p = 1.0f - h_dist[i * N + j] / max_d;
                G[i * N + j] = 0.3f * p;  // keep the initial signal weak; EMA refines it
                O[i * N + j] = 0.1f * p;  // symmetric ordering hint
            }
        }
    }

    // Single heuristic matrix: the host distance matrix. Returns 0 or 1.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (h_dist == nullptr || max_count < 1) return 0;
        out[0] = {h_dist, n};
        return 1;
    }

    // Factory: retain the host pointer and upload a device copy of the matrix.
    static TSPXLargeProblem create(const float* h_dist_ptr, int n) {
        TSPXLargeProblem prob;
        prob.n = n;
        prob.h_dist = h_dist_ptr;  // kept for init_relation_matrix
        const size_t bytes = sizeof(float) * n * n;
        float* dev = nullptr;
        CUDA_CHECK(cudaMalloc(&dev, bytes));
        CUDA_CHECK(cudaMemcpy(dev, h_dist_ptr, bytes, cudaMemcpyHostToDevice));
        prob.d_dist = dev;
        return prob;
    }

    // Free the device matrix; the host pointer is not owned, only forgotten.
    void destroy() {
        if (d_dist) {
            cudaFree(const_cast<float*>(d_dist));
            d_dist = nullptr;
        }
        h_dist = nullptr;
    }
};
|
||||
184
python/cugenopt/include/problems/vrp.cuh
Normal file
184
python/cugenopt/include/problems/vrp.cuh
Normal file
|
|
@ -0,0 +1,184 @@
|
|||
/**
|
||||
* vrp.cuh - 容量约束车辆路径问题 (CVRP)
|
||||
*
|
||||
* 继承 ProblemBase,使用 ObjDef 目标注册机制
|
||||
* 多行编码(D1=K 条路线,分区初始化 + 跨行算子)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
#include "gpu_cache.cuh"
|
||||
|
||||
struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
    // GPU-resident data
    const float* d_dist;    // distance matrix, (n+1) x (n+1), including the depot
    const float* d_demand;  // per-customer demand [n]
    const float* h_dist;    // host-side distance matrix incl. depot (used by init_relation_matrix)
    int n;                  // number of customers (depot excluded)
    int stride;             // row stride of the distance matrix = n + 1
    float capacity;         // vehicle capacity (overloads are penalized)
    int num_vehicles;       // number of route rows in the solution encoding
    int max_vehicles;       // soft cap on non-empty routes (excess penalized)
    GpuCache cache;         // optional route-distance memo cache

    // ---- objective computation ----
    // Distance of one route: depot -> route[0] -> ... -> route[size-1] -> depot.
    // Customer indices in `route` are 0-based; matrix node 0 is the depot, so
    // +1 maps a customer index to its matrix row/column.
    __device__ float compute_route_dist(const int* route, int size) const {
        if (size == 0) return 0.0f;
        float dist = 0.0f;
        int prev = 0;
        for (int j = 0; j < size; j++) {
            int node = route[j] + 1;
            dist += d_dist[prev * stride + node];
            prev = node;
        }
        dist += d_dist[prev * stride + 0];  // return leg to the depot
        return dist;
    }

    // Route distance with optional memoization: hash the route, try the cache,
    // recompute on a miss. Hit/miss counters are updated with atomics because
    // many threads share the cache concurrently.
    __device__ float eval_route(const int* route, int size) const {
        if (size == 0) return 0.0f;
        if (!cache.keys) return compute_route_dist(route, size);  // cache disabled

        uint64_t key = route_hash(route, size);
        float dist;
        if (cache_lookup(cache, key, dist)) {
            atomicAdd(cache.d_hits, 1);
            return dist;
        }
        dist = compute_route_dist(route, size);
        cache_insert(cache, key, dist);
        atomicAdd(cache.d_misses, 1);
        return dist;
    }

    // Sum of all route distances across the fleet.
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        for (int r = 0; r < num_vehicles; r++)
            total += eval_route(sol.data[r], sol.dim2_sizes[r]);
        return total;
    }

    // ---- objective registry (OBJ_DEFS[i] must match compute_obj case i) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol); // OBJ_DEFS[0]
            default: return 0.0f;
        }
    }

    // Constraint penalties:
    //   - capacity: 100 per unit of overload on each route
    //   - fleet size: 1000 per non-empty route beyond max_vehicles
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        int active = 0;  // number of non-empty routes
        for (int r = 0; r < num_vehicles; r++) {
            int size = sol.dim2_sizes[r];
            if (size == 0) continue;
            active++;
            float load = 0.0f;
            for (int j = 0; j < size; j++)
                load += d_demand[sol.data[r][j]];
            if (load > capacity)
                penalty += (load - capacity) * 100.0f;
        }
        if (active > max_vehicles)
            penalty += (float)(active - max_vehicles) * 1000.0f;
        return penalty;
    }

    // Multi-row permutation encoding: num_vehicles rows that together
    // partition the n customers; rows start empty (dim2_default = 0).
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = num_vehicles;
        cfg.dim2_default = 0;
        fill_obj_config(cfg);
        cfg.cross_row_prob = 0.3f;
        cfg.row_mode = RowMode::Partition;
        cfg.total_elements = n;
        return cfg;
    }

    // ---- shared memory interface ----
    static constexpr size_t SMEM_LIMIT = 48 * 1024;

    // Request distance matrix + demand vector in shared memory if both fit,
    // otherwise request nothing and keep them in global memory.
    size_t shared_mem_bytes() const {
        size_t dist_bytes = (size_t)stride * stride * sizeof(float);
        size_t demand_bytes = (size_t)n * sizeof(float);
        size_t total = dist_bytes + demand_bytes;
        return total <= SMEM_LIMIT ? total : 0;
    }

    // Combined byte footprint of the distance matrix and demand vector.
    size_t working_set_bytes() const {
        return (size_t)stride * stride * sizeof(float) + (size_t)n * sizeof(float);
    }

    // Block-stride cooperative copy of both tables into shared memory, then
    // repoint the device pointers at the shared copies (per-thread state).
    // Layout: [distance matrix | demand vector].
    // NOTE(review): no __syncthreads() here — assumes the caller issues a
    // block barrier before the first read; confirm at the call site.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sd = reinterpret_cast<float*>(smem);
        int dist_size = stride * stride;
        for (int i = tid; i < dist_size; i += bsz) sd[i] = d_dist[i];
        d_dist = sd;
        float* sdem = sd + dist_size;  // demand vector follows the matrix
        for (int i = tid; i < n; i += bsz) sdem[i] = d_demand[i];
        d_demand = sdem;
    }

    // Turn on the route-distance cache with the given capacity.
    void enable_cache(int cap = 65536) { cache = GpuCache::allocate(cap); }
    void print_cache_stats() const { cache.print_stats(); }

    // Distance prior: nearby customers get higher G/O scores.
    // h_dist includes the depot (stride x stride); relation-matrix index i in
    // 0..n-1 corresponds to distance-matrix node i+1.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        float max_d = 0.0f;  // normalization factor over customer pairs
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                float d = h_dist[(i + 1) * stride + (j + 1)]; // skip the depot row/col
                if (d > max_d) max_d = d;
            }
        if (max_d <= 0.0f) return;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                float d = h_dist[(i + 1) * stride + (j + 1)];
                float proximity = 1.0f - d / max_d;
                G[i * N + j] = proximity * 0.3f;  // weak grouping prior (EMA headroom)
                O[i * N + j] = proximity * 0.1f;  // weak, symmetric ordering hint
            }
    }

    // Factory: upload the distance matrix (incl. depot) and demand vector to
    // the device; the cache starts disabled. h_dist_ptr must stay valid for
    // the problem's lifetime (init_relation_matrix reads it).
    static VRPProblem create(const float* h_dist_ptr, const float* h_demand,
                             int n, float capacity,
                             int num_vehicles, int max_vehicles) {
        VRPProblem prob;
        prob.n = n;
        prob.stride = n + 1;
        prob.capacity = capacity;
        prob.num_vehicles = num_vehicles;
        prob.max_vehicles = max_vehicles;
        prob.cache = GpuCache::disabled();
        prob.h_dist = h_dist_ptr;

        int n_nodes = n + 1;  // customers + depot
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes));
        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n_nodes * n_nodes, cudaMemcpyHostToDevice));
        prob.d_dist = dd;

        float* ddem;
        CUDA_CHECK(cudaMalloc(&ddem, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(ddem, h_demand, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_demand = ddem;

        return prob;
    }

    // Free device buffers and the cache; the host pointer is only forgotten.
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        if (d_demand) { cudaFree(const_cast<float*>(d_demand)); d_demand = nullptr; }
        h_dist = nullptr;
        cache.destroy();
    }
};
|
||||
192
python/cugenopt/include/problems/vrptw.cuh
Normal file
192
python/cugenopt/include/problems/vrptw.cuh
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
/**
|
||||
* vrptw.cuh - 带时间窗的车辆路径问题 (VRPTW)
|
||||
*
|
||||
* 在 CVRP 基础上增加时间窗约束。
|
||||
* 编码:Perm 多行分区(同 CVRP),data[r][j] = 路线 r 的第 j 个客户。
|
||||
* 目标:Minimize 总距离。
|
||||
* 约束:(a) 容量约束, (b) 时间窗约束(到达时间必须 ≤ latest,早到需等待)。
|
||||
*
|
||||
* 验证实例:8 客户 3 车, 手工设计坐标+时间窗, 确保有已知可行解。
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
    const float* d_dist;     // distance matrix [(n+1)*(n+1)] (includes depot)
    const float* d_demand;   // per-customer demand [n]
    const float* d_earliest; // earliest service time [n+1] (includes depot)
    const float* d_latest;   // latest service time [n+1] (includes depot)
    const float* d_service;  // service duration [n+1] (includes depot)
    int n;                   // number of customers (depot excluded)
    int stride;              // row stride of the distance matrix = n + 1
    float capacity;          // vehicle capacity
    int num_vehicles;        // number of route rows in the solution encoding
    int max_vehicles;        // soft cap on non-empty routes (excess penalized)

    // Distance of one route: depot -> route[0] -> ... -> route[size-1] -> depot.
    // Customer indices are 0-based; +1 maps a customer to its matrix node
    // (node 0 is the depot).
    __device__ float compute_route_dist(const int* route, int size) const {
        if (size == 0) return 0.0f;
        float dist = 0.0f;
        int prev = 0;
        for (int j = 0; j < size; j++) {
            int node = route[j] + 1;
            dist += d_dist[prev * stride + node];
            prev = node;
        }
        dist += d_dist[prev * stride + 0];  // return leg to the depot
        return dist;
    }

    // Sum of all route distances (no caching in the VRPTW variant).
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        for (int r = 0; r < num_vehicles; r++)
            total += compute_route_dist(sol.data[r], sol.dim2_sizes[r]);
        return total;
    }

    // Objective registry — OBJ_DEFS[i] must match compute_obj case i.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol);
            default: return 0.0f;
        }
    }

    // Constraint penalties:
    //   - capacity: 100 per unit of overload on each route
    //   - time windows: 50 per unit of lateness at each customer and at the
    //     depot on return (early arrival is free: the vehicle waits)
    //   - fleet size: 1000 per non-empty route beyond max_vehicles
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        int active = 0;  // number of non-empty routes
        for (int r = 0; r < num_vehicles; r++) {
            int size = sol.dim2_sizes[r];
            if (size == 0) continue;
            active++;

            // Capacity constraint
            float load = 0.0f;
            for (int j = 0; j < size; j++)
                load += d_demand[sol.data[r][j]];
            if (load > capacity)
                penalty += (load - capacity) * 100.0f;

            // Time-window constraint: simulate driving the route, leaving the
            // depot at time 0. Order matters: travel, wait-if-early, lateness
            // check, then service time — do not reorder.
            float time = 0.0f;
            int prev = 0;
            for (int j = 0; j < size; j++) {
                int node = sol.data[r][j] + 1;
                float travel = d_dist[prev * stride + node];
                time += travel;
                // Early arrival: wait until the window opens
                if (time < d_earliest[node])
                    time = d_earliest[node];
                // Late arrival: linear penalty on the overshoot
                if (time > d_latest[node])
                    penalty += (time - d_latest[node]) * 50.0f;
                time += d_service[node];
                prev = node;
            }
            // Depot's own closing time applies to the return leg
            float return_time = time + d_dist[prev * stride + 0];
            if (return_time > d_latest[0])
                penalty += (return_time - d_latest[0]) * 50.0f;
        }
        if (active > max_vehicles)
            penalty += (float)(active - max_vehicles) * 1000.0f;
        return penalty;
    }

    // Multi-row permutation encoding: num_vehicles rows that together
    // partition the n customers; rows start empty (dim2_default = 0).
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = num_vehicles;
        cfg.dim2_default = 0;
        fill_obj_config(cfg);
        cfg.cross_row_prob = 0.3f;
        cfg.row_mode = RowMode::Partition;
        cfg.total_elements = n;
        return cfg;
    }

    static constexpr size_t SMEM_LIMIT = 48 * 1024;

    // Request all five tables in shared memory if they fit, else nothing.
    // NOTE(review): aux_bytes budgets 4*(n+1) floats while load_shared lays
    // out n + 3*(n+1) = 4n+3 — a one-float overestimate; harmless but confirm
    // it is intentional.
    size_t shared_mem_bytes() const {
        size_t dist_bytes = (size_t)stride * stride * sizeof(float);
        size_t aux_bytes = (size_t)(n + 1) * 4 * sizeof(float); // demand(n) + earliest/latest/service(n+1 each)
        size_t total = dist_bytes + aux_bytes;
        return total <= SMEM_LIMIT ? total : 0;
    }

    // Combined byte footprint of all tables (same budget as shared_mem_bytes).
    size_t working_set_bytes() const {
        return (size_t)stride * stride * sizeof(float) + (size_t)(n + 1) * 4 * sizeof(float);
    }

    // Block-stride cooperative copy of all tables into shared memory, then
    // repoint the device pointers (per-thread state).
    // Layout: [dist | demand(n) | earliest(n+1) | latest(n+1) | service(n+1)].
    // NOTE(review): no __syncthreads() here — assumes the caller issues a
    // block barrier before the first read; confirm at the call site.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sd = reinterpret_cast<float*>(smem);
        int dist_size = stride * stride;
        for (int i = tid; i < dist_size; i += bsz) sd[i] = d_dist[i];
        d_dist = sd;

        float* sdem = sd + dist_size;
        for (int i = tid; i < n; i += bsz) sdem[i] = d_demand[i];
        d_demand = sdem;

        float* se = sdem + n;
        int nn = n + 1;
        for (int i = tid; i < nn; i += bsz) se[i] = d_earliest[i];
        d_earliest = se;

        float* sl = se + nn;
        for (int i = tid; i < nn; i += bsz) sl[i] = d_latest[i];
        d_latest = sl;

        float* ss = sl + nn;
        for (int i = tid; i < nn; i += bsz) ss[i] = d_service[i];
        d_service = ss;
    }

    // Factory: upload all five host tables to the device. The depot occupies
    // index 0 of the (n+1)-sized time arrays and the distance matrix.
    static VRPTWProblem create(const float* h_dist, const float* h_demand,
                               const float* h_earliest, const float* h_latest,
                               const float* h_service,
                               int n, float capacity,
                               int num_vehicles, int max_vehicles) {
        VRPTWProblem prob;
        prob.n = n;
        prob.stride = n + 1;
        prob.capacity = capacity;
        prob.num_vehicles = num_vehicles;
        prob.max_vehicles = max_vehicles;

        int nn = n + 1;  // customers + depot
        float *dd, *ddem, *de, *dl, *ds;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * nn * nn));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * nn * nn, cudaMemcpyHostToDevice));
        prob.d_dist = dd;

        CUDA_CHECK(cudaMalloc(&ddem, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(ddem, h_demand, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_demand = ddem;

        CUDA_CHECK(cudaMalloc(&de, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(de, h_earliest, sizeof(float) * nn, cudaMemcpyHostToDevice));
        prob.d_earliest = de;

        CUDA_CHECK(cudaMalloc(&dl, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(dl, h_latest, sizeof(float) * nn, cudaMemcpyHostToDevice));
        prob.d_latest = dl;

        CUDA_CHECK(cudaMalloc(&ds, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(ds, h_service, sizeof(float) * nn, cudaMemcpyHostToDevice));
        prob.d_service = ds;

        return prob;
    }

    // Free every device buffer and null the pointers (safe to call twice).
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        if (d_demand) { cudaFree(const_cast<float*>(d_demand)); d_demand = nullptr; }
        if (d_earliest) { cudaFree(const_cast<float*>(d_earliest)); d_earliest = nullptr; }
        if (d_latest) { cudaFree(const_cast<float*>(d_latest)); d_latest = nullptr; }
        if (d_service) { cudaFree(const_cast<float*>(d_service)); d_service = nullptr; }
    }
};
|
||||
Loading…
Add table
Add a link
Reference in a new issue