Initial commit: cuGenOpt GPU optimization solver

2026-06-09 19:15:13 +02:00 · 2026-03-20 00:33:45 +08:00 · 2026-03-20 00:33:45 +08:00 · fc5a0ff4af
commit fc5a0ff4af
117 changed files with 25545 additions and 0 deletions
--- a/python/cugenopt/include/problems/assignment.cuh
+++ b/python/cugenopt/include/problems/assignment.cuh
@ -0,0 +1,114 @@
+/**
+ * assignment.cuh - 指派问题
+ * 
+ * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+#include "operators.cuh"
+
+// Assignment problem: row 0 of the solution is a permutation; entry i selects the
+// column of cost-matrix row i that contributes to the (minimized) total cost.
+struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
+    const float* d_cost;  // cost matrix read by device code (create() points it at a device copy)
+    const float* h_cost;  // host-side cost matrix (used by init_relation_matrix)
+    int n;                // row stride of the cost matrix
+
+    // ---- Objective evaluation ----
+    // Sums d_cost[i * n + assign[i]] over the entries of solution row 0.
+    __device__ float calc_total_cost(const Sol& sol) const {
+        const int* chosen = sol.data[0];
+        const int count = sol.dim2_sizes[0];
+        float sum = 0.0f;
+        for (int row = 0; row < count; ++row) {
+            sum += d_cost[row * n + chosen[row]];
+        }
+        return sum;
+    }
+
+    // ---- Objective table (OBJ_DEFS entries must match compute_obj one-to-one) ----
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: calc_total_cost
+    };
+    // Returns objective `idx`; any index other than 0 evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        if (idx == 0) {
+            return calc_total_cost(sol);   // OBJ_DEFS[0]
+        }
+        return 0.0f;
+    }
+
+    // No penalty term is computed for this problem.
+    __device__ float compute_penalty(const Sol& sol) const {
+        return 0.0f;
+    }
+
+    // Permutation encoding, one row of default length n.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Permutation;
+        cfg.dim1 = 1;
+        cfg.dim2_default = n;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // ---- Shared-memory interface ----
+    static constexpr size_t SMEM_LIMIT = 48 * 1024;
+
+    // Bytes needed to stage the n*n cost matrix, or 0 when that exceeds SMEM_LIMIT.
+    size_t shared_mem_bytes() const {
+        const size_t bytes = (size_t)n * n * sizeof(float);
+        if (bytes > SMEM_LIMIT) {
+            return 0;
+        }
+        return bytes;
+    }
+
+    // Size of the n*n cost matrix in bytes, regardless of SMEM_LIMIT.
+    size_t working_set_bytes() const {
+        return (size_t)n * n * sizeof(float);
+    }
+
+    // Copies elements tid, tid + bsz, ... of the cost matrix into `smem`, then
+    // repoints d_cost at that copy.
+    // NOTE(review): no barrier is issued here; presumably the caller synchronizes
+    // before the copy is read - confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* staged = reinterpret_cast<float*>(smem);
+        const int cells = n * n;
+        for (int idx = tid; idx < cells; idx += bsz) {
+            staged[idx] = d_cost[idx];
+        }
+        d_cost = staged;
+    }
+
+    // Cost prior: for every ordered pair of distinct columns (j, k), the cosine
+    // similarity of the two cost columns (scaled by the largest matrix entry) is
+    // written to G with factor 0.2 and to O with factor 0.05.
+    // Leaves G and O untouched when h_cost is null, N != n, or no entry is positive.
+    void init_relation_matrix(float* G, float* O, int N) const {
+        if (h_cost == nullptr || N != n) return;
+
+        // Largest entry, used to normalize before taking dot products
+        const int cells = N * N;
+        float peak = 0.0f;
+        for (int idx = 0; idx < cells; ++idx) {
+            if (h_cost[idx] > peak) peak = h_cost[idx];
+        }
+        if (peak <= 0.0f) return;
+
+        for (int a = 0; a < N; ++a) {
+            for (int b = 0; b < N; ++b) {
+                if (a == b) continue;
+                // Accumulate dot product and squared norms of columns a and b
+                float dot = 0.0f, norm_a = 0.0f, norm_b = 0.0f;
+                for (int row = 0; row < N; ++row) {
+                    const float va = h_cost[row * N + a] / peak;
+                    const float vb = h_cost[row * N + b] / peak;
+                    dot += va * vb;
+                    norm_a += va * va;
+                    norm_b += vb * vb;
+                }
+                const float denom = sqrtf(norm_a) * sqrtf(norm_b);
+                float sim = 0.0f;
+                if (denom > 1e-6f) {
+                    sim = dot / denom;
+                }
+                const int cell = a * N + b;
+                G[cell] = sim * 0.2f;
+                O[cell] = sim * 0.05f;
+            }
+        }
+    }
+
+    // Builds a problem from a host n*n cost matrix: keeps the host pointer in
+    // h_cost (not copied) and uploads a device copy for d_cost.
+    static AssignmentProblem create(const float* hc, int n) {
+        AssignmentProblem result;
+        result.n = n;
+        result.h_cost = hc;
+        float* device_cost;
+        CUDA_CHECK(cudaMalloc(&device_cost, sizeof(float)*n*n));
+        CUDA_CHECK(cudaMemcpy(device_cost, hc, sizeof(float)*n*n, cudaMemcpyHostToDevice));
+        result.d_cost = device_cost;
+        return result;
+    }
+
+    // Frees the device cost matrix (if any) and clears both pointers.
+    void destroy() {
+        if (d_cost != nullptr) {
+            cudaFree(const_cast<float*>(d_cost));
+            d_cost = nullptr;
+        }
+        h_cost = nullptr;
+    }
+};
--- a/python/cugenopt/include/problems/bin_packing.cuh
+++ b/python/cugenopt/include/problems/bin_packing.cuh
@ -0,0 +1,97 @@
+/**
+ * bin_packing.cuh - 一维装箱问题（Integer 编码 + 约束）
+ * 
+ * N 个物品，每个重量 w[i]，装入最多 B 个箱子，每个箱子容量 C。
+ * 决策变量：data[0][i] ∈ [0, B-1]，表示物品 i 放入的箱子编号。
+ * 目标：最小化使用的箱子数。
+ * 约束：每个箱子总重不超过 C，超出部分作为 penalty。
+ * 
+ * 验证实例：8 物品 weights=[7,5,3,4,6,2,8,1], C=10, 最优=4 箱
+ *   箱0={7,3}=10, 箱1={5,4,1}=10, 箱2={6,2}=8, 箱3={8}=8
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+
+// 1-D bin packing with Integer encoding: data[0][i] is the bin index of item i.
+// Objective: number of distinct bins used (minimized).
+// Penalty: 10x the total weight by which bins exceed `capacity`.
+struct BinPackingProblem : ProblemBase<BinPackingProblem, 1, 64> {
+    // Size of the per-thread scratch arrays used during evaluation. Bin indices
+    // at or above this cap are never touched, so max_bins > MAX_BINS can no
+    // longer write past the end of those arrays.
+    static constexpr int MAX_BINS = 32;
+
+    const float* d_weights;
+    int n;              // number of items
+    int max_bins;       // maximum number of bins B
+    float capacity;     // bin capacity C
+
+    // Number of bins that are actually evaluated: min(max_bins, MAX_BINS).
+    __host__ __device__ int bin_limit() const {
+        if (max_bins < MAX_BINS) return max_bins;
+        return MAX_BINS;
+    }
+
+    // Counts the distinct bin indices in [0, bin_limit()) that appear in row 0.
+    __device__ float calc_bins_used(const Sol& sol) const {
+        bool used[MAX_BINS] = {};
+        const int limit = bin_limit();
+        int size = sol.dim2_sizes[0];
+        for (int i = 0; i < size; i++) {
+            int b = sol.data[0][i];
+            if (b >= 0 && b < limit) used[b] = true;
+        }
+        int count = 0;
+        for (int b = 0; b < limit; b++)
+            if (used[b]) count++;
+        return (float)count;
+    }
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: calc_bins_used
+    };
+    // Returns objective `idx`; any index other than 0 evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        switch (idx) {
+            case 0: return calc_bins_used(sol);
+            default: return 0.0f;
+        }
+    }
+
+    // Accumulates per-bin load and charges 10x the overflow of every bin whose
+    // load exceeds `capacity`. Items with an out-of-range bin index are ignored.
+    __device__ float compute_penalty(const Sol& sol) const {
+        float penalty = 0.0f;
+        float load[MAX_BINS] = {};
+        const int limit = bin_limit();
+        int size = sol.dim2_sizes[0];
+        for (int i = 0; i < size; i++) {
+            int b = sol.data[0][i];
+            if (b >= 0 && b < limit)
+                load[b] += d_weights[i];
+        }
+        for (int b = 0; b < limit; b++) {
+            float over = load[b] - capacity;
+            if (over > 0.0f) penalty += over * 10.0f;
+        }
+        return penalty;
+    }
+
+    // Integer encoding, one row of default length n, values in [0, bin_limit() - 1].
+    // The upper bound uses bin_limit() so the configured range never contains a
+    // bin index the evaluators would ignore.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Integer;
+        cfg.dim1 = 1;  cfg.dim2_default = n;
+        cfg.value_lower_bound = 0;
+        cfg.value_upper_bound = bin_limit() - 1;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // Bytes needed to stage the n item weights.
+    size_t shared_mem_bytes() const {
+        return (size_t)n * sizeof(float);
+    }
+
+    // Copies elements tid, tid + bsz, ... of the weights into `smem`, then
+    // repoints d_weights at that copy.
+    // NOTE(review): no barrier is issued here; presumably the caller synchronizes
+    // before the copy is read - confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* sw = reinterpret_cast<float*>(smem);
+        for (int i = tid; i < n; i += bsz) sw[i] = d_weights[i];
+        d_weights = sw;
+    }
+
+    // Builds a problem from n host weights, uploading a device copy for d_weights.
+    // max_bins values above MAX_BINS are stored as given but evaluated as MAX_BINS.
+    static BinPackingProblem create(const float* h_weights, int n,
+                                     int max_bins, float capacity) {
+        BinPackingProblem prob;
+        prob.n = n; prob.max_bins = max_bins; prob.capacity = capacity;
+        float* dw;
+        CUDA_CHECK(cudaMalloc(&dw, sizeof(float) * n));
+        CUDA_CHECK(cudaMemcpy(dw, h_weights, sizeof(float) * n, cudaMemcpyHostToDevice));
+        prob.d_weights = dw;
+        return prob;
+    }
+
+    // Frees the device weights (if any) and clears the pointer.
+    void destroy() {
+        if (d_weights) cudaFree(const_cast<float*>(d_weights));
+        d_weights = nullptr;
+    }
+};
--- a/python/cugenopt/include/problems/graph_color.cuh
+++ b/python/cugenopt/include/problems/graph_color.cuh
@ -0,0 +1,79 @@
+/**
+ * graph_color.cuh - 图着色问题（Integer 编码）
+ * 
+ * N 个节点的图，用 k 种颜色着色。
+ * 决策变量：data[0][i] ∈ [0, k-1]，表示节点 i 的颜色。
+ * 目标：最小化冲突边数（相邻节点同色的边数）。
+ * 
+ * 验证实例：Petersen 图（10 节点 15 边，色数=3，最优冲突=0）
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+
+// Graph coloring with Integer encoding: data[0][i] is the color of node i.
+// Objective: number of adjacent node pairs that share a color (minimized).
+struct GraphColorProblem : ProblemBase<GraphColorProblem, 1, 64> {
+    const int* d_adj;   // adjacency matrix [N*N] (1 = adjacent, 0 = not adjacent)
+    int n;              // number of nodes
+    int k;              // number of colors
+
+    // Counts pairs i < j with a non-zero adjacency entry and equal colors.
+    __device__ float calc_conflicts(const Sol& sol) const {
+        const int count = sol.dim2_sizes[0];
+        int clashes = 0;
+        for (int u = 0; u < count; ++u) {
+            for (int v = u + 1; v < count; ++v) {
+                if (!d_adj[u * n + v]) continue;
+                if (sol.data[0][u] == sol.data[0][v]) {
+                    ++clashes;
+                }
+            }
+        }
+        return (float)clashes;
+    }
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: calc_conflicts
+    };
+    // Returns objective `idx`; any index other than 0 evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        if (idx == 0) {
+            return calc_conflicts(sol);
+        }
+        return 0.0f;
+    }
+
+    // No penalty term is computed for this problem.
+    __device__ float compute_penalty(const Sol& sol) const {
+        return 0.0f;
+    }
+
+    // Integer encoding, one row of default length n, values in [0, k - 1].
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Integer;
+        cfg.dim1 = 1;
+        cfg.dim2_default = n;
+        cfg.value_lower_bound = 0;
+        cfg.value_upper_bound = k - 1;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // Bytes needed to stage the n*n adjacency matrix.
+    size_t shared_mem_bytes() const {
+        return (size_t)n * n * sizeof(int);
+    }
+
+    // Copies elements tid, tid + bsz, ... of the adjacency matrix into `smem`,
+    // then repoints d_adj at that copy.
+    // NOTE(review): no barrier is issued here; presumably the caller synchronizes
+    // before the copy is read - confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        int* staged = reinterpret_cast<int*>(smem);
+        const int cells = n * n;
+        for (int idx = tid; idx < cells; idx += bsz) {
+            staged[idx] = d_adj[idx];
+        }
+        d_adj = staged;
+    }
+
+    // Builds a problem from a host n*n adjacency matrix, uploading a device copy.
+    static GraphColorProblem create(const int* h_adj, int n, int k) {
+        GraphColorProblem result;
+        result.n = n;
+        result.k = k;
+        int* device_adj;
+        CUDA_CHECK(cudaMalloc(&device_adj, sizeof(int) * n * n));
+        CUDA_CHECK(cudaMemcpy(device_adj, h_adj, sizeof(int) * n * n, cudaMemcpyHostToDevice));
+        result.d_adj = device_adj;
+        return result;
+    }
+
+    // Frees the device adjacency matrix (if any) and clears the pointer.
+    void destroy() {
+        if (d_adj != nullptr) {
+            cudaFree(const_cast<int*>(d_adj));
+        }
+        d_adj = nullptr;
+    }
+};
--- a/python/cugenopt/include/problems/jsp.cuh
+++ b/python/cugenopt/include/problems/jsp.cuh
@ -0,0 +1,271 @@
+/**
+ * jsp.cuh - 车间调度问题 (Job Shop Scheduling Problem)
+ * 
+ * J 个工件，每个工件有 O 道工序，每道工序指定机器和耗时。
+ * 
+ * === 编码方案 A：Integer 多行（时间表编码）===
+ * JSPProblem: data[j][i] = 工件 j 的第 i 道工序的开始时间
+ *   dim1 = num_jobs, dim2_default = num_ops
+ *   row_mode = Fixed（禁止 ROW_SPLIT/ROW_MERGE）
+ *   每行代表一个工件的固定工序序列，行长度不可变
+ * 
+ * === 编码方案 B：Permutation 多重集（工序排列编码）===
+ * JSPPermProblem: data[0][k] = 工件编号（0..J-1），长度 J*O
+ *   值 j 出现 O 次。从左到右扫描，第 t 次遇到值 j 表示工件 j 的第 t 道工序。
+ *   dim1 = 1, dim2_default = J*O, perm_repeat_count = O
+ *   标准 Permutation 算子（swap/reverse/insert）天然保持多重集结构
+ * 
+ * 目标：Minimize makespan（所有工件完成时间的最大值）。
+ * 约束：
+ *   (a) 工序顺序：同一工件的工序必须按序执行
+ *   (b) 机器冲突：同一机器同一时刻只能处理一个工序
+ * 
+ * 验证实例：自定义 3 工件 3 机器 (3x3)，最优 makespan = 12
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+
+// ============================================================
+// 编码方案 A：Integer 多行（时间表编码）
+// ============================================================
+
+// Job-shop scheduling, timetable encoding: data[j][i] is the start time of
+// operation i of job j. Objective: makespan (minimized). Penalty: precedence
+// violations within a job plus overlaps of operations on the same machine.
+struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
+    const int*   d_machine;     // machine required by each operation [J*O]
+    const float* d_duration;    // duration of each operation [J*O]
+    int num_jobs;               // number of jobs J
+    int num_ops;                // operations per job O
+    int num_machines;           // number of machines M
+    int time_horizon;           // upper bound on start times
+
+    // Largest finish time (start + duration) of the last operation over all jobs.
+    __device__ float calc_makespan(const Sol& sol) const {
+        const int final_op = num_ops - 1;
+        float latest = 0.0f;
+        for (int job = 0; job < num_jobs; ++job) {
+            const float finish = (float)sol.data[job][final_op] + d_duration[job * num_ops + final_op];
+            if (finish > latest) {
+                latest = finish;
+            }
+        }
+        return latest;
+    }
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: calc_makespan
+    };
+    // Returns objective `idx`; any index other than 0 evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        if (idx == 0) {
+            return calc_makespan(sol);
+        }
+        return 0.0f;
+    }
+
+    // Sum of 10x every precedence shortfall and 10x every same-machine overlap.
+    __device__ float compute_penalty(const Sol& sol) const {
+        float total_penalty = 0.0f;
+
+        // (a) Precedence: an operation may not start before its predecessor ends
+        for (int job = 0; job < num_jobs; ++job) {
+            for (int op = 1; op < num_ops; ++op) {
+                const float pred_finish = (float)sol.data[job][op - 1] + d_duration[job * num_ops + (op - 1)];
+                const float begin = (float)sol.data[job][op];
+                if (begin < pred_finish) {
+                    total_penalty += (pred_finish - begin) * 10.0f;
+                }
+            }
+        }
+
+        // (b) Machine conflicts: every unordered pair of operations on the same
+        // machine is charged for the length of its time overlap
+        const int op_count = num_jobs * num_ops;
+        for (int first = 0; first < op_count; ++first) {
+            const int mach = d_machine[first];
+            const float first_begin = (float)sol.data[first / num_ops][first % num_ops];
+            const float first_finish = first_begin + d_duration[first];
+            for (int second = first + 1; second < op_count; ++second) {
+                if (d_machine[second] == mach) {
+                    const float second_begin = (float)sol.data[second / num_ops][second % num_ops];
+                    const float second_finish = second_begin + d_duration[second];
+                    const float shared_time = fminf(first_finish, second_finish) - fmaxf(first_begin, second_begin);
+                    if (shared_time > 0.0f) {
+                        total_penalty += shared_time * 10.0f;
+                    }
+                }
+            }
+        }
+
+        return total_penalty;
+    }
+
+    // Integer encoding with num_jobs fixed-length rows of num_ops start times,
+    // each in [0, time_horizon - 1].
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Integer;
+        cfg.dim1 = num_jobs;
+        cfg.dim2_default = num_ops;
+        cfg.value_lower_bound = 0;
+        cfg.value_upper_bound = time_horizon - 1;
+        cfg.row_mode = RowMode::Fixed;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // Bytes needed to stage the machine table followed by the duration table.
+    size_t shared_mem_bytes() const {
+        const int op_count = num_jobs * num_ops;
+        return (size_t)op_count * (sizeof(int) + sizeof(float));
+    }
+
+    // Copies the machine table to the start of `smem` and the duration table
+    // right after it (elements tid, tid + bsz, ...), then repoints both members.
+    // NOTE(review): no barrier is issued here; presumably the caller synchronizes
+    // before the copies are read - confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        const int op_count = num_jobs * num_ops;
+
+        int* staged_machine = reinterpret_cast<int*>(smem);
+        for (int idx = tid; idx < op_count; idx += bsz) {
+            staged_machine[idx] = d_machine[idx];
+        }
+        d_machine = staged_machine;
+
+        float* staged_duration = reinterpret_cast<float*>(staged_machine + op_count);
+        for (int idx = tid; idx < op_count; idx += bsz) {
+            staged_duration[idx] = d_duration[idx];
+        }
+        d_duration = staged_duration;
+    }
+
+    // Builds a problem from host machine/duration tables of num_jobs * num_ops
+    // entries each, uploading device copies of both.
+    static JSPProblem create(const int* h_machine, const float* h_duration,
+                              int num_jobs, int num_ops, int num_machines,
+                              int time_horizon) {
+        JSPProblem result;
+        result.num_jobs = num_jobs;
+        result.num_ops = num_ops;
+        result.num_machines = num_machines;
+        result.time_horizon = time_horizon;
+
+        const int op_count = num_jobs * num_ops;
+
+        int* device_machine;
+        CUDA_CHECK(cudaMalloc(&device_machine, sizeof(int) * op_count));
+        CUDA_CHECK(cudaMemcpy(device_machine, h_machine, sizeof(int) * op_count, cudaMemcpyHostToDevice));
+        result.d_machine = device_machine;
+
+        float* device_duration;
+        CUDA_CHECK(cudaMalloc(&device_duration, sizeof(float) * op_count));
+        CUDA_CHECK(cudaMemcpy(device_duration, h_duration, sizeof(float) * op_count, cudaMemcpyHostToDevice));
+        result.d_duration = device_duration;
+
+        return result;
+    }
+
+    // Frees both device tables (if any) and clears the pointers.
+    void destroy() {
+        if (d_machine != nullptr) {
+            cudaFree(const_cast<int*>(d_machine));
+            d_machine = nullptr;
+        }
+        if (d_duration != nullptr) {
+            cudaFree(const_cast<float*>(d_duration));
+            d_duration = nullptr;
+        }
+    }
+};
+
+// ============================================================
+// 编码方案 B：Permutation 多重集（工序排列编码）
+// ============================================================
+// data[0] 是长度 J*O 的排列，值域 [0, J)，每个值出现 O 次
+// 从左到右扫描：第 t 次遇到值 j → 安排工件 j 的第 t 道工序
+// 贪心解码：每道工序安排在"最早可行时间"（满足工序顺序 + 机器空闲）
+
+// Job-shop scheduling, operation-sequence encoding: data[0] holds job ids in
+// [0, num_jobs); the t-th occurrence of job j schedules operation t of job j.
+// A greedy decoder places each operation at its earliest feasible start.
+struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
+    // Sizes of the per-thread scratch arrays used by the decoder. Instances
+    // larger than these caps are rejected with the 1e9f sentinel instead of
+    // writing past the end of the arrays.
+    static constexpr int MAX_JOBS = 8;
+    static constexpr int MAX_MACHINES = 8;
+
+    const int*   d_machine;     // machine required by each operation [J*O]
+    const float* d_duration;    // duration of each operation [J*O]
+    int num_jobs;
+    int num_ops;
+    int num_machines;
+
+    // Greedy decode: builds the schedule implied by the sequence and returns its
+    // makespan. Returns 1e9f when the instance exceeds MAX_JOBS / MAX_MACHINES,
+    // the row is shorter than num_jobs * num_ops, a job id is out of range, or a
+    // machine id read from d_machine is outside [0, num_machines).
+    __device__ float decode_and_makespan(const Sol& sol) const {
+        if (num_jobs > MAX_JOBS || num_machines > MAX_MACHINES) return 1e9f;
+
+        int total = num_jobs * num_ops;
+        int size = sol.dim2_sizes[0];
+        if (size < total) return 1e9f;
+
+        float job_avail[MAX_JOBS];        // earliest start of each job's next operation
+        float mach_avail[MAX_MACHINES];   // earliest time each machine is free
+        int   job_next_op[MAX_JOBS];      // index of each job's next unscheduled operation
+
+        for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; }
+        for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f;
+
+        float makespan = 0.0f;
+        for (int k = 0; k < total; k++) {
+            int j = sol.data[0][k];
+            if (j < 0 || j >= num_jobs) return 1e9f;
+            int op = job_next_op[j];
+            if (op >= num_ops) continue;  // this job is already fully scheduled
+
+            int flat = j * num_ops + op;
+            int m = d_machine[flat];
+            // An invalid machine id would index outside mach_avail
+            if (m < 0 || m >= num_machines) return 1e9f;
+            float dur = d_duration[flat];
+
+            // Earliest start = max(job's previous operation done, machine free)
+            float start = fmaxf(job_avail[j], mach_avail[m]);
+            float end = start + dur;
+
+            job_avail[j] = end;
+            mach_avail[m] = end;
+            job_next_op[j] = op + 1;
+
+            if (end > makespan) makespan = end;
+        }
+
+        return makespan;
+    }
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: decode_and_makespan
+    };
+    // Returns objective `idx`; any index other than 0 evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        switch (idx) {
+            case 0: return decode_and_makespan(sol);
+            default: return 0.0f;
+        }
+    }
+
+    // The greedy decoder only produces feasible schedules, so the penalty is always 0.
+    __device__ float compute_penalty(const Sol& sol) const {
+        return 0.0f;
+    }
+
+    // Permutation encoding, one row of length num_jobs * num_ops in which each
+    // value repeats num_ops times.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Permutation;
+        cfg.dim1 = 1;
+        cfg.dim2_default = num_jobs * num_ops;
+        cfg.perm_repeat_count = num_ops;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // Bytes needed to stage the machine table followed by the duration table.
+    size_t shared_mem_bytes() const {
+        int total = num_jobs * num_ops;
+        return (size_t)total * (sizeof(int) + sizeof(float));
+    }
+
+    // Copies the machine table to the start of `smem` and the duration table
+    // right after it (elements tid, tid + bsz, ...), then repoints both members.
+    // NOTE(review): no barrier is issued here; presumably the caller synchronizes
+    // before the copies are read - confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        int total = num_jobs * num_ops;
+        int* sm = reinterpret_cast<int*>(smem);
+        for (int i = tid; i < total; i += bsz) sm[i] = d_machine[i];
+        d_machine = sm;
+
+        float* sd = reinterpret_cast<float*>(sm + total);
+        for (int i = tid; i < total; i += bsz) sd[i] = d_duration[i];
+        d_duration = sd;
+    }
+
+    // Builds a problem from host machine/duration tables of num_jobs * num_ops
+    // entries each, uploading device copies of both. Instances with more than
+    // MAX_JOBS jobs or MAX_MACHINES machines are accepted here but evaluate to
+    // the 1e9f sentinel.
+    static JSPPermProblem create(const int* h_machine, const float* h_duration,
+                                  int num_jobs, int num_ops, int num_machines) {
+        JSPPermProblem prob;
+        prob.num_jobs = num_jobs;
+        prob.num_ops = num_ops;
+        prob.num_machines = num_machines;
+
+        int total = num_jobs * num_ops;
+        int* dm;
+        CUDA_CHECK(cudaMalloc(&dm, sizeof(int) * total));
+        CUDA_CHECK(cudaMemcpy(dm, h_machine, sizeof(int) * total, cudaMemcpyHostToDevice));
+        prob.d_machine = dm;
+
+        float* dd;
+        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * total));
+        CUDA_CHECK(cudaMemcpy(dd, h_duration, sizeof(float) * total, cudaMemcpyHostToDevice));
+        prob.d_duration = dd;
+
+        return prob;
+    }
+
+    // Frees both device tables (if any) and clears the pointers.
+    void destroy() {
+        if (d_machine)  { cudaFree(const_cast<int*>(d_machine));     d_machine = nullptr; }
+        if (d_duration) { cudaFree(const_cast<float*>(d_duration));  d_duration = nullptr; }
+    }
+};
--- a/python/cugenopt/include/problems/knapsack.cuh
+++ b/python/cugenopt/include/problems/knapsack.cuh
@ -0,0 +1,88 @@
+/**
+ * knapsack.cuh - 0-1 背包问题
+ * 
+ * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+#include "operators.cuh"
+
+// 0-1 knapsack with Binary encoding: a non-zero data[0][i] selects item i.
+// Objective: total value of selected items (maximized).
+// Penalty: total selected weight in excess of `capacity`.
+struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
+    // Problem data (d_weights holds item weights, not objective weights)
+    const float* d_weights;
+    const float* d_values;
+    float capacity;
+    int n;
+
+    // ---- Objective evaluation ----
+    // Sums d_values[i] over every selected entry of row 0.
+    __device__ float calc_total_value(const Sol& sol) const {
+        const int* picked = sol.data[0];
+        const int count = sol.dim2_sizes[0];
+        float value_sum = 0.0f;
+        for (int item = 0; item < count; ++item) {
+            if (picked[item]) {
+                value_sum += d_values[item];
+            }
+        }
+        return value_sum;
+    }
+
+    // ---- Objective table (OBJ_DEFS entries must match compute_obj one-to-one) ----
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Maximize, 1.0f, 0.0f},   // index 0: calc_total_value
+    };
+    // Returns objective `idx`; any index other than 0 evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        if (idx == 0) {
+            return calc_total_value(sol);   // OBJ_DEFS[0]
+        }
+        return 0.0f;
+    }
+
+    // Returns max(0, selected weight - capacity).
+    __device__ float compute_penalty(const Sol& sol) const {
+        const int* picked = sol.data[0];
+        const int count = sol.dim2_sizes[0];
+        float weight_sum = 0.0f;
+        for (int item = 0; item < count; ++item) {
+            if (picked[item]) {
+                weight_sum += d_weights[item];
+            }
+        }
+        const float excess = weight_sum - capacity;
+        if (excess > 0.0f) {
+            return excess;
+        }
+        return 0.0f;
+    }
+
+    // Binary encoding, one row of default length n.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Binary;
+        cfg.dim1 = 1;
+        cfg.dim2_default = n;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // ---- Shared-memory interface ----
+    // Bytes needed to stage n weights followed by n values.
+    size_t shared_mem_bytes() const {
+        return 2 * (size_t)n * sizeof(float);
+    }
+
+    // Copies weights to the start of `smem` and values right after them
+    // (elements tid, tid + bsz, ...), then repoints both members at the copies.
+    // NOTE(review): no barrier is issued here; presumably the caller synchronizes
+    // before the copies are read - confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* staged_weights = reinterpret_cast<float*>(smem);
+        float* staged_values = staged_weights + n;
+        for (int idx = tid; idx < n; idx += bsz) {
+            staged_weights[idx] = d_weights[idx];
+            staged_values[idx] = d_values[idx];
+        }
+        d_weights = staged_weights;
+        d_values = staged_values;
+    }
+
+    // Builds a problem from n host weights and n host values, uploading device
+    // copies of both.
+    static KnapsackProblem create(const float* hw, const float* hv, int n, float cap) {
+        KnapsackProblem result;
+        result.n = n;
+        result.capacity = cap;
+        float* device_weights;
+        float* device_values;
+        CUDA_CHECK(cudaMalloc(&device_weights, sizeof(float)*n));
+        CUDA_CHECK(cudaMalloc(&device_values, sizeof(float)*n));
+        CUDA_CHECK(cudaMemcpy(device_weights, hw, sizeof(float)*n, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(device_values, hv, sizeof(float)*n, cudaMemcpyHostToDevice));
+        result.d_weights = device_weights;
+        result.d_values = device_values;
+        return result;
+    }
+
+    // Frees both device arrays (if any) and clears the pointers.
+    void destroy() {
+        if (d_weights != nullptr) {
+            cudaFree(const_cast<float*>(d_weights));
+        }
+        if (d_values != nullptr) {
+            cudaFree(const_cast<float*>(d_values));
+        }
+        d_weights = nullptr;
+        d_values = nullptr;
+    }
+};
--- a/python/cugenopt/include/problems/load_balance.cuh
+++ b/python/cugenopt/include/problems/load_balance.cuh
@ -0,0 +1,83 @@
+/**
+ * load_balance.cuh - 离散负载均衡问题（Integer 编码验证）
+ * 
+ * N 个任务分配到 M 台机器，每个任务有一个处理时间 p[i]。
+ * 决策变量：data[0][i] ∈ [0, M-1]，表示任务 i 分配到哪台机器。
+ * 目标：最小化 makespan（最大机器负载）。
+ * 
+ * 已知 NP-hard（等价于 multiprocessor scheduling / load balancing）。
+ * LPT（最长处理时间优先）贪心可得 4/3 近似。
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+
+// Discrete load balancing with Integer encoding: data[0][i] is the machine that
+// task i is assigned to. Objective: largest machine load (makespan, minimized).
+struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
+    // Size of the per-thread load array used during evaluation. Machine indices
+    // at or above this cap are never touched, so m > MAX_MACHINES can no longer
+    // write past the end of that array.
+    static constexpr int MAX_MACHINES = 32;
+
+    const float* d_proc_time;   // task processing times [N]
+    int n;                      // number of tasks
+    int m;                      // number of machines
+
+    // Number of machines that are actually evaluated: min(m, MAX_MACHINES).
+    __host__ __device__ int machine_limit() const {
+        if (m < MAX_MACHINES) return m;
+        return MAX_MACHINES;
+    }
+
+    // Accumulates per-machine load and returns the largest one. Tasks with an
+    // out-of-range machine index are ignored.
+    __device__ float calc_makespan(const Sol& sol) const {
+        float load[MAX_MACHINES] = {};
+        const int limit = machine_limit();
+        int size = sol.dim2_sizes[0];
+        for (int i = 0; i < size; i++) {
+            int machine = sol.data[0][i];
+            if (machine >= 0 && machine < limit)
+                load[machine] += d_proc_time[i];
+        }
+        float max_load = 0.0f;
+        for (int j = 0; j < limit; j++)
+            if (load[j] > max_load) max_load = load[j];
+        return max_load;
+    }
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: makespan
+    };
+    // Returns objective `idx`; any index other than 0 evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        switch (idx) {
+            case 0: return calc_makespan(sol);
+            default: return 0.0f;
+        }
+    }
+
+    // Unconstrained: every assignment is treated as feasible.
+    __device__ float compute_penalty(const Sol& sol) const {
+        return 0.0f;
+    }
+
+    // Integer encoding, one row of default length n, values in
+    // [0, machine_limit() - 1]. The upper bound uses machine_limit() so the
+    // configured range never contains a machine index calc_makespan would ignore.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Integer;
+        cfg.dim1 = 1;  cfg.dim2_default = n;
+        cfg.value_lower_bound = 0;
+        cfg.value_upper_bound = machine_limit() - 1;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // Bytes needed to stage the n processing times.
+    size_t shared_mem_bytes() const {
+        return (size_t)n * sizeof(float);
+    }
+
+    // Copies elements tid, tid + bsz, ... of the processing times into `smem`,
+    // then repoints d_proc_time at that copy.
+    // NOTE(review): no barrier is issued here; presumably the caller synchronizes
+    // before the copy is read - confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* sp = reinterpret_cast<float*>(smem);
+        for (int i = tid; i < n; i += bsz) sp[i] = d_proc_time[i];
+        d_proc_time = sp;
+    }
+
+    // Builds a problem from n host processing times, uploading a device copy.
+    // m values above MAX_MACHINES are stored as given but evaluated as MAX_MACHINES.
+    static LoadBalanceProblem create(const float* h_proc_time, int n, int m) {
+        LoadBalanceProblem prob;
+        prob.n = n; prob.m = m;
+        float* dp;
+        CUDA_CHECK(cudaMalloc(&dp, sizeof(float) * n));
+        CUDA_CHECK(cudaMemcpy(dp, h_proc_time, sizeof(float) * n, cudaMemcpyHostToDevice));
+        prob.d_proc_time = dp;
+        return prob;
+    }
+
+    // Frees the device processing times (if any) and clears the pointer.
+    void destroy() {
+        if (d_proc_time) cudaFree(const_cast<float*>(d_proc_time));
+        d_proc_time = nullptr;
+    }
+};
--- a/python/cugenopt/include/problems/qap.cuh
+++ b/python/cugenopt/include/problems/qap.cuh
@ -0,0 +1,84 @@
+/**
+ * qap.cuh - 二次分配问题 (Quadratic Assignment Problem)
+ * 
+ * N 个设施分配到 N 个位置（排列编码）。
+ * 决策变量：data[0][i] = 设施 i 分配到的位置。
+ * 目标：Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
+ * 
+ * 验证实例：自定义 5x5
+ *   flow: 设施间的物流量
+ *   dist: 位置间的距离
+ *   已知最优 = 58
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+
+// Quadratic assignment with Permutation encoding: data[0][i] is the location
+// assigned to facility i.
+// Objective: sum over (i, j) of flow[i][j] * dist[perm[i]][perm[j]] (minimized).
+struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
+    const float* d_flow;    // flow matrix [N*N]
+    const float* d_dist;    // distance matrix [N*N]
+    int n;
+
+    // Evaluates the double sum over all ordered pairs of entries in row 0.
+    __device__ float calc_cost(const Sol& sol) const {
+        const int count = sol.dim2_sizes[0];
+        float total = 0.0f;
+        for (int a = 0; a < count; ++a) {
+            for (int b = 0; b < count; ++b) {
+                const float flow = d_flow[a * n + b];
+                const float dist = d_dist[sol.data[0][a] * n + sol.data[0][b]];
+                total += flow * dist;
+            }
+        }
+        return total;
+    }
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: calc_cost
+    };
+    // Returns objective `idx`; any index other than 0 evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        if (idx == 0) {
+            return calc_cost(sol);
+        }
+        return 0.0f;
+    }
+
+    // No penalty term is computed for this problem.
+    __device__ float compute_penalty(const Sol& sol) const {
+        return 0.0f;
+    }
+
+    // Permutation encoding, one row of default length n.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Permutation;
+        cfg.dim1 = 1;
+        cfg.dim2_default = n;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // Bytes needed to stage the n*n flow matrix followed by the n*n distance matrix.
+    size_t shared_mem_bytes() const {
+        return 2 * (size_t)n * n * sizeof(float);
+    }
+
+    // Copies the flow matrix to the start of `smem` and the distance matrix
+    // right after it (elements tid, tid + bsz, ...), then repoints both members.
+    // NOTE(review): no barrier is issued here; presumably the caller synchronizes
+    // before the copies are read - confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* staged_flow = reinterpret_cast<float*>(smem);
+        float* staged_dist = staged_flow + n * n;
+        const int cells = n * n;
+        for (int idx = tid; idx < cells; idx += bsz) {
+            staged_flow[idx] = d_flow[idx];
+            staged_dist[idx] = d_dist[idx];
+        }
+        d_flow = staged_flow;
+        d_dist = staged_dist;
+    }
+
+    // Builds a problem from host n*n flow and distance matrices, uploading
+    // device copies of both.
+    static QAPProblem create(const float* h_flow, const float* h_dist, int n) {
+        QAPProblem result;
+        result.n = n;
+        float* device_flow;
+        float* device_dist;
+        CUDA_CHECK(cudaMalloc(&device_flow, sizeof(float) * n * n));
+        CUDA_CHECK(cudaMalloc(&device_dist, sizeof(float) * n * n));
+        CUDA_CHECK(cudaMemcpy(device_flow, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(device_dist, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
+        result.d_flow = device_flow;
+        result.d_dist = device_dist;
+        return result;
+    }
+
+    // Frees both device matrices (if any) and clears the pointers.
+    void destroy() {
+        if (d_flow != nullptr) {
+            cudaFree(const_cast<float*>(d_flow));
+        }
+        if (d_dist != nullptr) {
+            cudaFree(const_cast<float*>(d_dist));
+        }
+        d_flow = nullptr;
+        d_dist = nullptr;
+    }
+};
--- a/python/cugenopt/include/problems/schedule.cuh
+++ b/python/cugenopt/include/problems/schedule.cuh
@ -0,0 +1,101 @@
+/**
+ * schedule.cuh - 排班问题
+ * 
+ * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ * 2 个目标：总成本（min）+ 不公平度（min，权重更高）
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+#include "operators.cuh"
+
+// Shift scheduling with Binary encoding: a non-zero data[d][e] means employee e
+// works on day d. Two minimized objectives: total cost (weight 1) and
+// unfairness (weight 5). Penalty: per-day deviation from `required` headcount.
+struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
+    const float* d_cost;
+    int days, emps, required;
+
+    // ---- Objective evaluation ----
+    // Sums d_cost[d * emps + e] over every worked (day, employee) cell.
+    __device__ float calc_total_cost(const Sol& sol) const {
+        float sum = 0.0f;
+        for (int day = 0; day < days; ++day) {
+            const int base = day * emps;
+            for (int emp = 0; emp < emps; ++emp) {
+                if (sol.data[day][emp]) {
+                    sum += d_cost[base + emp];
+                }
+            }
+        }
+        return sum;
+    }
+
+    // Difference between the largest and smallest number of worked days among
+    // the employees.
+    __device__ float calc_unfairness(const Sol& sol) const {
+        int most = 0;
+        int fewest = days;
+        for (int emp = 0; emp < emps; ++emp) {
+            // Worked-day count for this employee
+            int worked = 0;
+            for (int day = 0; day < days; ++day) {
+                if (sol.data[day][emp]) ++worked;
+            }
+            if (worked > most) most = worked;
+            if (worked < fewest) fewest = worked;
+        }
+        return (float)(most - fewest);
+    }
+
+    // ---- Objective table (OBJ_DEFS entries must match compute_obj one-to-one) ----
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: calc_total_cost
+        {ObjDir::Minimize, 5.0f, 0.0f},   // index 1: calc_unfairness
+    };
+    // Returns objective `idx`; any index other than 0 or 1 evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        if (idx == 0) {
+            return calc_total_cost(sol);     // OBJ_DEFS[0]
+        }
+        if (idx == 1) {
+            return calc_unfairness(sol);     // OBJ_DEFS[1]
+        }
+        return 0.0f;
+    }
+
+    // Sum over days of |headcount - required|.
+    __device__ float compute_penalty(const Sol& sol) const {
+        float total_penalty = 0.0f;
+        for (int day = 0; day < days; ++day) {
+            int headcount = 0;
+            for (int emp = 0; emp < emps; ++emp) {
+                if (sol.data[day][emp]) ++headcount;
+            }
+            int gap = headcount - required;
+            if (gap <= 0) {
+                gap = -gap;
+            }
+            total_penalty += (float)gap;
+        }
+        return total_penalty;
+    }
+
+    // Binary encoding with `days` fixed-length rows of `emps` entries.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Binary;
+        cfg.dim1 = days;
+        cfg.dim2_default = emps;
+        cfg.row_mode = RowMode::Fixed;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // evaluate_move is not overridden: the base-class full re-evaluation is used.
+
+    // ---- Shared-memory interface ----
+    // Bytes needed to stage the days*emps cost matrix.
+    size_t shared_mem_bytes() const {
+        return (size_t)days * emps * sizeof(float);
+    }
+
+    // Copies elements tid, tid + bsz, ... of the cost matrix into `smem`, then
+    // repoints d_cost at that copy.
+    // NOTE(review): no barrier is issued here; presumably the caller synchronizes
+    // before the copy is read - confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* staged = reinterpret_cast<float*>(smem);
+        const int cells = days * emps;
+        for (int idx = tid; idx < cells; idx += bsz) {
+            staged[idx] = d_cost[idx];
+        }
+        d_cost = staged;
+    }
+
+    // Builds a problem from a host days*emps cost matrix, uploading a device copy.
+    static ScheduleProblem create(const float* hc, int days, int emps, int req) {
+        ScheduleProblem result;
+        result.days = days;
+        result.emps = emps;
+        result.required = req;
+        float* device_cost;
+        CUDA_CHECK(cudaMalloc(&device_cost, sizeof(float)*days*emps));
+        CUDA_CHECK(cudaMemcpy(device_cost, hc, sizeof(float)*days*emps, cudaMemcpyHostToDevice));
+        result.d_cost = device_cost;
+        return result;
+    }
+
+    // Frees the device cost matrix (if any) and clears the pointer.
+    void destroy() {
+        if (d_cost != nullptr) {
+            cudaFree(const_cast<float*>(d_cost));
+            d_cost = nullptr;
+        }
+    }
+};
--- a/python/cugenopt/include/problems/tsp.cuh
+++ b/python/cugenopt/include/problems/tsp.cuh
@ -0,0 +1,110 @@
+/**
+ * tsp.cuh - TSP 问题定义
+ * 
+ * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+#include "operators.cuh"
+
+// Travelling-salesman problem encoded as a single permutation row.
+// NOTE(review): the ProblemBase arguments <1, 64> look like the row-count and
+// row-length limits — confirm against types.cuh.
+struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
+    // Problem data
+    const float* d_dist;  // distance matrix read by device code, indexed [from * n + to]
+    const float* h_dist;  // host-side distance matrix (used by init_relation_matrix)
+    int n;                // side length of the distance matrix
+
+    // ---- Objective evaluation ----
+    // Length of the closed tour stored in row 0: every consecutive leg plus
+    // the leg from the last position back to the first.
+    __device__ float calc_total_distance(const Sol& sol) const {
+        const int* tour = sol.data[0];
+        const int len = sol.dim2_sizes[0];
+        float sum = 0.0f;
+        for (int pos = 0; pos < len; ++pos) {
+            const int nxt = (pos + 1 == len) ? 0 : pos + 1;   // wrap around to close the tour
+            sum += d_dist[tour[pos] * n + tour[nxt]];
+        }
+        return sum;
+    }
+
+    // ---- Objective table (OBJ_DEFS entries and compute_obj indices must match one-to-one) ----
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: calc_total_distance
+    };
+    // Objective 0 is the tour length; every other index evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        if (idx == 0) return calc_total_distance(sol);   // OBJ_DEFS[0]
+        return 0.0f;
+    }
+
+    // TSP has no constraints, so the penalty is always 0.
+    __device__ float compute_penalty(const Sol& /*sol*/) const {
+        return 0.0f;
+    }
+
+    // ---- config (encoding and dimensions; the objective part comes from fill_obj_config) ----
+    ProblemConfig config() const {
+        ProblemConfig out;
+        out.encoding     = EncodingType::Permutation;
+        out.dim1         = 1;
+        out.dim2_default = n;
+        fill_obj_config(out);
+        return out;
+    }
+
+    // ---- Shared-memory interface ----
+    static constexpr size_t SMEM_LIMIT = 48 * 1024;
+
+    // Size in bytes of the n x n float distance matrix.
+    size_t working_set_bytes() const {
+        return (size_t)n * n * sizeof(float);
+    }
+
+    // Bytes requested for staging the matrix, or 0 when it exceeds SMEM_LIMIT.
+    size_t shared_mem_bytes() const {
+        const size_t bytes = working_set_bytes();
+        if (bytes > SMEM_LIMIT) return 0;
+        return bytes;
+    }
+
+    // Copies the n*n matrix into `smem` (this caller handles indices tid,
+    // tid+bsz, ...) and repoints d_dist at the staged copy.
+    // NOTE(review): no barrier is issued here — presumably the caller
+    // synchronizes before the data is read; confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* staged = reinterpret_cast<float*>(smem);
+        const int cells = n * n;
+        int k = tid;
+        while (k < cells) {
+            staged[k] = d_dist[k];
+            k += bsz;
+        }
+        d_dist = staged;
+    }
+
+    // Distance prior: the closer two cities are, the higher their G/O score.
+    // Off-diagonal entries get G = 0.3 * proximity and O = 0.1 * proximity,
+    // where proximity = 1 - dist / max_dist. Nothing is written when h_dist is
+    // null, N differs from n, or the largest distance is not positive.
+    void init_relation_matrix(float* G, float* O, int N) const {
+        if (!h_dist || N != n) return;
+
+        // Largest entry of the matrix, used for normalization.
+        float peak = 0.0f;
+        const int cells = N * N;
+        for (int k = 0; k < cells; ++k) {
+            if (h_dist[k] > peak) peak = h_dist[k];
+        }
+        if (peak <= 0.0f) return;
+
+        for (int row = 0; row < N; ++row) {
+            for (int col = 0; col < N; ++col) {
+                if (row != col) {
+                    const int at = row * N + col;
+                    const float closeness = 1.0f - h_dist[at] / peak;
+                    G[at] = closeness * 0.3f;
+                    O[at] = closeness * 0.1f;
+                }
+            }
+        }
+    }
+
+    // Publishes the host distance matrix as out[0]. Returns the number of
+    // entries written: 1, or 0 when there is no room or no host matrix.
+    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
+        if (max_count >= 1 && h_dist) {
+            out[0] = {h_dist, n};
+            return 1;
+        }
+        return 0;
+    }
+
+    // Builds an instance: keeps the host pointer and uploads the n*n matrix to
+    // a freshly allocated device buffer (released by destroy()).
+    static TSPProblem create(const float* h_dist_ptr, int n) {
+        const size_t bytes = sizeof(float) * n * n;
+        float* dev = nullptr;
+        CUDA_CHECK(cudaMalloc(&dev, bytes));
+        CUDA_CHECK(cudaMemcpy(dev, h_dist_ptr, bytes, cudaMemcpyHostToDevice));
+
+        TSPProblem prob;
+        prob.n = n;
+        prob.h_dist = h_dist_ptr;
+        prob.d_dist = dev;
+        return prob;
+    }
+
+    // Frees the device matrix (if any) and forgets the host pointer, which is
+    // not freed here.
+    void destroy() {
+        if (d_dist != nullptr) {
+            cudaFree(const_cast<float*>(d_dist));
+            d_dist = nullptr;
+        }
+        h_dist = nullptr;
+    }
+};
--- a/python/cugenopt/include/problems/tsp_large.cuh
+++ b/python/cugenopt/include/problems/tsp_large.cuh
@ -0,0 +1,107 @@
+/**
+ * tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市)
+ * 
+ * 继承 ProblemBase，逻辑与 tsp.cuh 一致，仅 D2 上限不同
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+#include "operators.cuh"
+
+// Large TSP variant: one permutation row, ProblemBase row length 256.
+struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
+    const float* d_dist;  // distance matrix read by device code, indexed [from * n + to]
+    const float* h_dist;  // host-side distance matrix (read by init_relation_matrix / heuristic_matrices)
+    int n;                // side length of the distance matrix
+
+    // ---- Objective evaluation ----
+    // Sums the legs of the closed tour held in row 0, including the leg from
+    // the last position back to the first.
+    __device__ float calc_total_distance(const Sol& sol) const {
+        const int* order = sol.data[0];
+        const int count = sol.dim2_sizes[0];
+        float length = 0.0f;
+        for (int cur = 0; cur < count; ++cur) {
+            int succ = cur + 1;
+            if (succ == count) succ = 0;   // last city links back to the first
+            length += d_dist[order[cur] * n + order[succ]];
+        }
+        return length;
+    }
+
+    // ---- Objective table (OBJ_DEFS entries and compute_obj indices must match one-to-one) ----
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // index 0: calc_total_distance
+    };
+    // Index 0 evaluates the tour length; any other index evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        return (idx == 0) ? calc_total_distance(sol)   // OBJ_DEFS[0]
+                          : 0.0f;
+    }
+
+    // No constraints: the penalty is always 0.
+    __device__ float compute_penalty(const Sol& /*sol*/) const {
+        return 0.0f;
+    }
+
+    // Permutation encoding, one row of n elements; objective fields are added
+    // by fill_obj_config.
+    ProblemConfig config() const {
+        ProblemConfig result;
+        result.dim1         = 1;
+        result.dim2_default = n;
+        result.encoding     = EncodingType::Permutation;
+        fill_obj_config(result);
+        return result;
+    }
+
+    static constexpr size_t SMEM_LIMIT = 48 * 1024;
+
+    // Bytes requested for staging the matrix; 0 when it would exceed SMEM_LIMIT.
+    size_t shared_mem_bytes() const {
+        const size_t matrix_bytes = (size_t)n * n * sizeof(float);
+        if (matrix_bytes <= SMEM_LIMIT) return matrix_bytes;
+        return 0;
+    }
+
+    // Actual size of the distance matrix, whether or not it is staged in smem.
+    size_t working_set_bytes() const {
+        return (size_t)n * n * sizeof(float);
+    }
+
+    // Copies the n*n matrix into `smem` with stride bsz starting at tid, then
+    // repoints d_dist at the copy.
+    // NOTE(review): the caller presumably synchronizes afterwards — confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* dst = reinterpret_cast<float*>(smem);
+        const int cells = n * n;
+        for (int k = tid; k < cells; k += bsz) {
+            dst[k] = d_dist[k];
+        }
+        d_dist = dst;
+    }
+
+    // Seeds G and O from distances: off-diagonal (i, j) gets
+    // G = 0.3 * (1 - d_ij / d_max) and O = 0.1 * (1 - d_ij / d_max).
+    // Returns without writing when h_dist is null, N != n, or d_max <= 0.
+    void init_relation_matrix(float* G, float* O, int N) const {
+        if (h_dist == nullptr) return;
+        if (N != n) return;
+
+        float farthest = 0.0f;
+        for (int r = 0; r < N; ++r) {
+            const float* line = h_dist + r * N;
+            for (int c = 0; c < N; ++c) {
+                if (line[c] > farthest) farthest = line[c];
+            }
+        }
+        if (farthest <= 0.0f) return;
+
+        for (int r = 0; r < N; ++r) {
+            for (int c = 0; c < N; ++c) {
+                if (r == c) continue;
+                const int cell = r * N + c;
+                const float nearness = 1.0f - h_dist[cell] / farthest;
+                G[cell] = nearness * 0.3f;
+                O[cell] = nearness * 0.1f;
+            }
+        }
+    }
+
+    // Writes {h_dist, n} to out[0] and returns 1; returns 0 without writing
+    // when max_count < 1 or there is no host matrix.
+    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
+        const bool usable = (max_count >= 1) && (h_dist != nullptr);
+        if (!usable) return 0;
+        out[0] = {h_dist, n};
+        return 1;
+    }
+
+    // Keeps the host pointer and uploads the n*n matrix into a new device
+    // buffer; destroy() releases that buffer.
+    static TSPLargeProblem create(const float* h_dist_ptr, int n) {
+        TSPLargeProblem prob;
+        prob.h_dist = h_dist_ptr;
+        prob.n = n;
+
+        const size_t bytes = sizeof(float) * n * n;
+        float* device_copy;
+        CUDA_CHECK(cudaMalloc(&device_copy, bytes));
+        CUDA_CHECK(cudaMemcpy(device_copy, h_dist_ptr, bytes, cudaMemcpyHostToDevice));
+        prob.d_dist = device_copy;
+        return prob;
+    }
+
+    // Releases the device matrix if present and clears both pointers; the
+    // host matrix itself is not freed.
+    void destroy() {
+        if (d_dist) {
+            cudaFree(const_cast<float*>(d_dist));
+        }
+        d_dist = nullptr;
+        h_dist = nullptr;
+    }
+};
--- a/python/cugenopt/include/problems/tsp_xlarge.cuh
+++ b/python/cugenopt/include/problems/tsp_xlarge.cuh
@ -0,0 +1,99 @@
+/**
+ * tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市)
+ * 
+ * 继承 ProblemBase，逻辑与 tsp_large.cuh 一致，D2=512
+ * 注意：距离矩阵 512×512×4B = 1MB，远超 48KB shared memory
+ *       因此 shared_mem_bytes() 返回 0，距离矩阵留在 global memory
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+#include "operators.cuh"
+
+// Extra-large TSP variant: one permutation row, ProblemBase row length 512.
+// The distance matrix is never staged in shared memory (shared_mem_bytes()
+// returns 0 and load_shared() does nothing).
+struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
+    const float* d_dist;  // distance matrix read by device code, indexed [from * n + to]
+    const float* h_dist;  // host-side distance matrix (used by init_relation_matrix)
+    int n;                // side length of the distance matrix
+
+    // Length of the closed tour in row 0 (consecutive legs plus the closing leg).
+    __device__ float calc_total_distance(const Sol& sol) const {
+        const int* seq = sol.data[0];
+        const int m = sol.dim2_sizes[0];
+        float acc = 0.0f;
+        int at = 0;
+        while (at < m) {
+            const int to = (at + 1 < m) ? at + 1 : 0;   // wrap on the final leg
+            acc += d_dist[seq[at] * n + seq[to]];
+            ++at;
+        }
+        return acc;
+    }
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},
+    };
+    // Objective 0 is the tour length; other indices evaluate to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        if (idx != 0) return 0.0f;
+        return calc_total_distance(sol);
+    }
+
+    // No constraints: the penalty is always 0.
+    __device__ float compute_penalty(const Sol& /*sol*/) const {
+        return 0.0f;
+    }
+
+    // Permutation encoding with a single row of n elements; fill_obj_config
+    // adds the objective fields.
+    ProblemConfig config() const {
+        ProblemConfig c;
+        c.encoding = EncodingType::Permutation;
+        c.dim2_default = n;
+        c.dim1 = 1;
+        fill_obj_config(c);
+        return c;
+    }
+
+    // The distance matrix is too large, so it is not placed in shared memory.
+    size_t shared_mem_bytes() const {
+        return 0;
+    }
+    __device__ void load_shared(char*, int, int) {
+        // Intentionally empty: nothing is staged.
+    }
+
+    // Size in bytes of the n x n float distance matrix.
+    size_t working_set_bytes() const {
+        const size_t cells = (size_t)n * n;
+        return cells * sizeof(float);
+    }
+
+    // Initializes the G/O priors from the distance matrix: closer pairs score
+    // higher. Does nothing when h_dist is null, N != n, or no distance is
+    // positive; diagonal entries are never written.
+    void init_relation_matrix(float* G, float* O, int N) const {
+        if (!h_dist || N != n) return;
+
+        // Largest distance, used for normalization.
+        float d_max = 0.0f;
+        for (int a = 0; a < N; ++a) {
+            for (int b = 0; b < N; ++b) {
+                const float d = h_dist[a * N + b];
+                if (d > d_max) d_max = d;
+            }
+        }
+        if (d_max <= 0.0f) return;
+
+        for (int a = 0; a < N; ++a) {
+            for (int b = 0; b < N; ++b) {
+                if (a == b) continue;
+                const int slot = a * N + b;
+                const float proximity = 1.0f - h_dist[slot] / d_max;
+                // Close pairs get a high G (strong grouping tendency); the
+                // initial signal is kept weak to leave room for the EMA.
+                G[slot] = proximity * 0.3f;
+                // Close pairs also get a small, symmetric O signal that does
+                // not favor either direction.
+                O[slot] = proximity * 0.1f;
+            }
+        }
+    }
+
+    // Writes {h_dist, n} into out[0] and returns 1, or returns 0 when
+    // max_count < 1 or no host matrix is set.
+    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
+        if (max_count < 1) return 0;
+        if (!h_dist) return 0;
+        out[0] = {h_dist, n};
+        return 1;
+    }
+
+    // Keeps the host pointer and uploads the n*n matrix to a new device
+    // buffer, which destroy() releases.
+    static TSPXLargeProblem create(const float* h_dist_ptr, int n) {
+        const size_t nbytes = sizeof(float) * n * n;
+
+        TSPXLargeProblem prob;
+        prob.n = n;
+        prob.h_dist = h_dist_ptr;  // keep the host pointer
+
+        float* gpu_copy;
+        CUDA_CHECK(cudaMalloc(&gpu_copy, nbytes));
+        CUDA_CHECK(cudaMemcpy(gpu_copy, h_dist_ptr, nbytes, cudaMemcpyHostToDevice));
+        prob.d_dist = gpu_copy;
+        return prob;
+    }
+
+    // Frees the device matrix when present and clears both pointers; the host
+    // matrix is not freed.
+    void destroy() {
+        if (d_dist) {
+            float* owned = const_cast<float*>(d_dist);
+            cudaFree(owned);
+            d_dist = nullptr;
+        }
+        h_dist = nullptr;
+    }
+};
--- a/python/cugenopt/include/problems/vrp.cuh
+++ b/python/cugenopt/include/problems/vrp.cuh
@ -0,0 +1,184 @@
+/**
+ * vrp.cuh - 容量约束车辆路径问题 (CVRP)
+ * 
+ * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ * 多行编码（D1=K 条路线，分区初始化 + 跨行算子）
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+#include "operators.cuh"
+#include "gpu_cache.cuh"
+
+// Capacitated vehicle routing problem. Each row of a solution is one route of
+// customer indices; the depot is node 0 of the distance matrix and customer
+// index c corresponds to matrix node c + 1.
+struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
+    // GPU data
+    const float* d_dist;    // stride x stride distance matrix (depot included)
+    const float* d_demand;  // per-customer demand, n entries
+    const float* h_dist;    // host-side distance matrix (depot included, used by init_relation_matrix)
+    int n;                  // number of customers (depot excluded)
+    int stride;             // row length of the distance matrix; create() sets it to n + 1
+    float capacity;         // load above this on a route is penalized
+    int num_vehicles;       // number of routes (rows) evaluated
+    int max_vehicles;       // non-empty routes beyond this count are penalized
+    GpuCache cache;         // optional route-distance cache; disabled by create()
+    
+    // ---- Objective evaluation ----
+    // Length of one route: depot -> route[0] -> ... -> route[size-1] -> depot.
+    // An empty route has length 0.
+    __device__ float compute_route_dist(const int* route, int size) const {
+        if (size == 0) return 0.0f;
+        float dist = 0.0f;
+        int prev = 0;                      // start at the depot
+        for (int j = 0; j < size; j++) {
+            int node = route[j] + 1;       // customer index -> matrix node
+            dist += d_dist[prev * stride + node];
+            prev = node;
+        }
+        dist += d_dist[prev * stride + 0]; // return leg to the depot
+        return dist;
+    }
+    
+    // Route length with optional memoization: when the cache is enabled
+    // (cache.keys non-null) the route hash is looked up first; on a miss the
+    // length is computed and inserted. Hit/miss counters are updated
+    // atomically.
+    __device__ float eval_route(const int* route, int size) const {
+        if (size == 0) return 0.0f;
+        if (!cache.keys) return compute_route_dist(route, size);
+        
+        uint64_t key = route_hash(route, size);
+        float dist;
+        if (cache_lookup(cache, key, dist)) {
+            atomicAdd(cache.d_hits, 1);
+            return dist;
+        }
+        dist = compute_route_dist(route, size);
+        cache_insert(cache, key, dist);
+        atomicAdd(cache.d_misses, 1);
+        return dist;
+    }
+    
+    // Sum of the route lengths over the first num_vehicles rows.
+    __device__ float calc_total_distance(const Sol& sol) const {
+        float total = 0.0f;
+        for (int r = 0; r < num_vehicles; r++)
+            total += eval_route(sol.data[r], sol.dim2_sizes[r]);
+        return total;
+    }
+    
+    // ---- Objective table (OBJ_DEFS entries and compute_obj cases must match one-to-one) ----
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_distance
+    };
+    // Objective 0 is the total distance; any other index evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        switch (idx) {
+            case 0: return calc_total_distance(sol);   // OBJ_DEFS[0]
+            default: return 0.0f;
+        }
+    }
+    
+    // Constraint penalty: 100 per unit of load above `capacity` on each
+    // non-empty route, plus 1000 per non-empty route beyond `max_vehicles`.
+    __device__ float compute_penalty(const Sol& sol) const {
+        float penalty = 0.0f;
+        int active = 0;                    // number of non-empty routes
+        for (int r = 0; r < num_vehicles; r++) {
+            int size = sol.dim2_sizes[r];
+            if (size == 0) continue;
+            active++;
+            float load = 0.0f;
+            for (int j = 0; j < size; j++)
+                load += d_demand[sol.data[r][j]];
+            if (load > capacity)
+                penalty += (load - capacity) * 100.0f;
+        }
+        if (active > max_vehicles)
+            penalty += (float)(active - max_vehicles) * 1000.0f;
+        return penalty;
+    }
+    
+    // Permutation encoding with num_vehicles rows in partition mode over n
+    // elements and cross-row probability 0.3; objective fields come from
+    // fill_obj_config.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Permutation;
+        cfg.dim1 = num_vehicles;
+        cfg.dim2_default = 0;
+        fill_obj_config(cfg);
+        cfg.cross_row_prob = 0.3f;
+        cfg.row_mode = RowMode::Partition;
+        cfg.total_elements = n;
+        return cfg;
+    }
+    
+    // ---- Shared-memory interface ----
+    static constexpr size_t SMEM_LIMIT = 48 * 1024;
+    
+    // Bytes requested for staging the distance matrix plus the demand array,
+    // or 0 when together they exceed SMEM_LIMIT.
+    size_t shared_mem_bytes() const {
+        size_t dist_bytes = (size_t)stride * stride * sizeof(float);
+        size_t demand_bytes = (size_t)n * sizeof(float);
+        size_t total = dist_bytes + demand_bytes;
+        return total <= SMEM_LIMIT ? total : 0;
+    }
+    
+    // Combined size of the distance matrix and the demand array in bytes.
+    size_t working_set_bytes() const {
+        return (size_t)stride * stride * sizeof(float) + (size_t)n * sizeof(float);
+    }
+    
+    // Copies the distance matrix and then the demand array into `smem`
+    // (demand is placed right after the matrix), with this caller handling
+    // indices tid, tid+bsz, ..., and repoints d_dist / d_demand at the copies.
+    // NOTE(review): no barrier is issued here — presumably the caller
+    // synchronizes before the staged data is read; confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* sd = reinterpret_cast<float*>(smem);
+        int dist_size = stride * stride;
+        for (int i = tid; i < dist_size; i += bsz) sd[i] = d_dist[i];
+        d_dist = sd;
+        float* sdem = sd + dist_size;
+        for (int i = tid; i < n; i += bsz) sdem[i] = d_demand[i];
+        d_demand = sdem;
+    }
+    
+    // Enables the route cache, passing `cap` to GpuCache::allocate. A cache
+    // that is already held is released first so that calling this more than
+    // once does not leak the earlier allocation; destroy() likewise calls
+    // cache.destroy() on the disabled cache installed by create().
+    void enable_cache(int cap = 65536) {
+        cache.destroy();
+        cache = GpuCache::allocate(cap);
+    }
+    void print_cache_stats() const { cache.print_stats(); }
+    
+    // Distance prior: the closer two customers are, the higher their G/O score.
+    // Note: h_dist includes the depot (stride x stride), so element indices
+    // 0..n-1 correspond to matrix nodes 1..n. Nothing is written when h_dist
+    // is null, N != n, or the largest customer-to-customer distance is not
+    // positive; diagonal entries are never written.
+    void init_relation_matrix(float* G, float* O, int N) const {
+        if (!h_dist || N != n) return;
+        float max_d = 0.0f;
+        for (int i = 0; i < N; i++)
+            for (int j = 0; j < N; j++) {
+                float d = h_dist[(i + 1) * stride + (j + 1)];  // skip the depot row/column
+                if (d > max_d) max_d = d;
+            }
+        if (max_d <= 0.0f) return;
+        for (int i = 0; i < N; i++)
+            for (int j = 0; j < N; j++) {
+                if (i == j) continue;
+                float d = h_dist[(i + 1) * stride + (j + 1)];
+                float proximity = 1.0f - d / max_d;
+                G[i * N + j] = proximity * 0.3f;
+                O[i * N + j] = proximity * 0.1f;
+            }
+    }
+    
+    // Builds an instance: stores the scalar parameters, keeps the host
+    // distance pointer, starts with the cache disabled, and uploads the
+    // (n+1) x (n+1) distance matrix and the n demands to new device buffers.
+    // destroy() releases those buffers.
+    static VRPProblem create(const float* h_dist_ptr, const float* h_demand,
+                              int n, float capacity,
+                              int num_vehicles, int max_vehicles) {
+        VRPProblem prob;
+        prob.n = n;
+        prob.stride = n + 1;
+        prob.capacity = capacity;
+        prob.num_vehicles = num_vehicles;
+        prob.max_vehicles = max_vehicles;
+        prob.cache = GpuCache::disabled();
+        prob.h_dist = h_dist_ptr;
+        
+        int n_nodes = n + 1;
+        float* dd;
+        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes));
+        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n_nodes * n_nodes, cudaMemcpyHostToDevice));
+        prob.d_dist = dd;
+        
+        float* ddem;
+        CUDA_CHECK(cudaMalloc(&ddem, sizeof(float) * n));
+        CUDA_CHECK(cudaMemcpy(ddem, h_demand, sizeof(float) * n, cudaMemcpyHostToDevice));
+        prob.d_demand = ddem;
+        
+        return prob;
+    }
+    
+    // Frees the device buffers that are set, clears the pointers (the host
+    // matrix is not freed) and destroys the cache.
+    void destroy() {
+        if (d_dist)   { cudaFree(const_cast<float*>(d_dist));   d_dist = nullptr; }
+        if (d_demand) { cudaFree(const_cast<float*>(d_demand)); d_demand = nullptr; }
+        h_dist = nullptr;
+        cache.destroy();
+    }
+};
--- a/python/cugenopt/include/problems/vrptw.cuh
+++ b/python/cugenopt/include/problems/vrptw.cuh
@ -0,0 +1,192 @@
+/**
+ * vrptw.cuh - 带时间窗的车辆路径问题 (VRPTW)
+ * 
+ * 在 CVRP 基础上增加时间窗约束。
+ * 编码：Perm 多行分区（同 CVRP），data[r][j] = 路线 r 的第 j 个客户。
+ * 目标：Minimize 总距离。
+ * 约束：(a) 容量约束, (b) 时间窗约束（到达时间必须 ≤ latest，早到需等待）。
+ * 
+ * 验证实例：8 客户 3 车, 手工设计坐标+时间窗, 确保有已知可行解。
+ */
+
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+
+// Vehicle routing with capacity and time-window constraints. Each solution
+// row is one route of customer indices; the depot is node 0 and customer
+// index c corresponds to node c + 1 in the per-node arrays.
+struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
+    const float* d_dist;        // distance matrix [(n+1)*(n+1)] (depot included)
+    const float* d_demand;      // demand [n]
+    const float* d_earliest;    // earliest service time [n+1] (depot included)
+    const float* d_latest;      // latest service time [n+1] (depot included)
+    const float* d_service;     // service duration [n+1] (depot included)
+    int n;                      // number of customers (depot excluded)
+    int stride;                 // n+1
+    float capacity;
+    int num_vehicles;
+    int max_vehicles;
+
+    // Length of one route: depot -> each customer in order -> depot.
+    // An empty route has length 0.
+    __device__ float compute_route_dist(const int* route, int size) const {
+        if (size == 0) return 0.0f;
+        float length = 0.0f;
+        int from = 0;                          // start at the depot
+        int j = 0;
+        while (j < size) {
+            const int to = route[j] + 1;       // customer index -> node index
+            length += d_dist[from * stride + to];
+            from = to;
+            ++j;
+        }
+        length += d_dist[from * stride + 0];   // closing leg back to the depot
+        return length;
+    }
+
+    // Sum of route lengths over the first num_vehicles rows.
+    __device__ float calc_total_distance(const Sol& sol) const {
+        float sum = 0.0f;
+        for (int v = 0; v < num_vehicles; ++v) {
+            sum += compute_route_dist(sol.data[v], sol.dim2_sizes[v]);
+        }
+        return sum;
+    }
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f},
+    };
+    // Objective 0 is the total distance; any other index evaluates to 0.
+    __device__ float compute_obj(int idx, const Sol& sol) const {
+        if (idx == 0) return calc_total_distance(sol);
+        return 0.0f;
+    }
+
+    // Constraint penalty, accumulated per non-empty route:
+    //   - 100 per unit of load above `capacity`;
+    //   - 50 per time unit of lateness at each customer and on the return
+    //     to the depot (early arrivals wait until the window opens);
+    // plus 1000 per non-empty route beyond `max_vehicles`.
+    __device__ float compute_penalty(const Sol& sol) const {
+        float total = 0.0f;
+        int used_routes = 0;
+        for (int r = 0; r < num_vehicles; ++r) {
+            const int len = sol.dim2_sizes[r];
+            if (len != 0) {
+                ++used_routes;
+                const int* stops = sol.data[r];
+
+                // Capacity constraint
+                float carried = 0.0f;
+                for (int j = 0; j < len; ++j) {
+                    carried += d_demand[stops[j]];
+                }
+                if (carried > capacity) {
+                    total += (carried - capacity) * 100.0f;
+                }
+
+                // Time-window constraint: simulate driving the route
+                float clock = 0.0f;
+                int at = 0;
+                for (int j = 0; j < len; ++j) {
+                    const int node = stops[j] + 1;
+                    clock += d_dist[at * stride + node];
+                    const float opens = d_earliest[node];
+                    const float closes = d_latest[node];
+                    // Arriving early means waiting until the window opens
+                    if (clock < opens) clock = opens;
+                    // Arriving late is penalized
+                    if (clock > closes) total += (clock - closes) * 50.0f;
+                    clock += d_service[node];
+                    at = node;
+                }
+
+                // Time window for the return to the depot
+                const float back_at = clock + d_dist[at * stride + 0];
+                const float depot_closes = d_latest[0];
+                if (back_at > depot_closes) {
+                    total += (back_at - depot_closes) * 50.0f;
+                }
+            }
+        }
+        if (used_routes > max_vehicles) {
+            total += (float)(used_routes - max_vehicles) * 1000.0f;
+        }
+        return total;
+    }
+
+    // Permutation encoding with num_vehicles rows in partition mode over n
+    // elements and cross-row probability 0.3; objective fields come from
+    // fill_obj_config.
+    ProblemConfig config() const {
+        ProblemConfig out;
+        out.dim1         = num_vehicles;
+        out.dim2_default = 0;
+        out.encoding     = EncodingType::Permutation;
+        fill_obj_config(out);
+        out.total_elements = n;
+        out.row_mode       = RowMode::Partition;
+        out.cross_row_prob = 0.3f;
+        return out;
+    }
+
+    static constexpr size_t SMEM_LIMIT = 48 * 1024;
+
+    // Size of the distance matrix plus 4*(n+1) floats for the auxiliary
+    // arrays: demand(n) + earliest/latest/service(n+1 each).
+    size_t working_set_bytes() const {
+        return (size_t)stride * stride * sizeof(float) + (size_t)(n + 1) * 4 * sizeof(float);
+    }
+
+    // Bytes requested for staging all arrays, or 0 when that exceeds SMEM_LIMIT.
+    size_t shared_mem_bytes() const {
+        const size_t bytes = working_set_bytes();
+        if (bytes > SMEM_LIMIT) return 0;
+        return bytes;
+    }
+
+    // Strided copy of `count` floats: this caller handles indices tid,
+    // tid+bsz, tid+2*bsz, ...
+    __device__ static void stage_floats(float* dst, const float* src, int count, int tid, int bsz) {
+        for (int k = tid; k < count; k += bsz) dst[k] = src[k];
+    }
+
+    // Stages distance, demand, earliest, latest and service (in that order,
+    // back to back) into `smem` and repoints the members at the copies.
+    // NOTE(review): no barrier is issued here — presumably the caller
+    // synchronizes before the staged data is read; confirm.
+    __device__ void load_shared(char* smem, int tid, int bsz) {
+        float* cursor = reinterpret_cast<float*>(smem);
+        const int dist_count = stride * stride;
+        const int node_count = n + 1;
+
+        stage_floats(cursor, d_dist, dist_count, tid, bsz);
+        d_dist = cursor;
+        cursor += dist_count;
+
+        stage_floats(cursor, d_demand, n, tid, bsz);
+        d_demand = cursor;
+        cursor += n;
+
+        stage_floats(cursor, d_earliest, node_count, tid, bsz);
+        d_earliest = cursor;
+        cursor += node_count;
+
+        stage_floats(cursor, d_latest, node_count, tid, bsz);
+        d_latest = cursor;
+        cursor += node_count;
+
+        stage_floats(cursor, d_service, node_count, tid, bsz);
+        d_service = cursor;
+    }
+
+    // Allocates a device buffer of `count` floats and fills it from `host`.
+    static const float* upload_floats(const float* host, size_t count) {
+        float* dev;
+        CUDA_CHECK(cudaMalloc(&dev, sizeof(float) * count));
+        CUDA_CHECK(cudaMemcpy(dev, host, sizeof(float) * count, cudaMemcpyHostToDevice));
+        return dev;
+    }
+
+    // Builds an instance: stores the scalar parameters (stride = n + 1) and
+    // uploads the (n+1)x(n+1) distance matrix, the n demands and the three
+    // (n+1)-entry time arrays to new device buffers released by destroy().
+    static VRPTWProblem create(const float* h_dist, const float* h_demand,
+                                const float* h_earliest, const float* h_latest,
+                                const float* h_service,
+                                int n, float capacity,
+                                int num_vehicles, int max_vehicles) {
+        VRPTWProblem prob;
+        prob.n = n;
+        prob.stride = n + 1;
+        prob.capacity = capacity;
+        prob.num_vehicles = num_vehicles;
+        prob.max_vehicles = max_vehicles;
+
+        const int nodes = n + 1;
+        prob.d_dist     = upload_floats(h_dist, (size_t)nodes * nodes);
+        prob.d_demand   = upload_floats(h_demand, (size_t)n);
+        prob.d_earliest = upload_floats(h_earliest, (size_t)nodes);
+        prob.d_latest   = upload_floats(h_latest, (size_t)nodes);
+        prob.d_service  = upload_floats(h_service, (size_t)nodes);
+        return prob;
+    }
+
+    // Frees one device buffer if it is set and clears the pointer.
+    static void release(const float*& ptr) {
+        if (ptr) {
+            cudaFree(const_cast<float*>(ptr));
+            ptr = nullptr;
+        }
+    }
+
+    // Frees every device buffer that is set and clears the pointers.
+    void destroy() {
+        release(d_dist);
+        release(d_demand);
+        release(d_earliest);
+        release(d_latest);
+        release(d_service);
+    }
+};