Initial commit: cuGenOpt GPU optimization solver

2026-05-01 13:12:37 +02:00 · 2026-03-20 00:33:45 +08:00 · 2026-03-20 00:33:45 +08:00 · fc5a0ff4af
commit fc5a0ff4af
117 changed files with 25545 additions and 0 deletions
--- a/benchmark/experiments/e12_extreme_scale/extreme_tsp.cuh
+++ b/benchmark/experiments/e12_extreme_scale/extreme_tsp.cuh
@ -0,0 +1,82 @@
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+#include "operators.cuh"
+
+// Extreme-scale TSP (up to 2048 cities).
+// A solution is a single permutation row (s.data[0]) holding the visiting
+// order of the n cities; the objective is the length of the closed tour.
+struct ExtremeTSPProblem : ProblemBase<ExtremeTSPProblem, 1, 2048> {
+    const float* d_dist;  // device copy of the n x n distance matrix, indexed [from * n + to]
+    const float* h_dist;  // host matrix handed to create(); not owned, re-read by clone_to_device()
+    int n;                // number of cities
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f}
+    };
+
+    // Returns the length of the closed tour described by s.data[0][0..n-1]:
+    // the sum of consecutive edges plus the edge from the last city back to
+    // the first. obj_idx is unused (there is a single objective).
+    __device__ float compute_obj(int obj_idx, const Sol& s) const {
+        // An empty instance has no edges. Without this guard the closing edge
+        // below would read s.data[0][-1].
+        if (n <= 0) return 0.0f;
+
+        float total = 0;
+        for (int i = 0; i < n - 1; i++) {
+            int from = s.data[0][i];
+            int to = s.data[0][i + 1];
+            total += d_dist[from * n + to];
+        }
+        // Close the tour: last city back to the first one.
+        total += d_dist[s.data[0][n - 1] * n + s.data[0][0]];
+        return total;
+    }
+
+    // This problem has no constraints, so the penalty is always zero.
+    __device__ float compute_penalty(const Sol& s) const {
+        return 0.0f;
+    }
+
+    // Describes the encoding to the solver: one permutation row of length n.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Permutation;
+        cfg.dim1 = 1;
+        cfg.dim2_default = n;
+        fill_obj_config(cfg);
+        return cfg;
+    }
+
+    // Size in bytes of the distance matrix read by compute_obj.
+    size_t working_set_bytes() const {
+        return (size_t)n * n * sizeof(float);
+    }
+
+    // Builds a problem for num_cities cities and uploads h_dist_matrix
+    // (num_cities * num_cities floats) to the current device.
+    // The host pointer is stored, not copied, so it has to stay valid for as
+    // long as clone_to_device() may be called on the result.
+    static ExtremeTSPProblem create(const float* h_dist_matrix, int num_cities) {
+        ExtremeTSPProblem prob;
+        prob.n = num_cities;
+        prob.h_dist = h_dist_matrix;
+        prob.d_dist = nullptr;
+
+        const size_t dist_size = (size_t)num_cities * num_cities * sizeof(float);
+
+        // Allocate and fill through a mutable pointer (as clone_to_device does)
+        // instead of casting the const member; publish it once it is filled.
+        float* dev_dist = nullptr;
+        CUDA_CHECK(cudaMalloc(&dev_dist, dist_size));
+        CUDA_CHECK(cudaMemcpy(dev_dist, h_dist_matrix, dist_size, cudaMemcpyHostToDevice));
+        prob.d_dist = dev_dist;
+
+        return prob;
+    }
+
+    // Releases the device matrix. Safe to call repeatedly: the pointer is
+    // reset to nullptr after the first call. h_dist is not owned and is left alone.
+    void destroy() {
+        if (d_dist) {
+            cudaFree(const_cast<float*>(d_dist));
+            d_dist = nullptr;
+        }
+    }
+
+    // Creates a heap-allocated copy of this problem whose distance matrix is
+    // uploaded from h_dist to GPU `target_gpu`. The previously current device
+    // is restored before returning.
+    // NOTE(review): the caller presumably has to destroy() and delete the
+    // returned object; confirm against the multi-GPU solver.
+    ExtremeTSPProblem* clone_to_device(int target_gpu) const {
+        int orig_device;
+        CUDA_CHECK(cudaGetDevice(&orig_device));
+        CUDA_CHECK(cudaSetDevice(target_gpu));
+
+        float* dd = nullptr;
+        const size_t dist_size = (size_t)n * n * sizeof(float);
+        CUDA_CHECK(cudaMalloc(&dd, dist_size));
+        CUDA_CHECK(cudaMemcpy(dd, h_dist, dist_size, cudaMemcpyHostToDevice));
+
+        CUDA_CHECK(cudaSetDevice(orig_device));
+
+        ExtremeTSPProblem* new_prob = new ExtremeTSPProblem();
+        new_prob->n = n;
+        new_prob->h_dist = h_dist;
+        new_prob->d_dist = dd;
+
+        return new_prob;
+    }
+};
--- a/benchmark/experiments/e12_extreme_scale/extreme_vrp.cuh
+++ b/benchmark/experiments/e12_extreme_scale/extreme_vrp.cuh
@ -0,0 +1,131 @@
+#pragma once
+#include "types.cuh"
+#include "cuda_utils.cuh"
+#include "operators.cuh"
+
+// Extreme-scale VRP (up to 1000 customers, 160 vehicles).
+// D1=160, D2=128 -> Solution = 160 x 128 x 4 = 80 KB
+//
+// Row v of a solution lists the customers served by vehicle v, in visiting
+// order, as 0-based customer ids. The distance matrix has (n+1) x (n+1)
+// entries with the depot at index 0, so customer c is matrix node c + 1.
+struct ExtremeVRPProblem : ProblemBase<ExtremeVRPProblem, 160, 128> {
+    const float* d_dist;    // device distance matrix, (n+1) x (n+1), depot = node 0
+    const float* d_demand;  // device demand per customer, n entries
+    const float* h_dist;    // host distance matrix (not owned), re-read by clone_to_device()
+    const float* h_demand;  // host demand array (not owned), re-read by clone_to_device()
+    int n;                  // number of customers
+    float capacity;         // load limit per vehicle
+    int num_vehicles;       // number of routes evaluated
+    int max_vehicles;       // stored and copied, but not read anywhere in this struct
+
+    static constexpr ObjDef OBJ_DEFS[] = {
+        {ObjDir::Minimize, 1.0f, 0.0f}
+    };
+
+    // Total travel distance over all non-empty routes; each route runs
+    // depot -> customers in order -> depot. obj_idx is unused.
+    __device__ float compute_obj(int obj_idx, const Sol& s) const {
+        const int stride = n + 1;  // row length of the distance matrix
+        float total = 0;
+        for (int veh = 0; veh < num_vehicles; ++veh) {
+            const int len = s.dim2_sizes[veh];
+            if (len != 0) {
+                // Depot (row 0) to the first customer of the route.
+                int cur = s.data[veh][0] + 1;
+                total += d_dist[cur];
+
+                // Customer-to-customer legs.
+                for (int k = 1; k < len; ++k) {
+                    const int nxt = s.data[veh][k] + 1;
+                    total += d_dist[cur * stride + nxt];
+                    cur = nxt;
+                }
+
+                // Last customer back to the depot (column 0).
+                total += d_dist[cur * stride];
+            }
+        }
+        return total;
+    }
+
+    // Capacity penalty: for every vehicle whose summed demand exceeds
+    // `capacity`, adds 100 times the excess.
+    __device__ float compute_penalty(const Sol& s) const {
+        float penalty = 0;
+        for (int veh = 0; veh < num_vehicles; ++veh) {
+            const int len = s.dim2_sizes[veh];
+            float load = 0;
+            for (int k = 0; k < len; ++k) {
+                load += d_demand[s.data[veh][k]];
+            }
+            const bool overloaded = load > capacity;
+            if (overloaded) {
+                penalty += (load - capacity) * 100.0f;
+            }
+        }
+        return penalty;
+    }
+
+    // Describes the encoding to the solver: num_vehicles rows that partition
+    // n elements, with rows starting at length 0.
+    ProblemConfig config() const {
+        ProblemConfig cfg;
+        cfg.encoding = EncodingType::Permutation;
+        cfg.dim1 = num_vehicles;
+        cfg.dim2_default = 0;
+        fill_obj_config(cfg);
+        cfg.cross_row_prob = 0.3f;
+        cfg.row_mode = RowMode::Partition;
+        cfg.total_elements = n;
+        return cfg;
+    }
+
+    // Bytes of device data read during evaluation: distance matrix plus demands.
+    size_t working_set_bytes() const {
+        const size_t nodes = (size_t)(n + 1);
+        const size_t dist_bytes = nodes * nodes * sizeof(float);
+        const size_t demand_bytes = (size_t)n * sizeof(float);
+        return dist_bytes + demand_bytes;
+    }
+
+    // Builds a problem and uploads the (num_customers+1)^2 distance matrix and
+    // the num_customers demand values to the current device. The host pointers
+    // are stored, not copied.
+    static ExtremeVRPProblem create(const float* h_dist_matrix, const float* h_demand_array,
+                                   int num_customers, float vehicle_capacity,
+                                   int num_veh, int max_veh) {
+        const size_t dist_bytes = (size_t)(num_customers + 1) * (num_customers + 1) * sizeof(float);
+        const size_t demand_bytes = (size_t)num_customers * sizeof(float);
+
+        float* dev_dist;
+        float* dev_demand;
+        CUDA_CHECK(cudaMalloc(&dev_dist, dist_bytes));
+        CUDA_CHECK(cudaMalloc(&dev_demand, demand_bytes));
+        CUDA_CHECK(cudaMemcpy(dev_dist, h_dist_matrix, dist_bytes, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(dev_demand, h_demand_array, demand_bytes, cudaMemcpyHostToDevice));
+
+        ExtremeVRPProblem prob;
+        prob.d_dist = dev_dist;
+        prob.d_demand = dev_demand;
+        prob.h_dist = h_dist_matrix;
+        prob.h_demand = h_demand_array;
+        prob.n = num_customers;
+        prob.capacity = vehicle_capacity;
+        prob.num_vehicles = num_veh;
+        prob.max_vehicles = max_veh;
+        return prob;
+    }
+
+    // Frees both device buffers and resets the pointers to nullptr.
+    void destroy() {
+        if (d_dist) {
+            cudaFree((void*)d_dist);
+            d_dist = nullptr;
+        }
+        if (d_demand) {
+            cudaFree((void*)d_demand);
+            d_demand = nullptr;
+        }
+    }
+
+    // Creates a heap-allocated copy of this problem whose device buffers are
+    // uploaded from the stored host pointers to GPU `target_gpu`. The
+    // previously current device is restored before returning.
+    ExtremeVRPProblem* clone_to_device(int target_gpu) const {
+        int previous_device;
+        CUDA_CHECK(cudaGetDevice(&previous_device));
+        CUDA_CHECK(cudaSetDevice(target_gpu));
+
+        const size_t dist_bytes = (size_t)(n + 1) * (n + 1) * sizeof(float);
+        const size_t demand_bytes = (size_t)n * sizeof(float);
+
+        float* dev_dist;
+        float* dev_demand;
+        CUDA_CHECK(cudaMalloc(&dev_dist, dist_bytes));
+        CUDA_CHECK(cudaMalloc(&dev_demand, demand_bytes));
+        CUDA_CHECK(cudaMemcpy(dev_dist, h_dist, dist_bytes, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(dev_demand, h_demand, demand_bytes, cudaMemcpyHostToDevice));
+
+        CUDA_CHECK(cudaSetDevice(previous_device));
+
+        ExtremeVRPProblem* copy = new ExtremeVRPProblem();
+        copy->d_dist = dev_dist;
+        copy->d_demand = dev_demand;
+        copy->h_dist = h_dist;
+        copy->h_demand = h_demand;
+        copy->n = n;
+        copy->capacity = capacity;
+        copy->num_vehicles = num_vehicles;
+        copy->max_vehicles = max_vehicles;
+        return copy;
+    }
+};
--- a/benchmark/experiments/e12_extreme_scale/test_e12.cu
+++ b/benchmark/experiments/e12_extreme_scale/test_e12.cu
@ -0,0 +1,167 @@
+#include "solver.cuh"
+#include "multi_gpu_solver.cuh"
+#include "extreme_tsp.cuh"
+#include "extreme_vrp.cuh"
+#include <cstdio>
+#include <vector>
+
+// Fills `dist` (n * n floats, indexed [i * n + j]) with a symmetric random
+// distance matrix: zeros on the diagonal and, for each unordered pair, one
+// value of the form 10 + k/10 with k drawn as rand() % 10000.
+// Reseeds the C library generator with `seed`, so equal seeds give equal output.
+void generate_random_tsp(float* dist, int n, unsigned seed) {
+    srand(seed);
+    for (int a = 0; a < n; ++a) {
+        dist[a * n + a] = 0.0f;
+        int b = a + 1;
+        while (b < n) {
+            const float w = 10.0f + (rand() % 10000) / 10.0f;
+            dist[a * n + b] = w;  // upper triangle
+            dist[b * n + a] = w;  // mirrored entry
+            ++b;
+        }
+    }
+}
+
+// Fills a random VRP instance for n customers.
+//   dist   - (n+1) * (n+1) floats, indexed [i * (n+1) + j]; symmetric, zero
+//            diagonal, off-diagonal values 10 + (rand() % 10000) / 10.
+//   demand - n floats, each 5 + (rand() % 20).
+// Reseeds the C library generator with `seed`; all distances are drawn before
+// any demand.
+void generate_random_vrp(float* dist, float* demand, int n, unsigned seed) {
+    srand(seed);
+
+    const int nodes = n + 1;
+    for (int a = 0; a < nodes; ++a) {
+        dist[a * nodes + a] = 0.0f;
+        int b = a + 1;
+        while (b < nodes) {
+            const float w = 10.0f + (rand() % 10000) / 10.0f;
+            dist[a * nodes + b] = w;  // upper triangle
+            dist[b * nodes + a] = w;  // mirrored entry
+            ++b;
+        }
+    }
+
+    for (int c = 0; c < n; ++c) {
+        demand[c] = 5.0f + (rand() % 20);
+    }
+}
+
+// Runs `num_runs` solves through `solve_once`, using seed 42 + run * 100 for
+// run 0, 1, ...; prints the first objective of every run and returns their
+// mean. `cfg` is taken by value so the caller's configuration is not modified.
+template <typename SolveFn>
+static float run_seeded_trials(SolverConfig cfg, int num_runs, SolveFn solve_once) {
+    float sum = 0;
+    for (int run = 0; run < num_runs; run++) {
+        cfg.seed = 42 + run * 100;
+        auto result = solve_once(cfg);
+        printf("%.1f ", result.best_solution.objectives[0]);
+        sum += result.best_solution.objectives[0];
+    }
+    return sum / num_runs;
+}
+
+// Benchmarks `prob` with solve() and, when at least two GPUs are present,
+// with solve_multi_gpu(); prints the per-run objectives, the averages and the
+// relative improvement of the multi-GPU average over the single-GPU one.
+template <typename Problem>
+static void run_single_vs_multi(Problem& prob, const SolverConfig& base_cfg,
+                                int num_gpus, int num_runs) {
+    printf("  单GPU: ");
+    const float avg_single = run_seeded_trials(
+        base_cfg, num_runs, [&](SolverConfig& c) { return solve(prob, c); });
+    printf("→ %.2f\n", avg_single);
+
+    if (num_gpus >= 2) {
+        printf("  %dGPU: ", num_gpus);
+        // Set num_gpus on a copy so the single-GPU baseline configuration of
+        // later experiments is not affected by this run.
+        SolverConfig multi_cfg = base_cfg;
+        multi_cfg.num_gpus = num_gpus;
+        const float avg_multi = run_seeded_trials(
+            multi_cfg, num_runs, [&](SolverConfig& c) { return solve_multi_gpu(prob, c); });
+        const float improvement = (avg_single - avg_multi) / avg_single * 100;
+        printf("→ %.2f (%.2f%%)\n", avg_multi, improvement);
+    }
+}
+
+// E12 driver: solves a random 2000-city TSP and a random 1000-customer,
+// 160-vehicle VRP, each 3 times on one GPU and (if available) on all GPUs.
+int main() {
+    printf("==============================================\n");
+    printf("E12: 极大规模多 GPU 实验\n");
+    printf("==============================================\n\n");
+
+    // Check the call: on failure num_gpus would otherwise be read uninitialized.
+    int num_gpus = 0;
+    CUDA_CHECK(cudaGetDeviceCount(&num_gpus));
+    printf("检测到 %d 个 GPU\n\n", num_gpus);
+
+    const int num_runs = 3;
+
+    // Solver settings shared by both experiments.
+    SolverConfig cfg;
+    cfg.pop_size = 0;
+    cfg.max_gen = 5000;
+    cfg.verbose = false;
+    cfg.num_islands = 16;
+    cfg.use_aos = true;
+    cfg.sa_temp_init = 50.0f;
+    cfg.use_cuda_graph = true;
+
+    // ========== TSP n=2000 ==========
+    {
+        printf("[TSP n=2000]\n");
+        printf("  工作集: 2000×2000×4 = 16 MB\n");
+        printf("  预估种群: ~16 (L2=6MB)\n\n");
+
+        const int n_tsp = 2000;
+        // The vector owns the host matrix; it outlives prob_tsp, which keeps
+        // a raw pointer into it.
+        std::vector<float> h_dist_tsp((size_t)n_tsp * n_tsp);
+        printf("  生成数据...\n");
+        generate_random_tsp(h_dist_tsp.data(), n_tsp, 12345);
+
+        printf("  创建 Problem...\n");
+        auto prob_tsp = ExtremeTSPProblem::create(h_dist_tsp.data(), n_tsp);
+
+        run_single_vs_multi(prob_tsp, cfg, num_gpus, num_runs);
+
+        prob_tsp.destroy();
+    }
+
+    printf("\n");
+
+    // ========== VRP n=1000, 160 vehicles ==========
+    {
+        printf("[VRP n=1000, vehicles=160]\n");
+        printf("  配置: D1=160, D2=128, Solution=80KB\n");
+        printf("  需求: 5-24 (平均14.5), 容量: 100\n");
+        printf("  理论需要车辆: 146, 实际: 160 (留14辆余量)\n");
+        printf("  工作集: 1001×1001×4 = 4 MB\n\n");
+
+        const int n_vrp = 1000;
+        const int num_veh = 160;
+        std::vector<float> h_dist_vrp((size_t)(n_vrp + 1) * (n_vrp + 1));
+        std::vector<float> h_demand_vrp((size_t)n_vrp);
+
+        printf("  生成数据...\n");
+        generate_random_vrp(h_dist_vrp.data(), h_demand_vrp.data(), n_vrp, 12345);
+
+        printf("  创建 Problem...\n");
+        auto prob_vrp = ExtremeVRPProblem::create(h_dist_vrp.data(), h_demand_vrp.data(),
+                                                  n_vrp, 100.0f, num_veh, num_veh);
+
+        cfg.max_gen = 5000;
+
+        run_single_vs_multi(prob_vrp, cfg, num_gpus, num_runs);
+
+        prob_vrp.destroy();
+    }
+
+    printf("\n==============================================\n");
+    printf("E12 极大规模实验完成\n");
+    printf("==============================================\n");
+
+    return 0;
+}