Initial commit: cuGenOpt GPU optimization solver

This commit is contained in:
L-yang-yang 2026-03-20 00:33:45 +08:00
commit fc5a0ff4af
117 changed files with 25545 additions and 0 deletions

View file

@ -0,0 +1,81 @@
# E10: 大规模问题实验
## 实验目的
验证 cuGenOpt 在大规模问题(n>100)上的性能表现,以及多 GPU 简化版的实际收益。
## 实验设计
### 测试规模
**TSP**:
- n = 100, 200, 300, 400, 500
**VRP**:
- n = 50, 100, 150, 200
- 车辆数动态调整:n/20 + 1
- 容量固定为 150
### 对比维度
1. **单 GPU vs 多 GPU**(简化版)
2. **不同规模下的性能表现**
3. **多 GPU 的收益曲线**
### 配置参数
```cpp
SolverConfig cfg;
cfg.pop_size = 0; // 自适应L2 cache感知
cfg.max_gen = 10000;
cfg.num_islands = 16;
cfg.use_aos = true;
cfg.sa_temp_init = 50.0f;
cfg.use_cuda_graph = true;
```
### 运行次数
每个配置运行 5 次,取平均值。
## 文件说明
- `large_tsp_problem.cuh`: 支持最多 512 个城市的 TSP 问题定义
- `large_vrp_problem.cuh`: 支持最多 256 个客户、16 辆车的 VRP 问题定义
- `gpu.cu`: 主实验代码
## 编译和运行
```bash
# 在远程服务器上
cd ~/cugenopt_e10
# 编译
nvcc -arch=sm_70 -O2 -std=c++17 --extended-lambda \
-I ../../../prototype/core \
-I ../../../prototype/problems \
-I . \
-o e10_test gpu.cu
# 运行
./e10_test > e10_output.txt 2>&1
```
## 预期结果
1. **单 GPU 性能**
- 小规模(n≤100):gap < 5%
- 中规模(n=200-300):gap < 10%
- 大规模(n≥400):gap 可能较高,但仍能找到可行解
2. **多 GPU 收益**
- 预期在大规模问题上收益更明显(2-5%)
- 验证"简化版"在实际场景中的价值
3. **可扩展性**
- 观察 gens/s 随规模的变化
- 识别性能瓶颈(shared memory、L2 cache)
## 实验日期
2026-03-05

View file

@ -0,0 +1,185 @@
#include "solver.cuh"
#include "multi_gpu_solver.cuh"
#include "large_tsp_problem.cuh"
#include "large_vrp_problem.cuh"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
// 生成随机TSP实例
// Fill `dist` (row-major n x n) with a random symmetric TSP instance:
// zero diagonal, off-diagonal distances uniformly drawn from
// {10.0, 10.1, ..., 1009.9}. Deterministic for a given seed (uses srand/rand,
// so the exact values are platform-dependent but reproducible per platform).
void generate_random_tsp(float* dist, int n, unsigned seed) {
    srand(seed);
    for (int row = 0; row < n; ++row) {
        dist[row * n + row] = 0.0f;
        // Only the upper triangle is drawn; it is mirrored to keep symmetry.
        for (int col = row + 1; col < n; ++col) {
            const float edge = 10.0f + (rand() % 10000) / 10.0f;
            dist[row * n + col] = edge;
            dist[col * n + row] = edge;
        }
    }
}
// 生成随机VRP实例
// Fill a random VRP instance: `dist` is a symmetric (n+1) x (n+1) matrix
// whose index 0 is the depot (zero diagonal, off-diagonal distances in
// {10.0, ..., 1009.9}); `demand` holds n per-customer demands in
// {5, 6, ..., 24}. Deterministic for a given seed (srand/rand based).
void generate_random_vrp(float* dist, float* demand, int n, unsigned seed) {
    srand(seed);
    const int nodes = n + 1; // customers plus the depot at index 0
    // Distance matrix: draw the upper triangle and mirror it.
    for (int a = 0; a < nodes; ++a) {
        dist[a * nodes + a] = 0.0f;
        for (int b = a + 1; b < nodes; ++b) {
            const float edge = 10.0f + (rand() % 10000) / 10.0f;
            dist[a * nodes + b] = edge;
            dist[b * nodes + a] = edge;
        }
    }
    // Per-customer demands.
    for (int c = 0; c < n; ++c) {
        demand[c] = 5.0f + (rand() % 20);
    }
}
// --- Experiment helpers --------------------------------------------------

// Builds the solver configuration shared by every experiment run.
static SolverConfig make_common_config() {
    SolverConfig cfg;
    cfg.pop_size = 0; // 0 = adaptive (L2-cache-aware) population sizing
    cfg.max_gen = 10000;
    cfg.verbose = false;
    cfg.num_islands = 16;
    cfg.use_aos = true;
    cfg.sa_temp_init = 50.0f;
    cfg.use_cuda_graph = true;
    return cfg;
}

// Runs `num_runs` independent single-GPU solves with distinct, reproducible
// seeds, printing each run's best objective, and returns their average.
template <typename Problem>
static float average_single_gpu(Problem& prob, SolverConfig& cfg, int num_runs) {
    float sum = 0.0f;
    for (int run = 0; run < num_runs; run++) {
        cfg.seed = 42 + run * 100; // distinct seed per run, stable across configs
        auto result = solve(prob, cfg);
        float best = result.best_solution.objectives[0];
        printf("%.1f ", best);
        sum += best;
    }
    return sum / num_runs;
}

// Multi-GPU counterpart of average_single_gpu. Uses the same per-run seeds
// so single- and multi-GPU averages are directly comparable.
// cfg.num_gpus must be set by the caller before invoking this.
template <typename Problem>
static float average_multi_gpu(Problem& prob, SolverConfig& cfg, int num_runs) {
    float sum = 0.0f;
    for (int run = 0; run < num_runs; run++) {
        cfg.seed = 42 + run * 100; // same seed schedule as the single-GPU runs
        auto result = solve_multi_gpu(prob, cfg);
        float best = result.best_solution.objectives[0];
        printf("%.1f ", best);
        sum += best;
    }
    return sum / num_runs;
}

// E10 driver: large-scale TSP (n up to 500) and VRP (n up to 200) runs,
// single-GPU baseline vs. simplified multi-GPU, 5 runs each, averaged.
int main() {
    printf("==============================================\n");
    printf("E10: 大规模问题实验 (TSP & VRP)\n");
    printf("==============================================\n\n");
    // Detect available GPUs. Bug fix: num_gpus was previously left
    // uninitialized when cudaGetDeviceCount failed; check the call and
    // exit explicitly instead of reading garbage.
    int num_gpus = 0;
    cudaError_t err = cudaGetDeviceCount(&num_gpus);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("检测到 %d 个 GPU\n\n", num_gpus);
    const int num_runs = 5; // every reported average is over this many runs
    // ========== Experiment 1: large-scale TSP ==========
    printf("实验 1: TSP 大规模测试\n");
    printf("----------------------------------------------\n");
    std::vector<int> tsp_sizes = {100, 200, 300, 400, 500};
    for (int n : tsp_sizes) {
        printf("\n[TSP n=%d]\n", n);
        // Generate a reproducible random instance.
        float* h_dist = new float[(size_t)n * n];
        generate_random_tsp(h_dist, n, 12345);
        auto prob = LargeTSPProblem::create(h_dist, n);
        SolverConfig cfg = make_common_config();
        // Single-GPU baseline.
        printf(" 单GPU (5 runs): ");
        float avg_single = average_single_gpu(prob, cfg, num_runs);
        printf(" → 平均: %.2f\n", avg_single);
        // Multi-GPU comparison (only meaningful with at least two GPUs).
        if (num_gpus >= 2) {
            printf(" 多GPU (%d GPUs, 5 runs): ", num_gpus);
            cfg.num_gpus = num_gpus;
            float avg_multi = average_multi_gpu(prob, cfg, num_runs);
            float improvement = (avg_single - avg_multi) / avg_single * 100;
            printf(" → 平均: %.2f (%.2f%%)\n", avg_multi, improvement);
        }
        prob.destroy();
        delete[] h_dist;
    }
    // ========== Experiment 2: large-scale VRP ==========
    printf("\n\n实验 2: VRP 大规模测试\n");
    printf("----------------------------------------------\n");
    std::vector<int> vrp_sizes = {50, 100, 150, 200};
    for (int n : vrp_sizes) {
        printf("\n[VRP n=%d]\n", n);
        // Generate a reproducible random instance; the matrix includes the
        // depot row/column, hence (n+1) x (n+1).
        float* h_dist = new float[(size_t)(n + 1) * (n + 1)];
        float* h_demand = new float[n];
        generate_random_vrp(h_dist, h_demand, n, 23456);
        int num_vehicles = (n / 20) + 1; // vehicle count scales with problem size
        float capacity = 150.0f;
        auto prob = LargeVRPProblem::create(h_dist, h_demand, n, capacity, num_vehicles, num_vehicles + 4);
        SolverConfig cfg = make_common_config();
        // Single-GPU baseline.
        printf(" 单GPU (5 runs): ");
        float avg_single = average_single_gpu(prob, cfg, num_runs);
        printf(" → 平均: %.2f\n", avg_single);
        // Multi-GPU comparison (only meaningful with at least two GPUs).
        if (num_gpus >= 2) {
            printf(" 多GPU (%d GPUs, 5 runs): ", num_gpus);
            cfg.num_gpus = num_gpus;
            float avg_multi = average_multi_gpu(prob, cfg, num_runs);
            float improvement = (avg_single - avg_multi) / avg_single * 100;
            printf(" → 平均: %.2f (%.2f%%)\n", avg_multi, improvement);
        }
        prob.destroy();
        delete[] h_dist;
        delete[] h_demand;
    }
    printf("\n==============================================\n");
    printf("实验完成!\n");
    printf("==============================================\n");
    return 0;
}

View file

@ -0,0 +1,87 @@
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include "operators.cuh"
// 支持大规模 TSP最多 512 个城市)
// Large-scale TSP problem (up to 512 cities) for the cuGenOpt solver.
//
// Encoding: one permutation row (dim1 = 1) of length n; the tour is closed
// by adding the edge from the last city back to the first.
// Ownership: d_dist (device matrix) is owned and released by destroy();
// h_dist is a non-owning view of the caller's host matrix and must outlive
// this object (clone_to_device re-reads it).
struct LargeTSPProblem : ProblemBase<LargeTSPProblem, 1, 512> {
    // Bug fix: members were previously uninitialized, so destroy() on a
    // default-constructed instance read an indeterminate pointer (UB).
    const float* d_dist = nullptr;  // device n*n distance matrix (owned)
    const float* h_dist = nullptr;  // host matrix (not owned)
    int n = 0;                      // number of cities

    // Single minimization objective: total tour length.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}
    };

    // Total length of the tour stored in permutation row 0, including the
    // closing edge back to the starting city.
    __device__ float compute_obj(int obj_idx, const Sol& s) const {
        float total = 0;
        for (int i = 0; i < n - 1; i++) {
            int from = s.data[0][i];
            int to = s.data[0][i + 1];
            total += d_dist[from * n + to];
        }
        // Close the cycle: last city back to the first.
        total += d_dist[s.data[0][n - 1] * n + s.data[0][0]];
        return total;
    }

    // Plain TSP is unconstrained: every permutation is feasible.
    __device__ float compute_penalty(const Sol& s) const {
        return 0.0f;
    }

    // Describes the encoding to the framework: one permutation row of length n.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // Read-only working set (the distance matrix); consumed by the solver's
    // L2-cache-aware sizing.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }

    // Uploads the caller's n*n host distance matrix to the current device.
    // The caller keeps ownership of h_dist_matrix and must keep it alive.
    static LargeTSPProblem create(const float* h_dist_matrix, int num_cities) {
        LargeTSPProblem prob;
        prob.n = num_cities;
        prob.h_dist = h_dist_matrix;
        size_t dist_size = (size_t)num_cities * num_cities * sizeof(float);
        CUDA_CHECK(cudaMalloc(&prob.d_dist, dist_size));
        CUDA_CHECK(cudaMemcpy((void*)prob.d_dist, h_dist_matrix, dist_size, cudaMemcpyHostToDevice));
        return prob;
    }

    // Frees the device matrix. Idempotent: safe to call more than once.
    void destroy() {
        if (d_dist) {
            cudaFree((void*)d_dist);
            d_dist = nullptr;
        }
    }

    // Multi-GPU support: returns a heap-allocated copy of this problem whose
    // distance matrix lives on `target_gpu` (re-uploaded from h_dist).
    // Temporarily switches the current device and restores it before
    // returning. Caller owns the result: call destroy() on it, then delete it.
    LargeTSPProblem* clone_to_device(int target_gpu) const {
        int orig_device;
        CUDA_CHECK(cudaGetDevice(&orig_device));
        CUDA_CHECK(cudaSetDevice(target_gpu));
        // Allocate and copy the distance matrix on the target GPU.
        float* dd;
        size_t dist_size = (size_t)n * n * sizeof(float);
        CUDA_CHECK(cudaMalloc(&dd, dist_size));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, dist_size, cudaMemcpyHostToDevice));
        // Restore the original device.
        CUDA_CHECK(cudaSetDevice(orig_device));
        // Build the host-side problem object pointing at the new device buffer.
        LargeTSPProblem* new_prob = new LargeTSPProblem();
        new_prob->n = n;
        new_prob->h_dist = h_dist;
        new_prob->d_dist = dd;
        return new_prob;
    }
};

View file

@ -0,0 +1,138 @@
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include "operators.cuh"
// 支持大规模 VRP最多 256 个客户16 辆车)
// Large-scale capacitated VRP (up to 256 customers, 16 vehicles) for the
// cuGenOpt solver.
//
// Encoding: one permutation row per vehicle (RowMode::Partition); the
// framework distributes the n customers across rows. Customer indices in a
// solution are 0-based, while the (n+1)x(n+1) distance matrix places the
// depot at node 0, so customer i maps to matrix node i+1.
// Ownership: d_dist/d_demand (device buffers) are owned and released by
// destroy(); h_dist/h_demand are non-owning views of the caller's host data
// and must outlive this object (clone_to_device re-reads them).
struct LargeVRPProblem : ProblemBase<LargeVRPProblem, 16, 256> {
    // Bug fix: members were previously uninitialized, so destroy() on a
    // default-constructed instance read indeterminate pointers (UB).
    const float* d_dist = nullptr;    // device (n+1)x(n+1) distance matrix (owned)
    const float* d_demand = nullptr;  // device per-customer demand, length n (owned)
    const float* h_dist = nullptr;    // host matrix (not owned)
    const float* h_demand = nullptr;  // host demands (not owned)
    int n = 0;                        // number of customers (depot excluded)
    float capacity = 0.0f;            // per-vehicle capacity limit
    int num_vehicles = 0;             // vehicle rows evaluated by obj/penalty
    int max_vehicles = 0;             // stored but not read here; presumably a framework bound — TODO confirm

    // Single minimization objective: total distance over all routes.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}
    };

    // Sum of route lengths: depot -> first customer -> ... -> last customer
    // -> depot, per vehicle. Empty routes contribute nothing.
    __device__ float compute_obj(int obj_idx, const Sol& s) const {
        float total = 0;
        for (int v = 0; v < num_vehicles; v++) {
            int route_len = s.dim2_sizes[v];
            if (route_len == 0) continue;
            // Depot to first customer (customer index +1: node 0 is the depot).
            int first_node = s.data[v][0] + 1;
            total += d_dist[0 * (n+1) + first_node];
            // Interior of the route.
            int prev = first_node;
            for (int i = 1; i < route_len; i++) {
                int node = s.data[v][i] + 1;
                total += d_dist[prev * (n+1) + node];
                prev = node;
            }
            // Last customer back to the depot.
            total += d_dist[prev * (n+1) + 0];
        }
        return total;
    }

    // Linear penalty (weight 100) on each vehicle's load above capacity.
    // d_demand is indexed by the 0-based customer id, no depot offset.
    __device__ float compute_penalty(const Sol& s) const {
        float penalty = 0;
        for (int v = 0; v < num_vehicles; v++) {
            float load = 0;
            for (int i = 0; i < s.dim2_sizes[v]; i++) {
                load += d_demand[s.data[v][i]];
            }
            if (load > capacity) {
                penalty += (load - capacity) * 100.0f;
            }
        }
        return penalty;
    }

    // Describes the encoding: num_vehicles permutation rows in Partition
    // mode, with the framework assigning the n customers to rows.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = num_vehicles;
        cfg.dim2_default = 0; // Partition mode: row lengths set by the framework
        fill_obj_config(cfg);
        cfg.cross_row_prob = 0.3f;
        cfg.row_mode = RowMode::Partition;
        cfg.total_elements = n; // n customers distributed across the vehicle rows
        return cfg;
    }

    // Read-only working set (distance matrix + demand vector); consumed by
    // the solver's L2-cache-aware sizing.
    size_t working_set_bytes() const {
        return (size_t)(n + 1) * (n + 1) * sizeof(float) + (size_t)n * sizeof(float);
    }

    // Uploads the caller's host distance matrix ((n+1)^2 floats) and demand
    // array (n floats) to the current device. The caller keeps ownership of
    // both host arrays and must keep them alive.
    static LargeVRPProblem create(const float* h_dist_matrix, const float* h_demand_array,
                                  int num_customers, float vehicle_capacity,
                                  int num_veh, int max_veh) {
        LargeVRPProblem prob;
        prob.n = num_customers;
        prob.capacity = vehicle_capacity;
        prob.num_vehicles = num_veh;
        prob.max_vehicles = max_veh;
        prob.h_dist = h_dist_matrix;
        prob.h_demand = h_demand_array;
        size_t dist_size = (size_t)(num_customers + 1) * (num_customers + 1) * sizeof(float);
        size_t demand_size = (size_t)num_customers * sizeof(float);
        CUDA_CHECK(cudaMalloc(&prob.d_dist, dist_size));
        CUDA_CHECK(cudaMalloc(&prob.d_demand, demand_size));
        CUDA_CHECK(cudaMemcpy((void*)prob.d_dist, h_dist_matrix, dist_size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy((void*)prob.d_demand, h_demand_array, demand_size, cudaMemcpyHostToDevice));
        return prob;
    }

    // Frees both device buffers. Idempotent: safe to call more than once.
    void destroy() {
        if (d_dist) cudaFree((void*)d_dist);
        if (d_demand) cudaFree((void*)d_demand);
        d_dist = nullptr;
        d_demand = nullptr;
    }

    // Multi-GPU support: returns a heap-allocated copy of this problem whose
    // device buffers live on `target_gpu` (re-uploaded from the host arrays).
    // Temporarily switches the current device and restores it before
    // returning. Caller owns the result: call destroy() on it, then delete it.
    LargeVRPProblem* clone_to_device(int target_gpu) const {
        int orig_device;
        CUDA_CHECK(cudaGetDevice(&orig_device));
        CUDA_CHECK(cudaSetDevice(target_gpu));
        // Allocate and copy both buffers on the target GPU.
        float* dd;
        float* ddem;
        size_t dist_size = (size_t)(n + 1) * (n + 1) * sizeof(float);
        size_t demand_size = (size_t)n * sizeof(float);
        CUDA_CHECK(cudaMalloc(&dd, dist_size));
        CUDA_CHECK(cudaMalloc(&ddem, demand_size));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, dist_size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(ddem, h_demand, demand_size, cudaMemcpyHostToDevice));
        // Restore the original device.
        CUDA_CHECK(cudaSetDevice(orig_device));
        // Build the host-side problem object pointing at the new device buffers.
        LargeVRPProblem* new_prob = new LargeVRPProblem();
        new_prob->n = n;
        new_prob->capacity = capacity;
        new_prob->num_vehicles = num_vehicles;
        new_prob->max_vehicles = max_vehicles;
        new_prob->h_dist = h_dist;
        new_prob->h_demand = h_demand;
        new_prob->d_dist = dd;
        new_prob->d_demand = ddem;
        return new_prob;
    }
};