mirror of
https://github.com/L-yang-yang/cugenopt.git
synced 2026-04-30 13:06:21 +02:00
Initial commit: cuGenOpt GPU optimization solver
This commit is contained in:
commit
fc5a0ff4af
117 changed files with 25545 additions and 0 deletions
81
benchmark/experiments/e10_large_scale/README.md
Normal file
81
benchmark/experiments/e10_large_scale/README.md
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
# E10: 大规模问题实验
|
||||
|
||||
## 实验目的
|
||||
|
||||
验证 cuGenOpt 在大规模问题(n>100)上的性能表现,以及多 GPU 简化版的实际收益。
|
||||
|
||||
## 实验设计
|
||||
|
||||
### 测试规模
|
||||
|
||||
**TSP**:
|
||||
- n = 100, 200, 300, 400, 500
|
||||
|
||||
**VRP**:
|
||||
- n = 50, 100, 150, 200
|
||||
- 车辆数动态调整(n/20 + 1)
|
||||
- 容量固定为 150
|
||||
|
||||
### 对比维度
|
||||
|
||||
1. **单 GPU vs 多 GPU**(简化版)
|
||||
2. **不同规模下的性能表现**
|
||||
3. **多 GPU 的收益曲线**
|
||||
|
||||
### 配置参数
|
||||
|
||||
```cpp
|
||||
SolverConfig cfg;
|
||||
cfg.pop_size = 0; // 自适应(L2 cache感知)
|
||||
cfg.max_gen = 10000;
|
||||
cfg.num_islands = 16;
|
||||
cfg.use_aos = true;
|
||||
cfg.sa_temp_init = 50.0f;
|
||||
cfg.use_cuda_graph = true;
|
||||
```
|
||||
|
||||
### 运行次数
|
||||
|
||||
每个配置运行 5 次,取平均值。
|
||||
|
||||
## 文件说明
|
||||
|
||||
- `large_tsp_problem.cuh`: 支持最多 512 个城市的 TSP 问题定义
|
||||
- `large_vrp_problem.cuh`: 支持最多 256 个客户、16 辆车的 VRP 问题定义
|
||||
- `gpu.cu`: 主实验代码
|
||||
|
||||
## 编译和运行
|
||||
|
||||
```bash
|
||||
# 在远程服务器上
|
||||
cd ~/cugenopt_e10
|
||||
|
||||
# 编译
|
||||
nvcc -arch=sm_70 -O2 -std=c++17 --extended-lambda \
|
||||
-I ../../../prototype/core \
|
||||
-I ../../../prototype/problems \
|
||||
-I . \
|
||||
-o e10_test gpu.cu
|
||||
|
||||
# 运行
|
||||
./e10_test > e10_output.txt 2>&1
|
||||
```
|
||||
|
||||
## 预期结果
|
||||
|
||||
1. **单 GPU 性能**:
|
||||
- 小规模(n≤100):gap < 5%
|
||||
- 中规模(n=200-300):gap < 10%
|
||||
- 大规模(n≥400):gap 可能较高,但仍能找到可行解
|
||||
|
||||
2. **多 GPU 收益**:
|
||||
- 预期在大规模问题上收益更明显(2-5%)
|
||||
- 验证"简化版"在实际场景中的价值
|
||||
|
||||
3. **可扩展性**:
|
||||
- 观察 gens/s 随规模的变化
|
||||
- 识别性能瓶颈(shared memory, L2 cache)
|
||||
|
||||
## 实验日期
|
||||
|
||||
2026-03-05
|
||||
185
benchmark/experiments/e10_large_scale/gpu.cu
Normal file
185
benchmark/experiments/e10_large_scale/gpu.cu
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
#include "solver.cuh"
|
||||
#include "multi_gpu_solver.cuh"
|
||||
#include "large_tsp_problem.cuh"
|
||||
#include "large_vrp_problem.cuh"
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
// Build a random symmetric TSP instance.
//
// Fills the n x n row-major matrix `dist` with symmetric distances in
// [10.0, 1009.9] and zeros on the diagonal. Reseeds the global C RNG with
// `seed` so the same seed reproduces the same instance (side effect: the
// caller's rand() sequence is reset).
void generate_random_tsp(float* dist, int n, unsigned seed) {
    srand(seed);
    for (int row = 0; row < n; row++) {
        dist[row * n + row] = 0.0f;
        for (int col = row + 1; col < n; col++) {
            // One rand() draw per upper-triangle entry, mirrored below.
            float value = 10.0f + (rand() % 10000) / 10.0f;
            dist[row * n + col] = value;
            dist[col * n + row] = value;
        }
    }
}
|
||||
|
||||
// Build a random VRP instance.
//
// Fills a symmetric (n+1) x (n+1) row-major distance matrix `dist` (node 0
// is the depot, customers are nodes 1..n) and n per-customer demands in
// [5, 24]. Reseeds the global C RNG with `seed` (side effect: the caller's
// rand() sequence is reset).
void generate_random_vrp(float* dist, float* demand, int n, unsigned seed) {
    srand(seed);
    const int nodes = n + 1;  // depot + customers
    // Symmetric distance matrix, zero diagonal.
    for (int row = 0; row < nodes; row++) {
        dist[row * nodes + row] = 0.0f;
        for (int col = row + 1; col < nodes; col++) {
            float value = 10.0f + (rand() % 10000) / 10.0f;
            dist[row * nodes + col] = value;
            dist[col * nodes + row] = value;
        }
    }
    // Per-customer demands (drawn after all distances, preserving the
    // rand() consumption order).
    for (int c = 0; c < n; c++) {
        demand[c] = 5.0f + (rand() % 20);
    }
}
|
||||
|
||||
// Arithmetic mean of a result vector; returns 0 for an empty vector so a
// zero-run configuration cannot divide by zero.
static float mean_of(const std::vector<float>& values) {
    if (values.empty()) return 0.0f;
    float sum = 0.0f;
    for (float v : values) sum += v;
    return sum / (float)values.size();
}

// Runs `num_runs` independent solves (seeds 42, 142, 242, ...) with the
// supplied solve callable, prints each best objective value, and returns
// the mean. `cfg` is taken by value so per-run seed changes do not leak
// back to the caller.
template <typename Problem, typename SolveFn>
static float run_trials(Problem& prob, SolverConfig cfg, SolveFn&& do_solve, int num_runs) {
    std::vector<float> results;
    results.reserve((size_t)num_runs);
    for (int run = 0; run < num_runs; run++) {
        cfg.seed = 42 + run * 100;
        auto result = do_solve(prob, cfg);
        float best = result.best_solution.objectives[0];
        results.push_back(best);
        printf("%.1f ", best);
    }
    return mean_of(results);
}

// Entry point: benchmarks cuGenOpt on large random TSP (n=100..500) and
// VRP (n=50..200) instances, comparing single-GPU vs multi-GPU solves
// (the multi-GPU comparison runs only when >= 2 GPUs are detected).
int main() {
    printf("==============================================\n");
    printf("E10: 大规模问题实验 (TSP & VRP)\n");
    printf("==============================================\n\n");

    // Detect available GPUs. num_gpus is pre-initialized to 0 and the API
    // result is checked: previously a failed query left num_gpus
    // uninitialized, which could spuriously enable the multi-GPU path.
    int num_gpus = 0;
    if (cudaGetDeviceCount(&num_gpus) != cudaSuccess) {
        num_gpus = 0;  // query failed: report 0 GPUs, skip multi-GPU tests
    }
    printf("检测到 %d 个 GPU\n\n", num_gpus);

    const int num_runs = 5;

    // ========== TSP large-scale benchmark ==========
    printf("实验 1: TSP 大规模测试\n");
    printf("----------------------------------------------\n");

    std::vector<int> tsp_sizes = {100, 200, 300, 400, 500};

    for (int n : tsp_sizes) {
        printf("\n[TSP n=%d]\n", n);

        // Build a random instance and upload it to the device.
        float* h_dist = new float[n * n];
        generate_random_tsp(h_dist, n, 12345);
        auto prob = LargeTSPProblem::create(h_dist, n);

        // Solver configuration (matches the README for this experiment).
        SolverConfig cfg;
        cfg.pop_size = 0;  // adaptive (L2-cache aware)
        cfg.max_gen = 10000;
        cfg.verbose = false;
        cfg.num_islands = 16;
        cfg.use_aos = true;
        cfg.sa_temp_init = 50.0f;
        cfg.use_cuda_graph = true;

        // Single-GPU baseline.
        printf(" 单GPU (5 runs): ");
        float avg_single = run_trials(
            prob, cfg, [](auto& p, auto& c) { return solve(p, c); }, num_runs);
        printf(" → 平均: %.2f\n", avg_single);

        // Multi-GPU comparison (when available).
        if (num_gpus >= 2) {
            printf(" 多GPU (%d GPUs, 5 runs): ", num_gpus);
            cfg.num_gpus = num_gpus;
            float avg_multi = run_trials(
                prob, cfg, [](auto& p, auto& c) { return solve_multi_gpu(p, c); }, num_runs);
            // Positive value = multi-GPU found shorter tours on average.
            float improvement = (avg_single - avg_multi) / avg_single * 100;
            printf(" → 平均: %.2f (%.2f%%)\n", avg_multi, improvement);
        }

        prob.destroy();
        delete[] h_dist;
    }

    // ========== VRP large-scale benchmark ==========
    printf("\n\n实验 2: VRP 大规模测试\n");
    printf("----------------------------------------------\n");

    std::vector<int> vrp_sizes = {50, 100, 150, 200};

    for (int n : vrp_sizes) {
        printf("\n[VRP n=%d]\n", n);

        // Build a random instance (distance matrix includes the depot).
        float* h_dist = new float[(n + 1) * (n + 1)];
        float* h_demand = new float[n];
        generate_random_vrp(h_dist, h_demand, n, 23456);

        int num_vehicles = (n / 20) + 1;  // scale fleet size with n
        float capacity = 150.0f;
        auto prob = LargeVRPProblem::create(h_dist, h_demand, n, capacity, num_vehicles, num_vehicles + 4);

        // Solver configuration (matches the README for this experiment).
        SolverConfig cfg;
        cfg.pop_size = 0;  // adaptive
        cfg.max_gen = 10000;
        cfg.verbose = false;
        cfg.num_islands = 16;
        cfg.use_aos = true;
        cfg.sa_temp_init = 50.0f;
        cfg.use_cuda_graph = true;

        // Single-GPU baseline.
        printf(" 单GPU (5 runs): ");
        float avg_single = run_trials(
            prob, cfg, [](auto& p, auto& c) { return solve(p, c); }, num_runs);
        printf(" → 平均: %.2f\n", avg_single);

        // Multi-GPU comparison (when available).
        if (num_gpus >= 2) {
            printf(" 多GPU (%d GPUs, 5 runs): ", num_gpus);
            cfg.num_gpus = num_gpus;
            float avg_multi = run_trials(
                prob, cfg, [](auto& p, auto& c) { return solve_multi_gpu(p, c); }, num_runs);
            float improvement = (avg_single - avg_multi) / avg_single * 100;
            printf(" → 平均: %.2f (%.2f%%)\n", avg_multi, improvement);
        }

        prob.destroy();
        delete[] h_dist;
        delete[] h_demand;
    }

    printf("\n==============================================\n");
    printf("实验完成!\n");
    printf("==============================================\n");

    return 0;
}
|
||||
87
benchmark/experiments/e10_large_scale/large_tsp_problem.cuh
Normal file
87
benchmark/experiments/e10_large_scale/large_tsp_problem.cuh
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
// Large-scale TSP problem (up to 512 cities).
//
// The distance matrix lives in device global memory (d_dist). The host
// pointer (h_dist) is caller-owned and retained only so clone_to_device()
// can re-upload the matrix to another GPU — the caller must keep that
// buffer alive for the problem's lifetime (NOTE(review): confirm against
// the framework's ownership conventions).
struct LargeTSPProblem : ProblemBase<LargeTSPProblem, 1, 512> {
    // Members are default-initialized so destroy() on a default-constructed
    // instance never reads an indeterminate pointer.
    const float* d_dist = nullptr;  // n*n row-major distances on the device
    const float* h_dist = nullptr;  // caller-owned host copy (never freed here)
    int n = 0;                      // number of cities

    // Single objective: minimize total tour length.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}
    };

    // Tour length of the permutation in s.data[0][0..n-1], including the
    // closing edge back to the first city. obj_idx is unused (single
    // objective).
    __device__ float compute_obj(int obj_idx, const Sol& s) const {
        float total = 0;
        for (int i = 0; i < n - 1; i++) {
            int from = s.data[0][i];
            int to = s.data[0][i + 1];
            total += d_dist[from * n + to];
        }
        total += d_dist[s.data[0][n - 1] * n + s.data[0][0]];
        return total;
    }

    // Pure permutation encoding: every tour is feasible, no penalty.
    __device__ float compute_penalty(const Sol& s) const {
        return 0.0f;
    }

    // One permutation row of length n.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // Optional override: size of the hot read-only data (the distance
    // matrix) so the solver can make L2-cache-aware sizing decisions.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }

    // Uploads the caller's n x n distance matrix to the current device.
    // The caller keeps ownership of h_dist_matrix; call destroy() to free
    // the device copy.
    static LargeTSPProblem create(const float* h_dist_matrix, int num_cities) {
        LargeTSPProblem prob;
        prob.n = num_cities;
        prob.h_dist = h_dist_matrix;

        size_t dist_size = (size_t)num_cities * num_cities * sizeof(float);
        CUDA_CHECK(cudaMalloc(&prob.d_dist, dist_size));
        CUDA_CHECK(cudaMemcpy((void*)prob.d_dist, h_dist_matrix, dist_size, cudaMemcpyHostToDevice));

        return prob;
    }

    // Frees the device distance matrix. Safe to call more than once
    // (the pointer is nulled after the free).
    void destroy() {
        if (d_dist) {
            cudaFree((void*)d_dist);
            d_dist = nullptr;
        }
    }

    // Multi-GPU support: builds an independent copy of this problem whose
    // distance matrix resides on target_gpu (re-uploaded from h_dist).
    // The current device is restored before returning. The returned object
    // is heap-allocated; the caller must destroy() and delete it.
    LargeTSPProblem* clone_to_device(int target_gpu) const {
        int orig_device;
        CUDA_CHECK(cudaGetDevice(&orig_device));
        CUDA_CHECK(cudaSetDevice(target_gpu));

        // Allocate and copy the distance matrix on the target GPU.
        float* dd;
        size_t dist_size = (size_t)n * n * sizeof(float);
        CUDA_CHECK(cudaMalloc(&dd, dist_size));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, dist_size, cudaMemcpyHostToDevice));

        // Restore the original device.
        CUDA_CHECK(cudaSetDevice(orig_device));

        // Build the new host-side problem instance.
        LargeTSPProblem* new_prob = new LargeTSPProblem();
        new_prob->n = n;
        new_prob->h_dist = h_dist;
        new_prob->d_dist = dd;

        return new_prob;
    }
};
|
||||
138
benchmark/experiments/e10_large_scale/large_vrp_problem.cuh
Normal file
138
benchmark/experiments/e10_large_scale/large_vrp_problem.cuh
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
// Large-scale capacitated VRP problem (up to 256 customers, 16 vehicles).
//
// Node 0 of the (n+1) x (n+1) distance matrix is the depot; customers are
// stored 0-based in solutions and shifted by +1 when indexing distances.
// Host pointers are caller-owned and retained only so clone_to_device()
// can re-upload the data to another GPU — the caller must keep those
// buffers alive for the problem's lifetime (NOTE(review): confirm against
// the framework's ownership conventions).
struct LargeVRPProblem : ProblemBase<LargeVRPProblem, 16, 256> {
    // Members are default-initialized so destroy() on a default-constructed
    // instance never reads an indeterminate pointer.
    const float* d_dist = nullptr;    // (n+1)*(n+1) row-major distances on device
    const float* d_demand = nullptr;  // n per-customer demands on device
    const float* h_dist = nullptr;    // caller-owned host copies (never freed here)
    const float* h_demand = nullptr;
    int n = 0;                        // number of customers (excluding the depot)
    float capacity = 0.0f;            // per-vehicle capacity
    int num_vehicles = 0;             // vehicles routed in compute_obj/penalty
    int max_vehicles = 0;             // upper bound passed at creation time
                                      // (stored but unused in this header)

    // Single objective: minimize total route distance.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}
    };

    // Sum over vehicles of route length depot -> customers -> depot.
    // Customer ids in s.data are 0-based, so +1 maps them past the depot
    // at index 0. Empty routes contribute nothing. obj_idx is unused
    // (single objective).
    __device__ float compute_obj(int obj_idx, const Sol& s) const {
        float total = 0;
        for (int v = 0; v < num_vehicles; v++) {
            int route_len = s.dim2_sizes[v];
            if (route_len == 0) continue;

            // Depot to first customer (+1 offset: node 0 is the depot).
            int first_node = s.data[v][0] + 1;
            total += d_dist[0 * (n+1) + first_node];

            // Interior legs of the route.
            int prev = first_node;
            for (int i = 1; i < route_len; i++) {
                int node = s.data[v][i] + 1;
                total += d_dist[prev * (n+1) + node];
                prev = node;
            }

            // Last customer back to the depot.
            total += d_dist[prev * (n+1) + 0];
        }
        return total;
    }

    // Capacity-violation penalty: 100 per unit of load above capacity,
    // summed over vehicles. Demands are indexed by the 0-based customer id.
    __device__ float compute_penalty(const Sol& s) const {
        float penalty = 0;
        for (int v = 0; v < num_vehicles; v++) {
            float load = 0;
            for (int i = 0; i < s.dim2_sizes[v]; i++) {
                load += d_demand[s.data[v][i]];
            }
            if (load > capacity) {
                penalty += (load - capacity) * 100.0f;
            }
        }
        return penalty;
    }

    // num_vehicles permutation rows; Partition mode lets the framework
    // split the n customers across rows.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = num_vehicles;
        cfg.dim2_default = 0; // allocated by the framework in Partition mode
        fill_obj_config(cfg);
        cfg.cross_row_prob = 0.3f;
        cfg.row_mode = RowMode::Partition;
        cfg.total_elements = n; // n customers to distribute across vehicles
        return cfg;
    }

    // Optional override: size of the hot read-only data (distances plus
    // demands) so the solver can make L2-cache-aware sizing decisions.
    size_t working_set_bytes() const {
        return (size_t)(n + 1) * (n + 1) * sizeof(float) + (size_t)n * sizeof(float);
    }

    // Uploads the caller's distance matrix and demand array to the current
    // device. The caller keeps ownership of the host arrays; call destroy()
    // to free the device copies.
    static LargeVRPProblem create(const float* h_dist_matrix, const float* h_demand_array,
                                  int num_customers, float vehicle_capacity,
                                  int num_veh, int max_veh) {
        LargeVRPProblem prob;
        prob.n = num_customers;
        prob.capacity = vehicle_capacity;
        prob.num_vehicles = num_veh;
        prob.max_vehicles = max_veh;
        prob.h_dist = h_dist_matrix;
        prob.h_demand = h_demand_array;

        size_t dist_size = (size_t)(num_customers + 1) * (num_customers + 1) * sizeof(float);
        size_t demand_size = (size_t)num_customers * sizeof(float);

        CUDA_CHECK(cudaMalloc(&prob.d_dist, dist_size));
        CUDA_CHECK(cudaMalloc(&prob.d_demand, demand_size));
        CUDA_CHECK(cudaMemcpy((void*)prob.d_dist, h_dist_matrix, dist_size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy((void*)prob.d_demand, h_demand_array, demand_size, cudaMemcpyHostToDevice));

        return prob;
    }

    // Frees the device buffers. Safe to call more than once (pointers are
    // nulled after the frees).
    void destroy() {
        if (d_dist) cudaFree((void*)d_dist);
        if (d_demand) cudaFree((void*)d_demand);
        d_dist = nullptr;
        d_demand = nullptr;
    }

    // Multi-GPU support: builds an independent copy of this problem whose
    // distance and demand buffers reside on target_gpu (re-uploaded from
    // the host copies). The current device is restored before returning.
    // The returned object is heap-allocated; the caller must destroy() and
    // delete it.
    LargeVRPProblem* clone_to_device(int target_gpu) const {
        int orig_device;
        CUDA_CHECK(cudaGetDevice(&orig_device));
        CUDA_CHECK(cudaSetDevice(target_gpu));

        // Allocate and copy instance data on the target GPU.
        float* dd;
        float* ddem;
        size_t dist_size = (size_t)(n + 1) * (n + 1) * sizeof(float);
        size_t demand_size = (size_t)n * sizeof(float);

        CUDA_CHECK(cudaMalloc(&dd, dist_size));
        CUDA_CHECK(cudaMalloc(&ddem, demand_size));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, dist_size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(ddem, h_demand, demand_size, cudaMemcpyHostToDevice));

        // Restore the original device.
        CUDA_CHECK(cudaSetDevice(orig_device));

        // Build the new host-side problem instance.
        LargeVRPProblem* new_prob = new LargeVRPProblem();
        new_prob->n = n;
        new_prob->capacity = capacity;
        new_prob->num_vehicles = num_vehicles;
        new_prob->max_vehicles = max_vehicles;
        new_prob->h_dist = h_dist;
        new_prob->h_demand = h_demand;
        new_prob->d_dist = dd;
        new_prob->d_demand = ddem;

        return new_prob;
    }
};
|
||||
Loading…
Add table
Add a link
Reference in a new issue