cuGenOpt/benchmark/experiments/e6_gpu_hardware/gpu.cu
2026-03-20 00:33:45 +08:00

716 lines
25 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* E6: GPU 硬件对比
*
* 目的:验证 Memory-Bound 特性,量化不同 GPU 的加速效果
*
* 实验设计:
* Part A — 固定代数 (gen=2000):测量纯吞吐量差异
* TSP eil51/kroA100/ch150, CVRP10, Schedule3x4
* Part B — 固定时间 (30s):测量相同时间下的解质量差异
* QAP tai15a, JSP ft10, Knapsack100, VRPTW R101/C101/RC101
*
* Part B 的实例覆盖:
* - Shared memory 内:QAP (2KB), JSP (800B), Knapsack (800B)
* - Shared memory 溢出:VRPTW (40KB+, 超 T4 48KB 限制)
* → 验证 V100 (96KB smem) 是否能让 VRPTW 回到 shared memory
*
* 用法:./gpu [data_dir]
* 在不同 GPU 上分别运行,结果文件命名包含 GPU 型号
*/
#include "bench_common.cuh"
#include <cstdlib>
#include <cstdio>
#include <vector>
#include <fstream>
#include <sstream>
#include <string>
#include <cmath>
// ============================================================
// 文件解析工具(与 E7 共用)
// ============================================================
// Host-side QAP instance as read from a QAPLIB-style text file.
struct QAPData {
    int n;                    // matrix dimension; both matrices hold n*n values
    std::vector<float> dist;  // first n*n values of the file, in file order
    std::vector<float> flow;  // following n*n values of the file, in file order
};

// Reads a QAP instance from `path`: the size n followed by two n*n matrices.
// The first matrix is stored in `dist`, the second in `flow`.
// Prints a message to stderr and exits with status 1 if the file cannot be
// opened, the size is missing or non-positive, or the matrix data is
// truncated or malformed.
static QAPData parse_qaplib(const char* path) {
    QAPData d;
    std::ifstream f(path);
    if (!f.is_open()) { fprintf(stderr, "Cannot open %s\n", path); exit(1); }
    if (!(f >> d.n) || d.n <= 0) {
        fprintf(stderr, "Invalid QAP size in %s\n", path);
        exit(1);
    }
    // Widen before multiplying so a large n cannot overflow int.
    const size_t nn = (size_t)d.n * (size_t)d.n;
    d.dist.resize(nn);
    d.flow.resize(nn);
    for (size_t i = 0; i < nn; i++) f >> d.dist[i];
    for (size_t i = 0; i < nn; i++) f >> d.flow[i];
    // A failed extraction leaves the stream in a fail state; reject the file
    // instead of benchmarking on partially zero-filled matrices.
    if (!f) {
        fprintf(stderr, "Truncated or malformed QAP data in %s\n", path);
        exit(1);
    }
    return d;
}
// Host-side job-shop instance: one (machine, duration) pair per operation.
struct JSPData {
    int num_jobs, num_machines;    // header values; each job has num_machines operations
    std::vector<int> machines;     // machine id of operation o of job j at [j * num_machines + o]
    std::vector<float> durations;  // duration of that operation, same indexing
};

// Reads a job-shop instance from `path`: a "num_jobs num_machines" header
// followed by num_jobs * num_machines (machine, duration) pairs, job by job.
// Prints a message to stderr and exits with status 1 if the file cannot be
// opened, the header is missing or non-positive, or the body is truncated
// or malformed.
static JSPData parse_jsp(const char* path) {
    JSPData d;
    std::ifstream f(path);
    if (!f.is_open()) { fprintf(stderr, "Cannot open %s\n", path); exit(1); }
    if (!(f >> d.num_jobs >> d.num_machines) || d.num_jobs <= 0 || d.num_machines <= 0) {
        fprintf(stderr, "Invalid JSP header in %s\n", path);
        exit(1);
    }
    // Widen before multiplying so large headers cannot overflow int.
    const size_t total = (size_t)d.num_jobs * (size_t)d.num_machines;
    d.machines.resize(total);
    d.durations.resize(total);
    // The pairs appear in the file in the same order as the flat index
    // j * num_machines + o, so a single linear pass fills both arrays.
    for (size_t k = 0; k < total; k++) {
        int m = 0;
        float dur = 0.0f;
        f >> m >> dur;
        d.machines[k] = m;
        d.durations[k] = dur;
    }
    if (!f) {
        fprintf(stderr, "Truncated or malformed JSP data in %s\n", path);
        exit(1);
    }
    return d;
}
// Host-side 0/1 knapsack instance.
struct KnapsackData {
    int n;                       // number of items
    float capacity;              // knapsack capacity (read from the file as an integer)
    std::vector<float> values;   // value of item i
    std::vector<float> weights;  // weight of item i
};

// Reads a knapsack instance from `path`: an "n capacity" header followed by
// n "value weight" lines. All numbers are read as integers and stored as float.
// Prints a message to stderr and exits with status 1 if the file cannot be
// opened, the header is missing or invalid (n <= 0 or capacity < 0), or the
// item list is truncated or malformed.
static KnapsackData parse_knapsack(const char* path) {
    KnapsackData d;
    std::ifstream f(path);
    if (!f.is_open()) { fprintf(stderr, "Cannot open %s\n", path); exit(1); }
    int cap = 0;
    if (!(f >> d.n >> cap) || d.n <= 0 || cap < 0) {
        fprintf(stderr, "Invalid knapsack header in %s\n", path);
        exit(1);
    }
    d.capacity = (float)cap;
    d.values.resize(d.n);
    d.weights.resize(d.n);
    for (int i = 0; i < d.n; i++) {
        int v = 0, w = 0;
        f >> v >> w;
        d.values[i] = (float)v;
        d.weights[i] = (float)w;
    }
    if (!f) {
        fprintf(stderr, "Truncated or malformed knapsack data in %s\n", path);
        exit(1);
    }
    return d;
}
// Computes the optimal 0/1 knapsack value with the classic one-dimensional
// dynamic program over integer capacities. Capacity, weights and values are
// truncated to int before use.
// Returns 0 for a negative capacity. Items with a negative weight are skipped,
// because they would index past the end of the DP table.
static int knapsack_dp_optimal(const KnapsackData& d) {
    const int cap = (int)d.capacity;
    if (cap < 0) return 0;
    std::vector<int> dp((size_t)cap + 1, 0);
    // Never read past the arrays even if d.n disagrees with their sizes.
    size_t items = d.n > 0 ? (size_t)d.n : 0;
    if (items > d.weights.size()) items = d.weights.size();
    if (items > d.values.size()) items = d.values.size();
    for (size_t i = 0; i < items; i++) {
        const int w = (int)d.weights[i];
        const int v = (int)d.values[i];
        if (w < 0) continue;
        // Iterate capacities downwards so each item is used at most once.
        for (int c = cap; c >= w; c--) {
            const int candidate = dp[c - w] + v;
            if (candidate > dp[c]) dp[c] = candidate;
        }
    }
    return dp[cap];
}
// One row of the customer table of a Solomon-format VRPTW file.
struct SolomonNode {
    int id;                      // first column of the row
    float x, y;                  // coordinates
    float demand;                // demand
    float ready, due, service;   // time-window start, time-window end, service time
};

// Host-side VRPTW instance built from a Solomon-format file.
struct SolomonData {
    int num_vehicles;                // value under the NUMBER header
    float capacity;                  // value under the CAPACITY header
    std::vector<SolomonNode> nodes;  // all table rows in file order
    int num_customers;               // nodes.size() - 1 (first row is not counted)
    std::vector<float> dist;         // Euclidean distances, nodes.size()^2, index i * size + j
};

// Reads a Solomon-format VRPTW instance from `path` and builds the full
// Euclidean distance matrix between all nodes.
// Prints a message to stderr and exits with status 1 if the file cannot be
// opened, the vehicle header values cannot be read, or no node rows are found.
static SolomonData parse_solomon(const char* path) {
    SolomonData d;
    std::ifstream f(path);
    if (!f.is_open()) { fprintf(stderr, "Cannot open %s\n", path); exit(1); }
    std::string line;
    std::getline(f, line);  // first line of the file is discarded
    // Skip ahead to the "NUMBER CAPACITY" header line.
    while (std::getline(f, line)) {
        if (line.find("NUMBER") != std::string::npos && line.find("CAPACITY") != std::string::npos)
            break;
    }
    // If the header was never found the stream is already at EOF and this fails.
    if (!(f >> d.num_vehicles >> d.capacity)) {
        fprintf(stderr, "Missing vehicle NUMBER/CAPACITY values in %s\n", path);
        exit(1);
    }
    // Skip ahead to the customer table header.
    while (std::getline(f, line)) {
        if (line.find("CUST") != std::string::npos) break;
    }
    std::getline(f, line);  // discard the line that follows the table header
    SolomonNode node;
    while (f >> node.id >> node.x >> node.y >> node.demand
             >> node.ready >> node.due >> node.service) {
        d.nodes.push_back(node);
    }
    // Without at least one row num_customers would become -1 and later
    // allocations would be sized from a negative count.
    if (d.nodes.empty()) {
        fprintf(stderr, "No customer rows found in %s\n", path);
        exit(1);
    }
    d.num_customers = (int)d.nodes.size() - 1;
    const size_t nn = d.nodes.size();
    d.dist.resize(nn * nn);
    for (size_t i = 0; i < nn; i++) {
        for (size_t j = 0; j < nn; j++) {
            const float dx = d.nodes[i].x - d.nodes[j].x;
            const float dy = d.nodes[i].y - d.nodes[j].y;
            d.dist[i * nn + j] = sqrtf(dx * dx + dy * dy);
        }
    }
    return d;
}
// ============================================================
// QAP Problem (D2=16, N<=16)
// ============================================================
// Quadratic assignment problem for the solver: a single permutation row
// (ProblemBase<..., 1, 16>), so instances are limited to n <= 16.
struct QAPMedium : ProblemBase<QAPMedium, 1, 16> {
    const float* d_flow;  // n*n flow matrix; set by create(), redirected by load_shared()
    const float* d_dist;  // n*n distance matrix; set by create(), redirected by load_shared()
    int n;                // matrix dimension

    // Sums flow[i][j] * dist[p[i]][p[j]] over all pairs of the first
    // dim2_sizes[0] positions of row 0, where p is the row's content.
    __device__ float calc_cost(const Sol& s) const {
        float cost = 0.0f;
        int sz = s.dim2_sizes[0];
        for (int i = 0; i < sz; i++)
            for (int j = 0; j < sz; j++)
                cost += d_flow[i * n + j] * d_dist[s.data[0][i] * n + s.data[0][j]];
        return cost;
    }

    // Single minimization objective.
    static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Minimize, 1.0f, 0.0f} };
    __device__ float compute_obj(int, const Sol& s) const { return calc_cost(s); }
    // No constraints: the penalty is always zero.
    __device__ float compute_penalty(const Sol&) const { return 0.0f; }

    // Permutation encoding with one row of n elements.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // Bytes needed by load_shared(): two n*n float matrices.
    size_t shared_mem_bytes() const { return 2 * (size_t)n * n * sizeof(float); }

    // Copies both matrices into `smem` (flow first, then dist), each thread
    // handling indices tid, tid + bsz, ..., and then points d_flow/d_dist at
    // the copies. After this call the members no longer hold the cudaMalloc
    // pointers, so destroy() must only be called on the host-side original.
    // NOTE(review): no barrier is issued here; presumably the caller
    // synchronizes the block before the first read — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sf = reinterpret_cast<float*>(smem);
        float* sd = sf + n * n;
        int total = n * n;
        for (int i = tid; i < total; i += bsz) { sf[i] = d_flow[i]; sd[i] = d_dist[i]; }
        d_flow = sf; d_dist = sd;
    }

    // Allocates device copies of the two host matrices (n*n floats each).
    // Exits with status 1 if n is outside 1..16, because a larger n would not
    // fit in the 16-entry solution row.
    static QAPMedium create(const float* h_flow, const float* h_dist, int n) {
        constexpr int kMaxN = 16;  // mirrors the second ProblemBase template argument
        if (n <= 0 || n > kMaxN) {
            fprintf(stderr, "QAPMedium: n=%d out of supported range 1..%d\n", n, kMaxN);
            exit(1);
        }
        QAPMedium p;
        p.n = n;
        const size_t bytes = sizeof(float) * (size_t)n * (size_t)n;
        float *df, *dd;
        CUDA_CHECK(cudaMalloc(&df, bytes));
        CUDA_CHECK(cudaMalloc(&dd, bytes));
        CUDA_CHECK(cudaMemcpy(df, h_flow, bytes, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, bytes, cudaMemcpyHostToDevice));
        p.d_flow = df; p.d_dist = dd;
        return p;
    }

    // Frees the device matrices and nulls the pointers, so calling it twice is harmless.
    void destroy() {
        if (d_flow) cudaFree(const_cast<float*>(d_flow));
        if (d_dist) cudaFree(const_cast<float*>(d_dist));
        d_flow = nullptr; d_dist = nullptr;
    }
};
// ============================================================
// JSP Perm Problem (D2=128, J*O<=128, J/M<=16)
// ============================================================
// Job-shop scheduling with a repeated-job permutation encoding: row 0 lists
// job ids, and the k-th occurrence of job j schedules its k-th operation.
// Limits: num_jobs * num_ops <= 128 (row length), and jobs and machine ids
// must fit the 16-element per-thread arrays in decode_and_makespan().
struct JSPPermMedium : ProblemBase<JSPPermMedium, 1, 128> {
    const int* d_machine;     // machine id per operation, index j * num_ops + op
    const float* d_duration;  // duration per operation, same indexing
    int num_jobs, num_ops, num_machines;

    // Decodes row 0 into a schedule and returns its makespan.
    // Returns 1e9f if the row is shorter than num_jobs * num_ops or contains
    // a job id outside [0, num_jobs). Occurrences of a job beyond its
    // num_ops operations are ignored.
    __device__ float decode_and_makespan(const Sol& s) const {
        int total = num_jobs * num_ops;
        int size = s.dim2_sizes[0];
        if (size < total) return 1e9f;
        float job_avail[16] = {};   // time at which each job is free again
        float mach_avail[16] = {};  // time at which each machine is free again
        int job_next_op[16] = {};   // next unscheduled operation of each job
        float makespan = 0.0f;
        for (int k = 0; k < total; k++) {
            int j = s.data[0][k];
            if (j < 0 || j >= num_jobs) return 1e9f;
            int op = job_next_op[j];
            if (op >= num_ops) continue;
            int flat = j * num_ops + op;
            int m = d_machine[flat];
            float dur = d_duration[flat];
            // An operation starts once both its job and its machine are free.
            float start = fmaxf(job_avail[j], mach_avail[m]);
            float end = start + dur;
            job_avail[j] = end;
            mach_avail[m] = end;
            job_next_op[j] = op + 1;
            if (end > makespan) makespan = end;
        }
        return makespan;
    }

    // Single minimization objective.
    static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Minimize, 1.0f, 0.0f} };
    __device__ float compute_obj(int, const Sol& s) const { return decode_and_makespan(s); }
    // No constraints: the penalty is always zero.
    __device__ float compute_penalty(const Sol&) const { return 0.0f; }

    // Permutation encoding with one row of num_jobs * num_ops entries and
    // perm_repeat_count set to num_ops.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = num_jobs * num_ops;
        cfg.perm_repeat_count = num_ops;
        fill_obj_config(cfg);
        return cfg;
    }

    // Bytes needed by load_shared(): one int and one float per operation.
    size_t shared_mem_bytes() const {
        int total = num_jobs * num_ops;
        return (size_t)total * (sizeof(int) + sizeof(float));
    }

    // Copies the machine table and then the duration table into `smem`,
    // each thread handling indices tid, tid + bsz, ..., and points the
    // members at the copies.
    // NOTE(review): no barrier is issued here; presumably the caller
    // synchronizes the block before the first read — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int total = num_jobs * num_ops;
        int* sm = reinterpret_cast<int*>(smem);
        for (int i = tid; i < total; i += bsz) sm[i] = d_machine[i];
        d_machine = sm;
        float* sd = reinterpret_cast<float*>(sm + total);
        for (int i = tid; i < total; i += bsz) sd[i] = d_duration[i];
        d_duration = sd;
    }

    // Allocates device copies of the nj * no machine and duration tables.
    // Exits with status 1 if the dimensions or any machine id would overrun
    // the fixed-size arrays used on the device.
    static JSPPermMedium create(const int* h_machine, const float* h_duration,
                                int nj, int no, int nm) {
        constexpr int kMaxJobsOrMachines = 16;  // size of the arrays in decode_and_makespan()
        constexpr int kMaxRowLen = 128;         // mirrors the second ProblemBase template argument
        if (nj <= 0 || no <= 0 || nj > kMaxJobsOrMachines ||
            (long long)nj * (long long)no > kMaxRowLen) {
            fprintf(stderr, "JSPPermMedium: unsupported size jobs=%d ops=%d (jobs<=%d, jobs*ops<=%d)\n",
                    nj, no, kMaxJobsOrMachines, kMaxRowLen);
            exit(1);
        }
        int total = nj * no;
        // Machine ids index mach_avail[16] directly on the device; reject
        // anything that would read or write outside that array.
        for (int k = 0; k < total; k++) {
            if (h_machine[k] < 0 || h_machine[k] >= kMaxJobsOrMachines) {
                fprintf(stderr, "JSPPermMedium: machine id %d at operation %d out of range 0..%d\n",
                        h_machine[k], k, kMaxJobsOrMachines - 1);
                exit(1);
            }
        }
        JSPPermMedium p;
        p.num_jobs = nj; p.num_ops = no; p.num_machines = nm;
        int* dm; float* dd;
        CUDA_CHECK(cudaMalloc(&dm, sizeof(int) * total));
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * total));
        CUDA_CHECK(cudaMemcpy(dm, h_machine, sizeof(int) * total, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dd, h_duration, sizeof(float) * total, cudaMemcpyHostToDevice));
        p.d_machine = dm; p.d_duration = dd;
        return p;
    }

    // Frees the device tables and nulls the pointers, so calling it twice is harmless.
    void destroy() {
        if (d_machine) { cudaFree(const_cast<int*>(d_machine)); d_machine = nullptr; }
        if (d_duration) { cudaFree(const_cast<float*>(d_duration)); d_duration = nullptr; }
    }
};
// ============================================================
// Knapsack Problem (D2=128, N<=128)
// ============================================================
// 0/1 knapsack for the solver: one binary row of up to 128 items
// (ProblemBase<..., 1, 128>).
struct KnapsackMedium : ProblemBase<KnapsackMedium, 1, 128> {
    const float* d_weights;  // weight per item; set by create(), redirected by load_shared()
    const float* d_values;   // value per item; set by create(), redirected by load_shared()
    float capacity;          // weight limit used by compute_penalty()
    int n;                   // number of items

    // Sum of the values of all items whose entry in row 0 is non-zero.
    __device__ float calc_total_value(const Sol& s) const {
        float tv = 0.0f;
        int size = s.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            if (s.data[0][i]) tv += d_values[i];
        return tv;
    }

    // Single maximization objective.
    static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Maximize, 1.0f, 0.0f} };
    __device__ float compute_obj(int, const Sol& s) const { return calc_total_value(s); }

    // Penalty is the amount by which the selected weight exceeds capacity,
    // or zero when the selection fits.
    __device__ float compute_penalty(const Sol& s) const {
        float tw = 0.0f;
        int size = s.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            if (s.data[0][i]) tw += d_weights[i];
        float over = tw - capacity;
        return (over > 0.0f) ? over : 0.0f;
    }

    // Binary encoding with one row of n entries.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Binary;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // Bytes needed by load_shared(): two float arrays of n entries.
    size_t shared_mem_bytes() const { return 2 * (size_t)n * sizeof(float); }

    // Copies weights then values into `smem`, each thread handling indices
    // tid, tid + bsz, ..., and points the members at the copies.
    // NOTE(review): no barrier is issued here; presumably the caller
    // synchronizes the block before the first read — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sw = reinterpret_cast<float*>(smem);
        float* sv = sw + n;
        for (int i = tid; i < n; i += bsz) { sw[i] = d_weights[i]; sv[i] = d_values[i]; }
        d_weights = sw; d_values = sv;
    }

    // Allocates device copies of the host weight (hw) and value (hv) arrays.
    // Exits with status 1 if n is outside 1..128, because a larger n would
    // not fit in the 128-entry solution row.
    static KnapsackMedium create(const float* hw, const float* hv, int n, float cap) {
        constexpr int kMaxN = 128;  // mirrors the second ProblemBase template argument
        if (n <= 0 || n > kMaxN) {
            fprintf(stderr, "KnapsackMedium: n=%d out of supported range 1..%d\n", n, kMaxN);
            exit(1);
        }
        KnapsackMedium p;
        p.n = n; p.capacity = cap;
        float *dw, *dv;
        CUDA_CHECK(cudaMalloc(&dw, sizeof(float) * n));
        CUDA_CHECK(cudaMalloc(&dv, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(dw, hw, sizeof(float) * n, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dv, hv, sizeof(float) * n, cudaMemcpyHostToDevice));
        p.d_weights = dw; p.d_values = dv;
        return p;
    }

    // Frees the device arrays and nulls the pointers, so calling it twice is harmless.
    void destroy() {
        if (d_weights) cudaFree(const_cast<float*>(d_weights));
        if (d_values) cudaFree(const_cast<float*>(d_values));
        d_weights = nullptr; d_values = nullptr;
    }
};
// ============================================================
// VRPTW Problem (D1=25, D2=128, N<=100 customers, <=25 vehicles)
// ============================================================
// Vehicle routing with time windows: one row per vehicle
// (ProblemBase<..., 25, 128>), customers partitioned across the rows.
// Row entries are 0-based customer indices; node index = entry + 1, and
// node 0 is where every route starts and ends (the depot).
struct VRPTWMedium : ProblemBase<VRPTWMedium, 25, 128> {
    const float* d_dist;      // stride*stride distance matrix, indexed by node
    const float* d_demand;    // n demands, indexed by customer (node - 1)
    const float* d_earliest;  // n+1 time-window starts, indexed by node
    const float* d_latest;    // n+1 time-window ends, indexed by node
    const float* d_service;   // n+1 service times, indexed by node
    const float* h_dist;      // host distance matrix borrowed from SolomonData (not owned)
    int n;                    // number of customers
    int stride;               // row length of the distance matrix (n + 1)
    float capacity;           // vehicle capacity
    int num_vehicles;         // number of rows used
    int max_vehicles;         // non-empty routes above this count are penalized

    // Length of one route: node 0 -> customers in order -> node 0.
    // An empty route has length 0.
    __device__ float compute_route_dist(const int* route, int size) const {
        if (size == 0) return 0.0f;
        float dist = 0.0f;
        int prev = 0;
        for (int j = 0; j < size; j++) {
            int node = route[j] + 1;
            dist += d_dist[prev * stride + node];
            prev = node;
        }
        dist += d_dist[prev * stride + 0];
        return dist;
    }

    // Sum of the route lengths of all num_vehicles rows.
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        for (int r = 0; r < num_vehicles; r++)
            total += compute_route_dist(sol.data[r], sol.dim2_sizes[r]);
        return total;
    }

    // Single minimization objective.
    static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Minimize, 1.0f, 0.0f} };
    __device__ float compute_obj(int, const Sol& sol) const { return calc_total_distance(sol); }

    // Constraint penalty, summed over all non-empty routes:
    //   - load above capacity, weighted by 100;
    //   - arrival after a node's latest time, weighted by 50 (arriving early
    //     waits until the earliest time instead);
    //   - return to node 0 after its latest time, weighted by 50;
    // plus 1000 for every non-empty route beyond max_vehicles.
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        int active = 0;
        for (int r = 0; r < num_vehicles; r++) {
            int size = sol.dim2_sizes[r];
            if (size == 0) continue;
            active++;
            float load = 0.0f;
            for (int j = 0; j < size; j++)
                load += d_demand[sol.data[r][j]];
            if (load > capacity)
                penalty += (load - capacity) * 100.0f;
            float time = 0.0f;
            int prev = 0;
            for (int j = 0; j < size; j++) {
                int node = sol.data[r][j] + 1;
                float travel = d_dist[prev * stride + node];
                time += travel;
                if (time < d_earliest[node])
                    time = d_earliest[node];
                if (time > d_latest[node])
                    penalty += (time - d_latest[node]) * 50.0f;
                time += d_service[node];
                prev = node;
            }
            float return_time = time + d_dist[prev * stride + 0];
            if (return_time > d_latest[0])
                penalty += (return_time - d_latest[0]) * 50.0f;
        }
        if (active > max_vehicles)
            penalty += (float)(active - max_vehicles) * 1000.0f;
        return penalty;
    }

    // Permutation encoding in partition mode: num_vehicles rows that start
    // empty and together hold n elements, with a cross-row probability of 0.3.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = num_vehicles;
        cfg.dim2_default = 0;
        fill_obj_config(cfg);
        cfg.cross_row_prob = 0.3f;
        cfg.row_mode = RowMode::Partition;
        cfg.total_elements = n;
        return cfg;
    }

    // Exposes the host distance matrix as the only heuristic matrix.
    // Returns the number of entries written to `out` (0 or 1).
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (max_count < 1 || !h_dist) return 0;
        out[0] = {h_dist, stride};
        return 1;
    }

    // Bytes requested for load_shared(): the distance matrix plus 4 * (n + 1)
    // floats for the auxiliary arrays. load_shared() uses n + 3 * (n + 1) of
    // those, so the request is slightly larger than what is written.
    size_t shared_mem_bytes() const {
        size_t dist_bytes = (size_t)stride * stride * sizeof(float);
        size_t aux_bytes = (size_t)(n + 1) * 4 * sizeof(float);
        return dist_bytes + aux_bytes;
    }

    // Same value as shared_mem_bytes(); delegated so the two cannot drift apart.
    size_t working_set_bytes() const {
        return shared_mem_bytes();
    }

    // Copies the distance matrix, demands, earliest, latest and service
    // arrays into `smem` back to back, each thread handling indices
    // tid, tid + bsz, ..., and points the members at the copies.
    // NOTE(review): no barrier is issued here; presumably the caller
    // synchronizes the block before the first read — confirm.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sd = reinterpret_cast<float*>(smem);
        int dist_size = stride * stride;
        for (int i = tid; i < dist_size; i += bsz) sd[i] = d_dist[i];
        d_dist = sd;
        float* sdem = sd + dist_size;
        for (int i = tid; i < n; i += bsz) sdem[i] = d_demand[i];
        d_demand = sdem;
        float* se = sdem + n;
        int nn = n + 1;
        for (int i = tid; i < nn; i += bsz) se[i] = d_earliest[i];
        d_earliest = se;
        float* sl = se + nn;
        for (int i = tid; i < nn; i += bsz) sl[i] = d_latest[i];
        d_latest = sl;
        float* ss = sl + nn;
        for (int i = tid; i < nn; i += bsz) ss[i] = d_service[i];
        d_service = ss;
    }

    // Builds the device-side problem from a parsed Solomon instance.
    // `sd` must outlive the returned object because h_dist points into sd.dist.
    // Exits with status 1 if the instance is empty or inconsistent, or if it
    // exceeds the row count (25) or row length (128) of the solution type.
    static VRPTWMedium create(const SolomonData& sd) {
        constexpr int kMaxVehicles = 25;  // mirrors the first ProblemBase template argument
        constexpr int kMaxRowLen = 128;   // mirrors the second ProblemBase template argument
        if (sd.num_customers <= 0 || sd.num_customers > kMaxRowLen ||
            sd.num_vehicles <= 0 || sd.num_vehicles > kMaxVehicles) {
            fprintf(stderr, "VRPTWMedium: unsupported size customers=%d vehicles=%d (max %d / %d)\n",
                    sd.num_customers, sd.num_vehicles, kMaxRowLen, kMaxVehicles);
            exit(1);
        }
        const size_t node_count = (size_t)sd.num_customers + 1;
        // Both arrays are read up to node_count entries below.
        if (sd.nodes.size() < node_count || sd.dist.size() < node_count * node_count) {
            fprintf(stderr, "VRPTWMedium: node or distance data smaller than customers+1=%d\n",
                    sd.num_customers + 1);
            exit(1);
        }
        VRPTWMedium p;
        p.n = sd.num_customers;
        p.stride = sd.num_customers + 1;
        p.capacity = sd.capacity;
        p.num_vehicles = sd.num_vehicles;
        p.max_vehicles = sd.num_vehicles;
        p.h_dist = sd.dist.data();
        int nn = p.stride;
        float *dd, *ddem, *de, *dl, *ds;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * nn * nn));
        CUDA_CHECK(cudaMemcpy(dd, sd.dist.data(), sizeof(float) * nn * nn, cudaMemcpyHostToDevice));
        p.d_dist = dd;
        // Demands are stored per customer (node 0 skipped); the time-window
        // arrays are stored per node, including node 0.
        std::vector<float> demand(p.n), earliest(nn), latest(nn), service(nn);
        for (int i = 0; i < p.n; i++)
            demand[i] = sd.nodes[i + 1].demand;
        for (int i = 0; i < nn; i++) {
            earliest[i] = sd.nodes[i].ready;
            latest[i] = sd.nodes[i].due;
            service[i] = sd.nodes[i].service;
        }
        CUDA_CHECK(cudaMalloc(&ddem, sizeof(float) * p.n));
        CUDA_CHECK(cudaMemcpy(ddem, demand.data(), sizeof(float) * p.n, cudaMemcpyHostToDevice));
        p.d_demand = ddem;
        CUDA_CHECK(cudaMalloc(&de, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(de, earliest.data(), sizeof(float) * nn, cudaMemcpyHostToDevice));
        p.d_earliest = de;
        CUDA_CHECK(cudaMalloc(&dl, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(dl, latest.data(), sizeof(float) * nn, cudaMemcpyHostToDevice));
        p.d_latest = dl;
        CUDA_CHECK(cudaMalloc(&ds, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(ds, service.data(), sizeof(float) * nn, cudaMemcpyHostToDevice));
        p.d_service = ds;
        return p;
    }

    // Frees the device arrays and nulls the pointers, so calling it twice is
    // harmless. h_dist is not owned and is only cleared, never freed.
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        if (d_demand) { cudaFree(const_cast<float*>(d_demand)); d_demand = nullptr; }
        if (d_earliest) { cudaFree(const_cast<float*>(d_earliest)); d_earliest = nullptr; }
        if (d_latest) { cudaFree(const_cast<float*>(d_latest)); d_latest = nullptr; }
        if (d_service) { cudaFree(const_cast<float*>(d_service)); d_service = nullptr; }
        h_dist = nullptr;
    }
};
// ============================================================
// Main
// ============================================================
// Entry point of the E6 GPU hardware comparison.
// Usage: ./gpu [data_dir]   (data_dir defaults to "../../data")
// Part A runs a fixed number of generations on built-in instances;
// Part B runs a fixed time budget on instances loaded from data_dir.
// Returns 0 on completion; the parsers exit with status 1 on unreadable input.
int main(int argc, char** argv) {
    bench_init();
    bench_csv_header();
    const char* data_dir = "../../data";
    if (argc > 1) data_dir = argv[1];

    // ========================================================
    // Part A: fixed generation count — measures raw throughput (gens/s)
    // ========================================================
    fprintf(stderr, "\n=== Part A: Fixed generations (gen=2000) ===\n");
    {
        const int GEN = 2000;
        const int REPEATS = 3;

        // TSP instances
        TSPInstance instances[] = {
            {"eil51", eil51_coords, EIL51_N, 426.0f},
            {"kroA100", kroA100_coords, KROA100_N, 21282.0f},
            {"ch150", CH150_coords, CH150_N, 6528.0f},
        };
        for (auto& inst : instances) {
            fprintf(stderr, " [e6-A] TSP %s (n=%d)\n", inst.name, inst.n);
            // std::vector owns the matrix, so it is released on every exit path.
            std::vector<float> dist((size_t)inst.n * (size_t)inst.n);
            compute_euc2d_dist(dist.data(), inst.coords, inst.n);
            SolverConfig c = make_default_config(GEN);
            bench_run_tsp<void>(inst.name, "A_gen2000", inst.n, dist.data(), c, inst.optimal, REPEATS);
        }

        // CVRP10: node 0 plus 10 customers on a small hand-written layout
        {
            fprintf(stderr, " [e6-A] CVRP10\n");
            const int N = 10, NN = N + 1;
            float coords[NN][2] = {
                {50,50},{60,50},{70,50},{80,50},{50,60},{50,70},{50,80},{40,50},{30,50},{50,40},{50,30}
            };
            float demands[N] = {5,4,6,5,4,6,5,4,5,6};
            float dist[NN * NN];
            for (int i = 0; i < NN; i++)
                for (int j = 0; j < NN; j++) {
                    float dx = coords[i][0] - coords[j][0];
                    float dy = coords[i][1] - coords[j][1];
                    // Distances are rounded to the nearest integer.
                    dist[i * NN + j] = roundf(sqrtf(dx * dx + dy * dy));
                }
            auto p = VRPProblem::create(dist, demands, N, 15.0f, 4, 4);
            SolverConfig c = make_default_config(GEN);
            bench_run("CVRP10", "A_gen2000", p, c, 200.0f, REPEATS);
            p.destroy();
        }

        // Schedule3x4
        {
            fprintf(stderr, " [e6-A] Schedule3x4\n");
            float cost[12] = {5,3,8,4, 6,2,7,5, 4,6,3,7};
            auto p = ScheduleProblem::create(cost, 3, 4, 2);
            SolverConfig c = make_default_config(GEN);
            bench_run("Schedule3x4", "A_gen2000", p, c, 0.0f, REPEATS);
            p.destroy();
        }
    }

    // ========================================================
    // Part B: fixed time budget — measures solution quality and gens/s
    // ========================================================
    fprintf(stderr, "\n=== Part B: Fixed time (30s) ===\n");
    {
        const float TIME = 30.0f;

        // QAP tai15a (smem: 2*15*15*4 = 1.8KB, fits entirely in shared memory)
        {
            char path[512];
            snprintf(path, sizeof(path), "%s/qaplib/tai15a.dat", data_dir);
            QAPData d = parse_qaplib(path);
            fprintf(stderr, " [e6-B] QAP tai15a: N=%d, smem=%.1fKB\n",
                    d.n, 2.0f * d.n * d.n * 4 / 1024.0f);
            auto p = QAPMedium::create(d.flow.data(), d.dist.data(), d.n);
            SolverConfig c = make_timed_config(TIME);
            bench_run("QAP_tai15a", "B_t30s", p, c, 388214.0f);
            p.destroy();
        }

        // JSP ft10 (smem: 100*(4+4) = 800B)
        {
            char path[512];
            snprintf(path, sizeof(path), "%s/jsp/ft10.txt", data_dir);
            JSPData d = parse_jsp(path);
            fprintf(stderr, " [e6-B] JSP ft10: %dx%d, smem=%.1fKB\n",
                    d.num_jobs, d.num_machines,
                    (float)(d.num_jobs * d.num_machines) * 8 / 1024.0f);
            // The machine count is passed as both the per-job operation count
            // and the machine count.
            auto p = JSPPermMedium::create(d.machines.data(), d.durations.data(),
                                           d.num_jobs, d.num_machines, d.num_machines);
            SolverConfig c = make_timed_config(TIME);
            bench_run("JSP_ft10", "B_t30s", p, c, 930.0f);
            p.destroy();
        }

        // Knapsack100 (smem: 2*100*4 = 800B); the reference value comes from the host DP
        {
            char path[512];
            snprintf(path, sizeof(path), "%s/knapsack/knapPI_1_100.txt", data_dir);
            KnapsackData d = parse_knapsack(path);
            int opt = knapsack_dp_optimal(d);
            fprintf(stderr, " [e6-B] Knapsack N=%d, smem=%.1fKB, DP opt=%d\n",
                    d.n, 2.0f * d.n * 4 / 1024.0f, opt);
            auto p = KnapsackMedium::create(d.weights.data(), d.values.data(), d.n, d.capacity);
            SolverConfig c = make_timed_config(TIME);
            bench_run("Knapsack100", "B_t30s", p, c, (float)opt);
            p.destroy();
        }

        // VRPTW R101 (smem: 101*101*4 + 101*4*4 = ~42KB -> overflows on T4, may fit on V100)
        {
            char path[512];
            snprintf(path, sizeof(path), "%s/solomon/R101.txt", data_dir);
            SolomonData sd = parse_solomon(path);
            size_t dist_bytes = (size_t)(sd.num_customers+1) * (sd.num_customers+1) * sizeof(float);
            size_t aux_bytes = (size_t)(sd.num_customers+1) * 4 * sizeof(float);
            fprintf(stderr, " [e6-B] VRPTW R101: N=%d, data=%.1fKB (dist=%.1fKB + aux=%.1fKB)\n",
                    sd.num_customers,
                    (dist_bytes + aux_bytes) / 1024.0f,
                    dist_bytes / 1024.0f, aux_bytes / 1024.0f);
            auto p = VRPTWMedium::create(sd);
            SolverConfig c = make_timed_config(TIME);
            bench_run("VRPTW_R101", "B_t30s", p, c, 1637.7f);
            p.destroy();
        }

        // VRPTW C101
        {
            char path[512];
            snprintf(path, sizeof(path), "%s/solomon/C101.txt", data_dir);
            SolomonData sd = parse_solomon(path);
            fprintf(stderr, " [e6-B] VRPTW C101: N=%d\n", sd.num_customers);
            auto p = VRPTWMedium::create(sd);
            SolverConfig c = make_timed_config(TIME);
            bench_run("VRPTW_C101", "B_t30s", p, c, 827.3f);
            p.destroy();
        }

        // VRPTW RC101
        {
            char path[512];
            snprintf(path, sizeof(path), "%s/solomon/RC101.txt", data_dir);
            SolomonData sd = parse_solomon(path);
            fprintf(stderr, " [e6-B] VRPTW RC101: N=%d\n", sd.num_customers);
            auto p = VRPTWMedium::create(sd);
            SolverConfig c = make_timed_config(TIME);
            bench_run("VRPTW_RC101", "B_t30s", p, c, 1619.8f);
            p.destroy();
        }
    }

    fprintf(stderr, "\n[e6] GPU hardware comparison completed.\n");
    return 0;
}