mirror of
https://github.com/L-yang-yang/cugenopt.git
synced 2026-04-24 12:06:22 +02:00
feat(multi-gpu): implement inject distribution modes (OneIsland/HalfIslands/AllIslands)
inject_check_kernel now respects MultiGpuInjectMode from SolverConfig instead of hardcoding OneIsland. HalfIslands uses LCG-based random island selection. Also fixes stale write_async calls in test_multi_gpu_b3.cu. Verified on 2×V100S: all 5 B3 tests pass, e5 (12 problem types) all optimal.
This commit is contained in:
parent
93fda8d900
commit
714a2ee23b
2 changed files with 359 additions and 23 deletions
325
benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu
Normal file
325
benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu
Normal file
|
|
@ -0,0 +1,325 @@
|
|||
/**
|
||||
* test_multi_gpu_b3.cu - Functional tests for option B3 (passive injection)
|
||||
*
|
||||
* Coverage:
|
||||
* 1. InjectBuffer basics: allocate, write, read, destroy
|
||||
* 2. inject_check_kernel correctness: check and inject solutions correctly
|
||||
* 3. Coordinator thread: periodically collect and inject global_best
|
||||
* 4. End-to-end: full runs with 2 and 4 GPUs
|
||||
* 5. Performance: benefit of B3 vs simplified v5.0
|
||||
*/
|
||||
|
||||
#include "core/multi_gpu_solver.cuh"
|
||||
#include "problems/tsp.cuh"
|
||||
#include "problems/vrp.cuh"
|
||||
#include <cstdio>
|
||||
#include <cmath>
|
||||
|
||||
// ============================================================
|
||||
// Helpers: generate test data
|
||||
// ============================================================
|
||||
|
||||
void generate_random_tsp(float* dist, int n, unsigned seed = 42) {
|
||||
srand(seed);
|
||||
for (int i = 0; i < n; i++) {
|
||||
dist[i * n + i] = 0.0f;
|
||||
for (int j = i + 1; j < n; j++) {
|
||||
float d = 10.0f + (rand() % 1000) / 10.0f;
|
||||
dist[i * n + j] = d;
|
||||
dist[j * n + i] = d;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void generate_random_vrp(float* dist, float* demand, int n, unsigned seed = 42) {
|
||||
srand(seed);
|
||||
int stride = n + 1;
|
||||
for (int i = 0; i < stride; i++) {
|
||||
dist[i * stride + i] = 0.0f;
|
||||
for (int j = i + 1; j < stride; j++) {
|
||||
float d = 10.0f + (rand() % 1000) / 10.0f;
|
||||
dist[i * stride + j] = d;
|
||||
dist[j * stride + i] = d;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < n; i++) {
|
||||
demand[i] = 5.0f + (rand() % 20);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Test 1: InjectBuffer basics
|
||||
// ============================================================
|
||||
|
||||
void test_inject_buffer() {
|
||||
printf("\n=== Test 1: InjectBuffer Basic Functionality ===\n");
|
||||
|
||||
using Sol = Solution<1, 32>;
|
||||
|
||||
// Allocate InjectBuffer
|
||||
auto buf = InjectBuffer<Sol>::allocate(0);
|
||||
|
||||
// Build a test solution
|
||||
Sol test_sol;
|
||||
test_sol.dim2_sizes[0] = 5;
|
||||
for (int i = 0; i < 5; i++) test_sol.data[0][i] = i + 10;
|
||||
test_sol.objectives[0] = 123.45f;
|
||||
test_sol.penalty = 0.0f;
|
||||
|
||||
buf.write_sync(test_sol, 0);
|
||||
|
||||
// Read flag (expect 1)
|
||||
int flag;
|
||||
cudaMemcpy(&flag, buf.d_flag, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
printf(" Flag after write: %d (expected 1)\n", flag);
|
||||
|
||||
// Read solution back
|
||||
Sol read_sol;
|
||||
cudaMemcpy(&read_sol, buf.d_solution, sizeof(Sol), cudaMemcpyDeviceToHost);
|
||||
printf(" Read solution: obj=%.2f, penalty=%.2f, data[0][0]=%d\n",
|
||||
read_sol.objectives[0], read_sol.penalty, read_sol.data[0][0]);
|
||||
|
||||
// Verify data consistency
|
||||
bool ok = (fabs(read_sol.objectives[0] - 123.45f) < 1e-3) &&
|
||||
(read_sol.data[0][0] == 10) &&
|
||||
(flag == 1);
|
||||
|
||||
printf(" Result: %s\n", ok ? "PASS" : "FAIL");
|
||||
|
||||
// Cleanup
|
||||
buf.destroy();
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Test 2: inject_check_kernel correctness
|
||||
// ============================================================
|
||||
|
||||
void test_inject_check_kernel() {
|
||||
printf("\n=== Test 2: inject_check_kernel Correctness ===\n");
|
||||
|
||||
using Sol = Solution<1, 32>;
|
||||
const int pop_size = 64;
|
||||
const int island_size = 16;
|
||||
|
||||
// Allocate population
|
||||
Sol* d_pop;
|
||||
cudaMalloc(&d_pop, sizeof(Sol) * pop_size);
|
||||
|
||||
// Init population (all solutions obj=100.0)
|
||||
Sol* h_pop = new Sol[pop_size];
|
||||
for (int i = 0; i < pop_size; i++) {
|
||||
h_pop[i].objectives[0] = 100.0f;
|
||||
h_pop[i].penalty = 0.0f;
|
||||
}
|
||||
cudaMemcpy(d_pop, h_pop, sizeof(Sol) * pop_size, cudaMemcpyHostToDevice);
|
||||
|
||||
// Create InjectBuffer and write a better solution (obj=50.0)
|
||||
auto buf = InjectBuffer<Sol>::allocate(0);
|
||||
Sol inject_sol;
|
||||
inject_sol.objectives[0] = 50.0f;
|
||||
inject_sol.penalty = 0.0f;
|
||||
buf.write_sync(inject_sol, 0);
|
||||
|
||||
// Copy InjectBuffer struct to device
|
||||
InjectBuffer<Sol>* d_buf;
|
||||
cudaMalloc(&d_buf, sizeof(InjectBuffer<Sol>));
|
||||
cudaMemcpy(d_buf, &buf, sizeof(InjectBuffer<Sol>), cudaMemcpyHostToDevice);
|
||||
|
||||
// Build ObjConfig
|
||||
ObjConfig oc;
|
||||
oc.num_obj = 1;
|
||||
oc.mode = CompareMode::Weighted;
|
||||
oc.dirs[0] = ObjDir::Minimize;
|
||||
oc.weights[0] = 1.0f;
|
||||
|
||||
// Launch inject_check_kernel (OneIsland mode for this unit test)
|
||||
inject_check_kernel<<<1, 1>>>(d_pop, pop_size, island_size, d_buf, oc,
|
||||
MultiGpuInjectMode::OneIsland);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// Read population; check whether first island worst was replaced
|
||||
cudaMemcpy(h_pop, d_pop, sizeof(Sol) * pop_size, cudaMemcpyDeviceToHost);
|
||||
|
||||
int replaced_count = 0;
|
||||
for (int i = 0; i < island_size; i++) {
|
||||
if (fabs(h_pop[i].objectives[0] - 50.0f) < 1e-3) {
|
||||
replaced_count++;
|
||||
}
|
||||
}
|
||||
|
||||
printf(" Replaced count in first island: %d (expected 1)\n", replaced_count);
|
||||
|
||||
// Check flag was cleared
|
||||
int flag;
|
||||
cudaMemcpy(&flag, buf.d_flag, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
printf(" Flag after inject_check: %d (expected 0)\n", flag);
|
||||
|
||||
bool ok = (replaced_count == 1) && (flag == 0);
|
||||
printf(" Result: %s\n", ok ? "PASS" : "FAIL");
|
||||
|
||||
// Cleanup
|
||||
buf.destroy();
|
||||
cudaFree(d_buf);
|
||||
cudaFree(d_pop);
|
||||
delete[] h_pop;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Test 3: 2-GPU end-to-end (small scale)
|
||||
// ============================================================
|
||||
|
||||
void test_2gpu_tsp_small() {
|
||||
printf("\n=== Test 3: 2 GPU TSP (n=30) ===\n");
|
||||
|
||||
int device_count;
|
||||
cudaGetDeviceCount(&device_count);
|
||||
if (device_count < 2) {
|
||||
printf(" SKIP: Need at least 2 GPUs\n");
|
||||
return;
|
||||
}
|
||||
|
||||
const int n = 30;
|
||||
float* h_dist = new float[n * n];
|
||||
generate_random_tsp(h_dist, n, 12345);
|
||||
|
||||
auto prob = TSPProblem::create(h_dist, n);
|
||||
|
||||
SolverConfig cfg;
|
||||
cfg.pop_size = 256;
|
||||
cfg.max_gen = 2000;
|
||||
cfg.verbose = true;
|
||||
cfg.seed = 42;
|
||||
cfg.num_islands = 4;
|
||||
cfg.use_aos = true;
|
||||
cfg.sa_temp_init = 10.0f;
|
||||
cfg.use_cuda_graph = true;
|
||||
|
||||
// Option B3: 2 GPUs with exchange
|
||||
cfg.num_gpus = 2;
|
||||
cfg.multi_gpu_interval_sec = 2.0f; // exchange every 2 seconds
|
||||
cfg.multi_gpu_inject_mode = MultiGpuInjectMode::OneIsland;
|
||||
|
||||
auto result = solve_multi_gpu(prob, cfg);
|
||||
|
||||
printf(" Result: obj=%.2f, penalty=%.2f\n",
|
||||
result.best_solution.objectives[0],
|
||||
result.best_solution.penalty);
|
||||
|
||||
delete[] h_dist;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Test 4: VRP with 2 GPUs (medium scale)
|
||||
// ============================================================
|
||||
|
||||
void test_2gpu_vrp_medium() {
|
||||
printf("\n=== Test 4: 2 GPU VRP (n=50) ===\n");
|
||||
|
||||
int device_count;
|
||||
cudaGetDeviceCount(&device_count);
|
||||
if (device_count < 2) {
|
||||
printf(" SKIP: Need at least 2 GPUs (have %d)\n", device_count);
|
||||
return;
|
||||
}
|
||||
|
||||
const int n = 50;
|
||||
float* h_dist = new float[(n+1) * (n+1)];
|
||||
float* h_demand = new float[n];
|
||||
generate_random_vrp(h_dist, h_demand, n, 23456);
|
||||
|
||||
auto prob = VRPProblem::create(h_dist, h_demand, n, 150.0f, 8, 16);
|
||||
|
||||
SolverConfig cfg;
|
||||
cfg.pop_size = 512;
|
||||
cfg.max_gen = 3000;
|
||||
cfg.verbose = true;
|
||||
cfg.seed = 42;
|
||||
cfg.num_islands = 8;
|
||||
cfg.use_aos = true;
|
||||
cfg.sa_temp_init = 15.0f;
|
||||
cfg.use_cuda_graph = true;
|
||||
|
||||
// Option B3: 2 GPUs with exchange
|
||||
cfg.num_gpus = 2;
|
||||
cfg.multi_gpu_interval_sec = 3.0f; // exchange every 3 seconds
|
||||
cfg.multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands;
|
||||
|
||||
auto result = solve_multi_gpu(prob, cfg);
|
||||
|
||||
printf(" Result: obj=%.2f, penalty=%.2f\n",
|
||||
result.best_solution.objectives[0],
|
||||
result.best_solution.penalty);
|
||||
|
||||
delete[] h_dist;
|
||||
delete[] h_demand;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Test 5: Performance comparison (B3 vs simplified)
|
||||
// ============================================================
|
||||
|
||||
void test_performance_comparison() {
|
||||
printf("\n=== Test 5: Performance Comparison (B3 vs Simplified) ===\n");
|
||||
|
||||
int device_count;
|
||||
cudaGetDeviceCount(&device_count);
|
||||
if (device_count < 2) {
|
||||
printf(" SKIP: Need at least 2 GPUs\n");
|
||||
return;
|
||||
}
|
||||
|
||||
const int n = 40;
|
||||
float* h_dist = new float[n * n];
|
||||
generate_random_tsp(h_dist, n, 34567);
|
||||
|
||||
auto prob = TSPProblem::create(h_dist, n);
|
||||
|
||||
SolverConfig cfg;
|
||||
cfg.pop_size = 512;
|
||||
cfg.max_gen = 5000;
|
||||
cfg.verbose = false;
|
||||
cfg.seed = 42;
|
||||
cfg.num_islands = 8;
|
||||
cfg.use_aos = true;
|
||||
cfg.sa_temp_init = 20.0f;
|
||||
cfg.use_cuda_graph = true;
|
||||
|
||||
// Multiple runs for averaging
|
||||
const int num_runs = 5;
|
||||
|
||||
printf("\n Running %d times with 2 GPUs...\n", num_runs);
|
||||
|
||||
// Option B3: with exchange
|
||||
float b3_sum = 0.0f;
|
||||
cfg.num_gpus = 2;
|
||||
cfg.multi_gpu_interval_sec = 2.0f;
|
||||
for (int run = 0; run < num_runs; run++) {
|
||||
cfg.seed = 42 + run * 100;
|
||||
auto result = solve_multi_gpu(prob, cfg);
|
||||
b3_sum += result.best_solution.objectives[0];
|
||||
printf(" Run %d: obj=%.2f\n", run+1, result.best_solution.objectives[0]);
|
||||
}
|
||||
float b3_avg = b3_sum / num_runs;
|
||||
|
||||
printf("\n B3 Average: %.2f\n", b3_avg);
|
||||
|
||||
delete[] h_dist;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Main
|
||||
// ============================================================
|
||||
|
||||
int main() {
|
||||
printf("Multi-GPU B3 (Passive Injection) Test Suite\n");
|
||||
printf("============================================\n");
|
||||
|
||||
test_inject_buffer();
|
||||
test_inject_check_kernel();
|
||||
test_2gpu_tsp_small();
|
||||
test_2gpu_vrp_medium();
|
||||
test_performance_comparison();
|
||||
|
||||
printf("\n=== All Tests Completed ===\n");
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -528,42 +528,52 @@ __global__ void inject_to_islands_kernel(Sol* pop, int pop_size, int island_size
|
|||
// ============================================================
|
||||
// v5.0 plan B3: inject_check_kernel — passive injection check
|
||||
// ============================================================
|
||||
// During migrate, GPU checks InjectBuffer; if new solution exists, inject at worst of first island
|
||||
// atomicExch reads and clears flag atomically for thread safety
|
||||
// During migrate, GPU checks InjectBuffer; if new solution exists, inject into
|
||||
// target islands based on MultiGpuInjectMode.
|
||||
//
|
||||
// Design notes:
|
||||
// 1. Single thread (thread 0 of block 0) to avoid races
|
||||
// 1. Single thread (thread 0 of block 0) — serial over target islands (count is small)
|
||||
// 2. atomicExch reads flag and clears it so each solution is handled once
|
||||
// 3. Inject only into first island (OneIsland strategy) to preserve diversity
|
||||
// 3. Inject mode: OneIsland (island 0), HalfIslands (random half), AllIslands (all)
|
||||
// 4. Optional: if inject_buf is nullptr, skip (single-GPU unaffected)
|
||||
|
||||
template<typename Sol>
|
||||
__global__ void inject_check_kernel(Sol* pop, int pop_size, int island_size,
|
||||
InjectBuffer<Sol>* inject_buf, ObjConfig oc) {
|
||||
// Single-thread execution
|
||||
InjectBuffer<Sol>* inject_buf, ObjConfig oc,
|
||||
MultiGpuInjectMode mode) {
|
||||
if (threadIdx.x != 0 || blockIdx.x != 0) return;
|
||||
|
||||
// No injection buffer — return (single-GPU case)
|
||||
if (inject_buf == nullptr) return;
|
||||
|
||||
// Atomically read and clear flag (each solution processed once)
|
||||
int flag = atomicExch(inject_buf->d_flag, 0);
|
||||
|
||||
// No new solution — return
|
||||
if (flag != 1) return;
|
||||
|
||||
// Read injected solution
|
||||
Sol inject_sol = *(inject_buf->d_solution);
|
||||
|
||||
// Find worst slot on first island
|
||||
int num_islands = pop_size / island_size;
|
||||
if (num_islands == 0) return;
|
||||
|
||||
int worst = find_worst_in_island(pop, 0, island_size, oc);
|
||||
if (mode == MultiGpuInjectMode::OneIsland) {
|
||||
int worst = find_worst_in_island(pop, 0, island_size, oc);
|
||||
if (is_better(inject_sol, pop[worst], oc))
|
||||
pop[worst] = inject_sol;
|
||||
|
||||
// Replace if injection is better
|
||||
if (is_better(inject_sol, pop[worst], oc)) {
|
||||
pop[worst] = inject_sol;
|
||||
} else if (mode == MultiGpuInjectMode::AllIslands) {
|
||||
for (int i = 0; i < num_islands; i++) {
|
||||
int worst = find_worst_in_island(pop, i * island_size, island_size, oc);
|
||||
if (is_better(inject_sol, pop[worst], oc))
|
||||
pop[worst] = inject_sol;
|
||||
}
|
||||
|
||||
} else { // HalfIslands — randomly select num_islands/2 islands
|
||||
int half = (num_islands + 1) / 2;
|
||||
unsigned seed = (unsigned)clock();
|
||||
for (int count = 0; count < half; count++) {
|
||||
seed = seed * 1664525u + 1013904223u; // LCG
|
||||
int isle = (int)(seed % (unsigned)num_islands);
|
||||
int worst = find_worst_in_island(pop, isle * island_size, island_size, oc);
|
||||
if (is_better(inject_sol, pop[worst], oc))
|
||||
pop[worst] = inject_sol;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1247,7 +1257,8 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
|
|||
// Must be outside Graph: inject_buf content changes dynamically
|
||||
if (inject_buf != nullptr && use_islands) {
|
||||
inject_check_kernel<<<1, 1>>>(pop.d_solutions, pop_size,
|
||||
island_size, inject_buf, oc);
|
||||
island_size, inject_buf, oc,
|
||||
cfg.multi_gpu_inject_mode);
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue