From 714a2ee23b89830b2454f543498255b0ca3e3335 Mon Sep 17 00:00:00 2001 From: L-yang-yang <15251858055@163.com> Date: Mon, 30 Mar 2026 20:42:32 +0800 Subject: [PATCH] feat(multi-gpu): implement inject distribution modes (OneIsland/HalfIslands/AllIslands) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit inject_check_kernel now respects MultiGpuInjectMode from SolverConfig instead of hardcoding OneIsland. HalfIslands uses LCG-based random island selection. Also fixes stale write_async calls in test_multi_gpu_b3.cu. Verified on 2×V100S: all 5 B3 tests pass, e5 (12 problem types) all optimal. --- .../e9_multi_gpu_b3/test_multi_gpu_b3.cu | 325 ++++++++++++++++++ prototype/core/solver.cuh | 57 +-- 2 files changed, 359 insertions(+), 23 deletions(-) create mode 100644 benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu diff --git a/benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu b/benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu new file mode 100644 index 0000000..6f3858e --- /dev/null +++ b/benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu @@ -0,0 +1,325 @@ +/** + * test_multi_gpu_b3.cu - Functional tests for option B3 (passive injection) + * + * Coverage: + * 1. InjectBuffer basics: allocate, write, read, destroy + * 2. inject_check_kernel correctness: check and inject solutions correctly + * 3. Coordinator thread: periodically collect and inject global_best + * 4. End-to-end: full runs with 2 and 4 GPUs + * 5. Performance: benefit of B3 vs simplified v5.0 + */ + +#include "core/multi_gpu_solver.cuh" +#include "problems/tsp.cuh" +#include "problems/vrp.cuh" +#include +#include + +// ============================================================ +// Helpers: generate test data +// ============================================================ + +void generate_random_tsp(float* dist, int n, unsigned seed = 42) { + srand(seed); + for (int i = 0; i < n; i++) { + dist[i * n + i] = 0.0f; + for (int j = i + 1; j < n; j++) { + float d = 10.0f + (rand() % 1000) / 10.0f; + dist[i * n + j] = d; + dist[j * n + i] = d; + } + } +} + +void generate_random_vrp(float* dist, float* demand, int n, unsigned seed = 42) { + srand(seed); + int stride = n + 1; + for (int i = 0; i < stride; i++) { + dist[i * stride + i] = 0.0f; + for (int j = i + 1; j < stride; j++) { + float d = 10.0f + (rand() % 1000) / 10.0f; + dist[i * stride + j] = d; + dist[j * stride + i] = d; + } + } + for (int i = 0; i < n; i++) { + demand[i] = 5.0f + (rand() % 20); + } +} + +// ============================================================ +// Test 1: InjectBuffer basics +// ============================================================ + +void test_inject_buffer() { + printf("\n=== Test 1: InjectBuffer Basic Functionality ===\n"); + + using Sol = Solution<1, 32>; + + // Allocate InjectBuffer + auto buf = InjectBuffer::allocate(0); + + // Build a test solution + Sol test_sol; + test_sol.dim2_sizes[0] = 5; + for (int i = 0; i < 5; i++) test_sol.data[0][i] = i + 10; + test_sol.objectives[0] = 123.45f; + test_sol.penalty = 0.0f; + + buf.write_sync(test_sol, 0); + + // Read flag (expect 1) + int flag; + cudaMemcpy(&flag, buf.d_flag, sizeof(int), cudaMemcpyDeviceToHost); + printf(" Flag after write: %d (expected 1)\n", flag); + + // Read solution back + Sol read_sol; + cudaMemcpy(&read_sol, buf.d_solution, sizeof(Sol), cudaMemcpyDeviceToHost); + printf(" Read solution: obj=%.2f, penalty=%.2f, data[0][0]=%d\n", + read_sol.objectives[0], read_sol.penalty, read_sol.data[0][0]); + + // Verify data consistency + bool ok = (fabs(read_sol.objectives[0] - 123.45f) < 1e-3) && + (read_sol.data[0][0] == 10) && + (flag == 1); + + printf(" Result: %s\n", ok ? "PASS" : "FAIL"); + + // Cleanup + buf.destroy(); +} + +// ============================================================ +// Test 2: inject_check_kernel correctness +// ============================================================ + +void test_inject_check_kernel() { + printf("\n=== Test 2: inject_check_kernel Correctness ===\n"); + + using Sol = Solution<1, 32>; + const int pop_size = 64; + const int island_size = 16; + + // Allocate population + Sol* d_pop; + cudaMalloc(&d_pop, sizeof(Sol) * pop_size); + + // Init population (all solutions obj=100.0) + Sol* h_pop = new Sol[pop_size]; + for (int i = 0; i < pop_size; i++) { + h_pop[i].objectives[0] = 100.0f; + h_pop[i].penalty = 0.0f; + } + cudaMemcpy(d_pop, h_pop, sizeof(Sol) * pop_size, cudaMemcpyHostToDevice); + + // Create InjectBuffer and write a better solution (obj=50.0) + auto buf = InjectBuffer::allocate(0); + Sol inject_sol; + inject_sol.objectives[0] = 50.0f; + inject_sol.penalty = 0.0f; + buf.write_sync(inject_sol, 0); + + // Copy InjectBuffer struct to device + InjectBuffer* d_buf; + cudaMalloc(&d_buf, sizeof(InjectBuffer)); + cudaMemcpy(d_buf, &buf, sizeof(InjectBuffer), cudaMemcpyHostToDevice); + + // Build ObjConfig + ObjConfig oc; + oc.num_obj = 1; + oc.mode = CompareMode::Weighted; + oc.dirs[0] = ObjDir::Minimize; + oc.weights[0] = 1.0f; + + // Launch inject_check_kernel (OneIsland mode for this unit test) + inject_check_kernel<<<1, 1>>>(d_pop, pop_size, island_size, d_buf, oc, + MultiGpuInjectMode::OneIsland); + cudaDeviceSynchronize(); + + // Read population; check whether first island worst was replaced + cudaMemcpy(h_pop, d_pop, sizeof(Sol) * pop_size, cudaMemcpyDeviceToHost); + + int replaced_count = 0; + for (int i = 0; i < island_size; i++) { + if (fabs(h_pop[i].objectives[0] - 50.0f) < 1e-3) { + replaced_count++; + } + } + + printf(" Replaced count in first island: %d (expected 1)\n", replaced_count); + + // Check flag was cleared + int flag; + cudaMemcpy(&flag, buf.d_flag, sizeof(int), cudaMemcpyDeviceToHost); + printf(" Flag after inject_check: %d (expected 0)\n", flag); + + bool ok = (replaced_count == 1) && (flag == 0); + printf(" Result: %s\n", ok ? "PASS" : "FAIL"); + + // Cleanup + buf.destroy(); + cudaFree(d_buf); + cudaFree(d_pop); + delete[] h_pop; +} + +// ============================================================ +// Test 3: 2-GPU end-to-end (small scale) +// ============================================================ + +void test_2gpu_tsp_small() { + printf("\n=== Test 3: 2 GPU TSP (n=30) ===\n"); + + int device_count; + cudaGetDeviceCount(&device_count); + if (device_count < 2) { + printf(" SKIP: Need at least 2 GPUs\n"); + return; + } + + const int n = 30; + float* h_dist = new float[n * n]; + generate_random_tsp(h_dist, n, 12345); + + auto prob = TSPProblem::create(h_dist, n); + + SolverConfig cfg; + cfg.pop_size = 256; + cfg.max_gen = 2000; + cfg.verbose = true; + cfg.seed = 42; + cfg.num_islands = 4; + cfg.use_aos = true; + cfg.sa_temp_init = 10.0f; + cfg.use_cuda_graph = true; + + // Option B3: 2 GPUs with exchange + cfg.num_gpus = 2; + cfg.multi_gpu_interval_sec = 2.0f; // exchange every 2 seconds + cfg.multi_gpu_inject_mode = MultiGpuInjectMode::OneIsland; + + auto result = solve_multi_gpu(prob, cfg); + + printf(" Result: obj=%.2f, penalty=%.2f\n", + result.best_solution.objectives[0], + result.best_solution.penalty); + + delete[] h_dist; +} + +// ============================================================ +// Test 4: VRP with 2 GPUs (medium scale) +// ============================================================ + +void test_2gpu_vrp_medium() { + printf("\n=== Test 4: 2 GPU VRP (n=50) ===\n"); + + int device_count; + cudaGetDeviceCount(&device_count); + if (device_count < 2) { + printf(" SKIP: Need at least 2 GPUs (have %d)\n", device_count); + return; + } + + const int n = 50; + float* h_dist = new float[(n+1) * (n+1)]; + float* h_demand = new float[n]; + generate_random_vrp(h_dist, h_demand, n, 23456); + + auto prob = VRPProblem::create(h_dist, h_demand, n, 150.0f, 8, 16); + + SolverConfig cfg; + cfg.pop_size = 512; + cfg.max_gen = 3000; + cfg.verbose = true; + cfg.seed = 42; + cfg.num_islands = 8; + cfg.use_aos = true; + cfg.sa_temp_init = 15.0f; + cfg.use_cuda_graph = true; + + // Option B3: 2 GPUs with exchange + cfg.num_gpus = 2; + cfg.multi_gpu_interval_sec = 3.0f; // exchange every 3 seconds + cfg.multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands; + + auto result = solve_multi_gpu(prob, cfg); + + printf(" Result: obj=%.2f, penalty=%.2f\n", + result.best_solution.objectives[0], + result.best_solution.penalty); + + delete[] h_dist; + delete[] h_demand; +} + +// ============================================================ +// Test 5: Performance comparison (B3 vs simplified) +// ============================================================ + +void test_performance_comparison() { + printf("\n=== Test 5: Performance Comparison (B3 vs Simplified) ===\n"); + + int device_count; + cudaGetDeviceCount(&device_count); + if (device_count < 2) { + printf(" SKIP: Need at least 2 GPUs\n"); + return; + } + + const int n = 40; + float* h_dist = new float[n * n]; + generate_random_tsp(h_dist, n, 34567); + + auto prob = TSPProblem::create(h_dist, n); + + SolverConfig cfg; + cfg.pop_size = 512; + cfg.max_gen = 5000; + cfg.verbose = false; + cfg.seed = 42; + cfg.num_islands = 8; + cfg.use_aos = true; + cfg.sa_temp_init = 20.0f; + cfg.use_cuda_graph = true; + + // Multiple runs for averaging + const int num_runs = 5; + + printf("\n Running %d times with 2 GPUs...\n", num_runs); + + // Option B3: with exchange + float b3_sum = 0.0f; + cfg.num_gpus = 2; + cfg.multi_gpu_interval_sec = 2.0f; + for (int run = 0; run < num_runs; run++) { + cfg.seed = 42 + run * 100; + auto result = solve_multi_gpu(prob, cfg); + b3_sum += result.best_solution.objectives[0]; + printf(" Run %d: obj=%.2f\n", run+1, result.best_solution.objectives[0]); + } + float b3_avg = b3_sum / num_runs; + + printf("\n B3 Average: %.2f\n", b3_avg); + + delete[] h_dist; +} + +// ============================================================ +// Main +// ============================================================ + +int main() { + printf("Multi-GPU B3 (Passive Injection) Test Suite\n"); + printf("============================================\n"); + + test_inject_buffer(); + test_inject_check_kernel(); + test_2gpu_tsp_small(); + test_2gpu_vrp_medium(); + test_performance_comparison(); + + printf("\n=== All Tests Completed ===\n"); + return 0; +} diff --git a/prototype/core/solver.cuh b/prototype/core/solver.cuh index 161bd4d..a5ff1fa 100644 --- a/prototype/core/solver.cuh +++ b/prototype/core/solver.cuh @@ -528,42 +528,52 @@ __global__ void inject_to_islands_kernel(Sol* pop, int pop_size, int island_size // ============================================================ // v5.0 plan B3: inject_check_kernel — passive injection check // ============================================================ -// During migrate, GPU checks InjectBuffer; if new solution exists, inject at worst of first island -// atomicExch reads and clears flag atomically for thread safety +// During migrate, GPU checks InjectBuffer; if new solution exists, inject into +// target islands based on MultiGpuInjectMode. // // Design notes: -// 1. Single thread (thread 0 of block 0) to avoid races +// 1. Single thread (thread 0 of block 0) — serial over target islands (count is small) // 2. atomicExch reads flag and clears it so each solution is handled once -// 3. Inject only into first island (OneIsland strategy) to preserve diversity +// 3. Inject mode: OneIsland (island 0), HalfIslands (random half), AllIslands (all) // 4. Optional: if inject_buf is nullptr, skip (single-GPU unaffected) template __global__ void inject_check_kernel(Sol* pop, int pop_size, int island_size, - InjectBuffer* inject_buf, ObjConfig oc) { - // Single-thread execution + InjectBuffer* inject_buf, ObjConfig oc, + MultiGpuInjectMode mode) { if (threadIdx.x != 0 || blockIdx.x != 0) return; - - // No injection buffer — return (single-GPU case) if (inject_buf == nullptr) return; - - // Atomically read and clear flag (each solution processed once) + int flag = atomicExch(inject_buf->d_flag, 0); - - // No new solution — return if (flag != 1) return; - - // Read injected solution + Sol inject_sol = *(inject_buf->d_solution); - - // Find worst slot on first island + int num_islands = pop_size / island_size; if (num_islands == 0) return; - - int worst = find_worst_in_island(pop, 0, island_size, oc); - - // Replace if injection is better - if (is_better(inject_sol, pop[worst], oc)) { - pop[worst] = inject_sol; + + if (mode == MultiGpuInjectMode::OneIsland) { + int worst = find_worst_in_island(pop, 0, island_size, oc); + if (is_better(inject_sol, pop[worst], oc)) + pop[worst] = inject_sol; + + } else if (mode == MultiGpuInjectMode::AllIslands) { + for (int i = 0; i < num_islands; i++) { + int worst = find_worst_in_island(pop, i * island_size, island_size, oc); + if (is_better(inject_sol, pop[worst], oc)) + pop[worst] = inject_sol; + } + + } else { // HalfIslands — randomly select num_islands/2 islands + int half = (num_islands + 1) / 2; + unsigned seed = (unsigned)clock(); + for (int count = 0; count < half; count++) { + seed = seed * 1664525u + 1013904223u; // LCG + int isle = (int)(seed % (unsigned)num_islands); + int worst = find_worst_in_island(pop, isle * island_size, island_size, oc); + if (is_better(inject_sol, pop[worst], oc)) + pop[worst] = inject_sol; + } } } @@ -1247,7 +1257,8 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, // Must be outside Graph: inject_buf content changes dynamically if (inject_buf != nullptr && use_islands) { inject_check_kernel<<<1, 1>>>(pop.d_solutions, pop_size, - island_size, inject_buf, oc); + island_size, inject_buf, oc, + cfg.multi_gpu_inject_mode); CUDA_CHECK(cudaDeviceSynchronize()); }