From 714a2ee23b89830b2454f543498255b0ca3e3335 Mon Sep 17 00:00:00 2001
From: L-yang-yang <15251858055@163.com>
Date: Mon, 30 Mar 2026 20:42:32 +0800
Subject: [PATCH] feat(multi-gpu): implement inject distribution modes
 (OneIsland/HalfIslands/AllIslands)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

inject_check_kernel now respects MultiGpuInjectMode from SolverConfig instead of
hardcoding OneIsland. HalfIslands uses LCG-based random island selection.
Also fixes stale write_async calls in test_multi_gpu_b3.cu.

Verified on 2×V100S: all 5 B3 tests pass, e5 (12 problem types) all optimal.
---
 .../e9_multi_gpu_b3/test_multi_gpu_b3.cu      | 325 ++++++++++++++++++
 prototype/core/solver.cuh                     |  57 +--
 2 files changed, 359 insertions(+), 23 deletions(-)
 create mode 100644 benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu
diff --git a/benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu b/benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu
new file mode 100644
index 0000000..6f3858e
--- /dev/null
+++ b/benchmark/experiments/e9_multi_gpu_b3/test_multi_gpu_b3.cu
@@ -0,0 +1,325 @@
+/**
+ * test_multi_gpu_b3.cu - Functional tests for option B3 (passive injection)
+ *
+ * Coverage:
+ * 1. InjectBuffer basics: allocate, write, read, destroy
+ * 2. inject_check_kernel correctness: check and inject solutions correctly
+ * 3. Coordinator thread: periodically collect and inject global_best
+ * 4. End-to-end: full runs with 2 and 4 GPUs
+ * 5. Performance: benefit of B3 vs simplified v5.0
+ */
+
+#include "core/multi_gpu_solver.cuh"
+#include "problems/tsp.cuh"
+#include "problems/vrp.cuh"
+#include <cstdio>
+#include <cmath>
+
+// ============================================================
+// Helpers: generate test data
+// ============================================================
+
+void generate_random_tsp(float* dist, int n, unsigned seed = 42) {
+    srand(seed);
+    for (int i = 0; i < n; i++) {
+        dist[i * n + i] = 0.0f;
+        for (int j = i + 1; j < n; j++) {
+            float d = 10.0f + (rand() % 1000) / 10.0f;
+            dist[i * n + j] = d;
+            dist[j * n + i] = d;
+        }
+    }
+}
+
+void generate_random_vrp(float* dist, float* demand, int n, unsigned seed = 42) {
+    srand(seed);
+    int stride = n + 1;
+    for (int i = 0; i < stride; i++) {
+        dist[i * stride + i] = 0.0f;
+        for (int j = i + 1; j < stride; j++) {
+            float d = 10.0f + (rand() % 1000) / 10.0f;
+            dist[i * stride + j] = d;
+            dist[j * stride + i] = d;
+        }
+    }
+    for (int i = 0; i < n; i++) {
+        demand[i] = 5.0f + (rand() % 20);
+    }
+}
+
+// ============================================================
+// Test 1: InjectBuffer basics
+// ============================================================
+
+void test_inject_buffer() {
+    printf("\n=== Test 1: InjectBuffer Basic Functionality ===\n");
+    
+    using Sol = Solution<1, 32>;
+    
+    // Allocate InjectBuffer
+    auto buf = InjectBuffer<Sol>::allocate(0);
+    
+    // Build a test solution
+    Sol test_sol;
+    test_sol.dim2_sizes[0] = 5;
+    for (int i = 0; i < 5; i++) test_sol.data[0][i] = i + 10;
+    test_sol.objectives[0] = 123.45f;
+    test_sol.penalty = 0.0f;
+    
+    buf.write_sync(test_sol, 0);
+    
+    // Read flag (expect 1)
+    int flag;
+    cudaMemcpy(&flag, buf.d_flag, sizeof(int), cudaMemcpyDeviceToHost);
+    printf("  Flag after write: %d (expected 1)\n", flag);
+    
+    // Read solution back
+    Sol read_sol;
+    cudaMemcpy(&read_sol, buf.d_solution, sizeof(Sol), cudaMemcpyDeviceToHost);
+    printf("  Read solution: obj=%.2f, penalty=%.2f, data[0][0]=%d\n",
+           read_sol.objectives[0], read_sol.penalty, read_sol.data[0][0]);
+    
+    // Verify data consistency
+    bool ok = (fabs(read_sol.objectives[0] - 123.45f) < 1e-3) &&
+              (read_sol.data[0][0] == 10) &&
+              (flag == 1);
+    
+    printf("  Result: %s\n", ok ? "PASS" : "FAIL");
+    
+    // Cleanup
+    buf.destroy();
+}
+
+// ============================================================
+// Test 2: inject_check_kernel correctness
+// ============================================================
+
+void test_inject_check_kernel() {
+    printf("\n=== Test 2: inject_check_kernel Correctness ===\n");
+    
+    using Sol = Solution<1, 32>;
+    const int pop_size = 64;
+    const int island_size = 16;
+    
+    // Allocate population
+    Sol* d_pop;
+    cudaMalloc(&d_pop, sizeof(Sol) * pop_size);
+    
+    // Init population (all solutions obj=100.0)
+    Sol* h_pop = new Sol[pop_size];
+    for (int i = 0; i < pop_size; i++) {
+        h_pop[i].objectives[0] = 100.0f;
+        h_pop[i].penalty = 0.0f;
+    }
+    cudaMemcpy(d_pop, h_pop, sizeof(Sol) * pop_size, cudaMemcpyHostToDevice);
+    
+    // Create InjectBuffer and write a better solution (obj=50.0)
+    auto buf = InjectBuffer<Sol>::allocate(0);
+    Sol inject_sol;
+    inject_sol.objectives[0] = 50.0f;
+    inject_sol.penalty = 0.0f;
+    buf.write_sync(inject_sol, 0);
+    
+    // Copy InjectBuffer struct to device
+    InjectBuffer<Sol>* d_buf;
+    cudaMalloc(&d_buf, sizeof(InjectBuffer<Sol>));
+    cudaMemcpy(d_buf, &buf, sizeof(InjectBuffer<Sol>), cudaMemcpyHostToDevice);
+    
+    // Build ObjConfig
+    ObjConfig oc;
+    oc.num_obj = 1;
+    oc.mode = CompareMode::Weighted;
+    oc.dirs[0] = ObjDir::Minimize;
+    oc.weights[0] = 1.0f;
+    
+    // Launch inject_check_kernel (OneIsland mode for this unit test)
+    inject_check_kernel<<<1, 1>>>(d_pop, pop_size, island_size, d_buf, oc,
+                                   MultiGpuInjectMode::OneIsland);
+    cudaDeviceSynchronize();
+    
+    // Read population; check whether first island worst was replaced
+    cudaMemcpy(h_pop, d_pop, sizeof(Sol) * pop_size, cudaMemcpyDeviceToHost);
+    
+    int replaced_count = 0;
+    for (int i = 0; i < island_size; i++) {
+        if (fabs(h_pop[i].objectives[0] - 50.0f) < 1e-3) {
+            replaced_count++;
+        }
+    }
+    
+    printf("  Replaced count in first island: %d (expected 1)\n", replaced_count);
+    
+    // Check flag was cleared
+    int flag;
+    cudaMemcpy(&flag, buf.d_flag, sizeof(int), cudaMemcpyDeviceToHost);
+    printf("  Flag after inject_check: %d (expected 0)\n", flag);
+    
+    bool ok = (replaced_count == 1) && (flag == 0);
+    printf("  Result: %s\n", ok ? "PASS" : "FAIL");
+    
+    // Cleanup
+    buf.destroy();
+    cudaFree(d_buf);
+    cudaFree(d_pop);
+    delete[] h_pop;
+}
+
+// ============================================================
+// Test 3: 2-GPU end-to-end (small scale)
+// ============================================================
+
+void test_2gpu_tsp_small() {
+    printf("\n=== Test 3: 2 GPU TSP (n=30) ===\n");
+    
+    int device_count;
+    cudaGetDeviceCount(&device_count);
+    if (device_count < 2) {
+        printf("  SKIP: Need at least 2 GPUs\n");
+        return;
+    }
+    
+    const int n = 30;
+    float* h_dist = new float[n * n];
+    generate_random_tsp(h_dist, n, 12345);
+    
+    auto prob = TSPProblem::create(h_dist, n);
+    
+    SolverConfig cfg;
+    cfg.pop_size = 256;
+    cfg.max_gen = 2000;
+    cfg.verbose = true;
+    cfg.seed = 42;
+    cfg.num_islands = 4;
+    cfg.use_aos = true;
+    cfg.sa_temp_init = 10.0f;
+    cfg.use_cuda_graph = true;
+    
+    // Option B3: 2 GPUs with exchange
+    cfg.num_gpus = 2;
+    cfg.multi_gpu_interval_sec = 2.0f;  // exchange every 2 seconds
+    cfg.multi_gpu_inject_mode = MultiGpuInjectMode::OneIsland;
+    
+    auto result = solve_multi_gpu(prob, cfg);
+    
+    printf("  Result: obj=%.2f, penalty=%.2f\n",
+           result.best_solution.objectives[0],
+           result.best_solution.penalty);
+    
+    delete[] h_dist;
+}
+
+// ============================================================
+// Test 4: VRP with 2 GPUs (medium scale)
+// ============================================================
+
+void test_2gpu_vrp_medium() {
+    printf("\n=== Test 4: 2 GPU VRP (n=50) ===\n");
+    
+    int device_count;
+    cudaGetDeviceCount(&device_count);
+    if (device_count < 2) {
+        printf("  SKIP: Need at least 2 GPUs (have %d)\n", device_count);
+        return;
+    }
+    
+    const int n = 50;
+    float* h_dist = new float[(n+1) * (n+1)];
+    float* h_demand = new float[n];
+    generate_random_vrp(h_dist, h_demand, n, 23456);
+    
+    auto prob = VRPProblem::create(h_dist, h_demand, n, 150.0f, 8, 16);
+    
+    SolverConfig cfg;
+    cfg.pop_size = 512;
+    cfg.max_gen = 3000;
+    cfg.verbose = true;
+    cfg.seed = 42;
+    cfg.num_islands = 8;
+    cfg.use_aos = true;
+    cfg.sa_temp_init = 15.0f;
+    cfg.use_cuda_graph = true;
+    
+    // Option B3: 2 GPUs with exchange
+    cfg.num_gpus = 2;
+    cfg.multi_gpu_interval_sec = 3.0f;  // exchange every 3 seconds
+    cfg.multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands;
+    
+    auto result = solve_multi_gpu(prob, cfg);
+    
+    printf("  Result: obj=%.2f, penalty=%.2f\n",
+           result.best_solution.objectives[0],
+           result.best_solution.penalty);
+    
+    delete[] h_dist;
+    delete[] h_demand;
+}
+
+// ============================================================
+// Test 5: Performance comparison (B3 vs simplified)
+// ============================================================
+
+void test_performance_comparison() {
+    printf("\n=== Test 5: Performance Comparison (B3 vs Simplified) ===\n");
+    
+    int device_count;
+    cudaGetDeviceCount(&device_count);
+    if (device_count < 2) {
+        printf("  SKIP: Need at least 2 GPUs\n");
+        return;
+    }
+    
+    const int n = 40;
+    float* h_dist = new float[n * n];
+    generate_random_tsp(h_dist, n, 34567);
+    
+    auto prob = TSPProblem::create(h_dist, n);
+    
+    SolverConfig cfg;
+    cfg.pop_size = 512;
+    cfg.max_gen = 5000;
+    cfg.verbose = false;
+    cfg.seed = 42;
+    cfg.num_islands = 8;
+    cfg.use_aos = true;
+    cfg.sa_temp_init = 20.0f;
+    cfg.use_cuda_graph = true;
+    
+    // Multiple runs for averaging
+    const int num_runs = 5;
+    
+    printf("\n  Running %d times with 2 GPUs...\n", num_runs);
+    
+    // Option B3: with exchange
+    float b3_sum = 0.0f;
+    cfg.num_gpus = 2;
+    cfg.multi_gpu_interval_sec = 2.0f;
+    for (int run = 0; run < num_runs; run++) {
+        cfg.seed = 42 + run * 100;
+        auto result = solve_multi_gpu(prob, cfg);
+        b3_sum += result.best_solution.objectives[0];
+        printf("    Run %d: obj=%.2f\n", run+1, result.best_solution.objectives[0]);
+    }
+    float b3_avg = b3_sum / num_runs;
+    
+    printf("\n  B3 Average: %.2f\n", b3_avg);
+    
+    delete[] h_dist;
+}
+
+// ============================================================
+// Main
+// ============================================================
+
+int main() {
+    printf("Multi-GPU B3 (Passive Injection) Test Suite\n");
+    printf("============================================\n");
+    
+    test_inject_buffer();
+    test_inject_check_kernel();
+    test_2gpu_tsp_small();
+    test_2gpu_vrp_medium();
+    test_performance_comparison();
+    
+    printf("\n=== All Tests Completed ===\n");
+    return 0;
+}
diff --git a/prototype/core/solver.cuh b/prototype/core/solver.cuh
index 161bd4d..a5ff1fa 100644
--- a/prototype/core/solver.cuh
+++ b/prototype/core/solver.cuh
@@ -528,42 +528,52 @@ __global__ void inject_to_islands_kernel(Sol* pop, int pop_size, int island_size
 // ============================================================
 // v5.0 plan B3: inject_check_kernel — passive injection check
 // ============================================================
-// During migrate, GPU checks InjectBuffer; if new solution exists, inject at worst of first island
-// atomicExch reads and clears flag atomically for thread safety
+// During migrate, GPU checks InjectBuffer; if new solution exists, inject into
+// target islands based on MultiGpuInjectMode.
 //
 // Design notes:
-// 1. Single thread (thread 0 of block 0) to avoid races
+// 1. Single thread (thread 0 of block 0) — serial over target islands (count is small)
 // 2. atomicExch reads flag and clears it so each solution is handled once
-// 3. Inject only into first island (OneIsland strategy) to preserve diversity
+// 3. Inject mode: OneIsland (island 0), HalfIslands (random half), AllIslands (all)
 // 4. Optional: if inject_buf is nullptr, skip (single-GPU unaffected)
 
 template<typename Sol>
 __global__ void inject_check_kernel(Sol* pop, int pop_size, int island_size,
-                                     InjectBuffer<Sol>* inject_buf, ObjConfig oc) {
-    // Single-thread execution
+                                     InjectBuffer<Sol>* inject_buf, ObjConfig oc,
+                                     MultiGpuInjectMode mode) {
     if (threadIdx.x != 0 || blockIdx.x != 0) return;
-    
-    // No injection buffer — return (single-GPU case)
     if (inject_buf == nullptr) return;
-    
-    // Atomically read and clear flag (each solution processed once)
+
     int flag = atomicExch(inject_buf->d_flag, 0);
-    
-    // No new solution — return
     if (flag != 1) return;
-    
-    // Read injected solution
+
     Sol inject_sol = *(inject_buf->d_solution);
-    
-    // Find worst slot on first island
+
     int num_islands = pop_size / island_size;
     if (num_islands == 0) return;
-    
-    int worst = find_worst_in_island(pop, 0, island_size, oc);
-    
-    // Replace if injection is better
-    if (is_better(inject_sol, pop[worst], oc)) {
-        pop[worst] = inject_sol;
+
+    if (mode == MultiGpuInjectMode::OneIsland) {
+        int worst = find_worst_in_island(pop, 0, island_size, oc);
+        if (is_better(inject_sol, pop[worst], oc))
+            pop[worst] = inject_sol;
+
+    } else if (mode == MultiGpuInjectMode::AllIslands) {
+        for (int i = 0; i < num_islands; i++) {
+            int worst = find_worst_in_island(pop, i * island_size, island_size, oc);
+            if (is_better(inject_sol, pop[worst], oc))
+                pop[worst] = inject_sol;
+        }
+
+    } else {  // HalfIslands — randomly select num_islands/2 islands
+        int half = (num_islands + 1) / 2;
+        unsigned seed = (unsigned)clock();
+        for (int count = 0; count < half; count++) {
+            seed = seed * 1664525u + 1013904223u;  // LCG
+            int isle = (int)(seed % (unsigned)num_islands);
+            int worst = find_worst_in_island(pop, isle * island_size, island_size, oc);
+            if (is_better(inject_sol, pop[worst], oc))
+                pop[worst] = inject_sol;
+        }
     }
 }
 
@@ -1247,7 +1257,8 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         // Must be outside Graph: inject_buf content changes dynamically
         if (inject_buf != nullptr && use_islands) {
             inject_check_kernel<<<1, 1>>>(pop.d_solutions, pop_size,
-                                           island_size, inject_buf, oc);
+                                           island_size, inject_buf, oc,
+                                           cfg.multi_gpu_inject_mode);
             CUDA_CHECK(cudaDeviceSynchronize());
         }