mirror of
https://github.com/L-yang-yang/cugenopt.git
synced 2026-04-25 12:16:21 +02:00
Initial commit: cuGenOpt GPU optimization solver
This commit is contained in:
commit
fc5a0ff4af
117 changed files with 25545 additions and 0 deletions
54
python/cugenopt/__init__.py
Normal file
54
python/cugenopt/__init__.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
"""
|
||||
cuGenOpt — GPU-accelerated general-purpose metaheuristic solver
|
||||
|
||||
All problems (built-in and custom) use the same JIT compilation pipeline.
|
||||
First call to each problem type takes ~8s to compile; subsequent calls are cached.
|
||||
|
||||
Usage:
|
||||
import numpy as np
|
||||
import cugenopt
|
||||
|
||||
dist = np.random.rand(20, 20).astype(np.float32)
|
||||
dist = (dist + dist.T) / 2
|
||||
np.fill_diagonal(dist, 0)
|
||||
|
||||
result = cugenopt.solve_tsp(dist, time_limit=5.0, seed=42)
|
||||
print(f"Best distance: {result['objective']:.2f}")
|
||||
print(f"Route: {result['solution'][0]}")
|
||||
"""
|
||||
|
||||
from cugenopt.builtins import (
|
||||
solve_tsp,
|
||||
solve_knapsack,
|
||||
solve_qap,
|
||||
solve_assignment,
|
||||
solve_vrp,
|
||||
solve_vrptw,
|
||||
solve_graph_color,
|
||||
solve_bin_packing,
|
||||
solve_load_balance,
|
||||
gpu_info,
|
||||
)
|
||||
from cugenopt.jit import compile_and_solve as solve_custom, clear_cache
|
||||
from cugenopt.validation import CuGenOptValidationError, CuGenOptCompileError
|
||||
from cugenopt.operators import CustomOperator
|
||||
|
||||
__version__ = "0.2.0"
|
||||
|
||||
__all__ = [
|
||||
"solve_tsp",
|
||||
"solve_knapsack",
|
||||
"solve_qap",
|
||||
"solve_assignment",
|
||||
"solve_vrp",
|
||||
"solve_vrptw",
|
||||
"solve_graph_color",
|
||||
"solve_bin_packing",
|
||||
"solve_load_balance",
|
||||
"gpu_info",
|
||||
"solve_custom",
|
||||
"clear_cache",
|
||||
"CuGenOptValidationError",
|
||||
"CuGenOptCompileError",
|
||||
"CustomOperator",
|
||||
]
|
||||
486
python/cugenopt/builtins.py
Normal file
486
python/cugenopt/builtins.py
Normal file
|
|
@ -0,0 +1,486 @@
|
|||
"""
|
||||
Built-in problem solvers — thin wrappers around compile_and_solve().
|
||||
|
||||
Each solve_xxx() function provides pre-written CUDA code snippets for
|
||||
standard combinatorial optimization problems. Under the hood they all
|
||||
call the same JIT compilation pipeline.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
import numpy as np
|
||||
from cugenopt.jit import compile_and_solve
|
||||
from cugenopt.validation import (
|
||||
CuGenOptValidationError,
|
||||
validate_square_matrix,
|
||||
validate_1d,
|
||||
validate_positive_int,
|
||||
)
|
||||
|
||||
|
||||
def _solver_kwargs(kw: dict) -> dict:
|
||||
"""Extract solver config kwargs from user-provided dict."""
|
||||
keys = ["pop_size", "max_gen", "time_limit", "seed", "use_aos",
|
||||
"sa_temp_init", "verbose", "cuda_arch", "framework_root",
|
||||
"custom_operators"]
|
||||
return {k: kw[k] for k in keys if k in kw}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# TSP
|
||||
# ============================================================
|
||||
|
||||
_TSP_OBJ = """
|
||||
if (idx != 0) return 0.0f;
|
||||
float total = 0.0f;
|
||||
const int* route = sol.data[0];
|
||||
int size = sol.dim2_sizes[0];
|
||||
for (int i = 0; i < size; i++)
|
||||
total += d_dist[route[i] * _n + route[(i+1) % size]];
|
||||
return total;
|
||||
"""
|
||||
|
||||
def solve_tsp(dist_matrix: np.ndarray, **kw) -> Dict[str, Any]:
|
||||
"""Solve TSP. Pass distance matrix as NxN numpy float32 array.
|
||||
|
||||
Args:
|
||||
dist_matrix: NxN distance matrix (float32).
|
||||
**kw: Solver params — pop_size, max_gen, time_limit, seed, use_aos, verbose, ...
|
||||
|
||||
Returns:
|
||||
Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.
|
||||
"""
|
||||
dist = validate_square_matrix(dist_matrix, "dist_matrix")
|
||||
n = dist.shape[0]
|
||||
if n < 3:
|
||||
raise CuGenOptValidationError("TSP requires at least 3 cities")
|
||||
if n > 512:
|
||||
raise CuGenOptValidationError(
|
||||
f"TSP size {n} > 512 not supported yet. "
|
||||
f"Use solve_custom() for larger instances."
|
||||
)
|
||||
dim2 = 64 if n <= 64 else (256 if n <= 256 else 512)
|
||||
return compile_and_solve(
|
||||
compute_obj=_TSP_OBJ, data={"d_dist": dist},
|
||||
encoding="permutation", dim2=dim2, n=n,
|
||||
**_solver_kwargs(kw),
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Knapsack
|
||||
# ============================================================
|
||||
|
||||
_KNAPSACK_OBJ = """
|
||||
if (idx != 0) return 0.0f;
|
||||
float tv = 0.0f;
|
||||
const int* sel = sol.data[0];
|
||||
int size = sol.dim2_sizes[0];
|
||||
for (int i = 0; i < size; i++)
|
||||
if (sel[i]) tv += d_values[i];
|
||||
return tv;
|
||||
"""
|
||||
|
||||
def solve_knapsack(weights: np.ndarray, values: np.ndarray,
|
||||
capacity: float, **kw) -> Dict[str, Any]:
|
||||
"""Solve 0-1 Knapsack.
|
||||
|
||||
Args:
|
||||
weights: 1D array of item weights (float32).
|
||||
values: 1D array of item values (float32).
|
||||
capacity: Knapsack capacity.
|
||||
"""
|
||||
w = validate_1d(weights, "weights")
|
||||
v = validate_1d(values, "values", length=len(w))
|
||||
n = len(w)
|
||||
if capacity <= 0:
|
||||
raise CuGenOptValidationError(f"capacity must be > 0, got {capacity}")
|
||||
penalty_code = f"""
|
||||
float tw = 0.0f;
|
||||
const int* sel = sol.data[0];
|
||||
int size = sol.dim2_sizes[0];
|
||||
for (int i = 0; i < size; i++)
|
||||
if (sel[i]) tw += d_weights[i];
|
||||
float over = tw - {capacity}f;
|
||||
return (over > 0.0f) ? over : 0.0f;
|
||||
"""
|
||||
return compile_and_solve(
|
||||
compute_obj=_KNAPSACK_OBJ, compute_penalty=penalty_code,
|
||||
data={"d_weights": w, "d_values": v},
|
||||
encoding="binary", dim2=max(32, n), n=n,
|
||||
objectives=[("maximize", 1.0)],
|
||||
**_solver_kwargs(kw),
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# QAP
|
||||
# ============================================================
|
||||
|
||||
_QAP_OBJ = """
|
||||
if (idx != 0) return 0.0f;
|
||||
float cost = 0.0f;
|
||||
int size = sol.dim2_sizes[0];
|
||||
for (int i = 0; i < size; i++)
|
||||
for (int j = 0; j < size; j++)
|
||||
cost += d_flow[i * _n + j] * d_dist[sol.data[0][i] * _n + sol.data[0][j]];
|
||||
return cost;
|
||||
"""
|
||||
|
||||
def solve_qap(flow_matrix: np.ndarray, dist_matrix: np.ndarray,
|
||||
**kw) -> Dict[str, Any]:
|
||||
"""Solve Quadratic Assignment Problem.
|
||||
|
||||
Args:
|
||||
flow_matrix: NxN flow matrix (float32).
|
||||
dist_matrix: NxN distance matrix (float32).
|
||||
"""
|
||||
flow = validate_square_matrix(flow_matrix, "flow_matrix")
|
||||
dist = validate_square_matrix(dist_matrix, "dist_matrix")
|
||||
n = flow.shape[0]
|
||||
if dist.shape[0] != n:
|
||||
raise CuGenOptValidationError(
|
||||
f"flow_matrix ({n}x{n}) and dist_matrix ({dist.shape[0]}x{dist.shape[0]}) "
|
||||
f"must have the same dimensions"
|
||||
)
|
||||
return compile_and_solve(
|
||||
compute_obj=_QAP_OBJ,
|
||||
data={"d_flow": flow, "d_dist": dist},
|
||||
encoding="permutation", dim2=32, n=n,
|
||||
**_solver_kwargs(kw),
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Assignment
|
||||
# ============================================================
|
||||
|
||||
_ASSIGN_OBJ = """
|
||||
if (idx != 0) return 0.0f;
|
||||
float total = 0.0f;
|
||||
const int* assign = sol.data[0];
|
||||
int size = sol.dim2_sizes[0];
|
||||
for (int i = 0; i < size; i++)
|
||||
total += d_cost[i * _n + assign[i]];
|
||||
return total;
|
||||
"""
|
||||
|
||||
def solve_assignment(cost_matrix: np.ndarray, **kw) -> Dict[str, Any]:
|
||||
"""Solve Assignment Problem.
|
||||
|
||||
Args:
|
||||
cost_matrix: NxN cost matrix (float32).
|
||||
"""
|
||||
cost = validate_square_matrix(cost_matrix, "cost_matrix")
|
||||
n = cost.shape[0]
|
||||
return compile_and_solve(
|
||||
compute_obj=_ASSIGN_OBJ,
|
||||
data={"d_cost": cost},
|
||||
encoding="permutation", dim2=16, n=n,
|
||||
**_solver_kwargs(kw),
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# VRP (CVRP)
|
||||
# ============================================================
|
||||
|
||||
def solve_vrp(dist_matrix: np.ndarray, demand: np.ndarray,
              capacity: float, num_vehicles: int, **kw) -> Dict[str, Any]:
    """Solve Capacitated VRP.

    Args:
        dist_matrix: (N+1)x(N+1) distance matrix including depot at index 0.
        demand: 1D array of customer demands (length N, excluding depot).
        capacity: Vehicle capacity.
        num_vehicles: Number of vehicles.

    Returns:
        Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.
    """
    dist = validate_square_matrix(dist_matrix, "dist_matrix")
    n_nodes = dist.shape[0]
    # Customers are nodes 1..N in the matrix; the depot occupies index 0.
    n = n_nodes - 1
    dem = validate_1d(demand, "demand", length=n)
    num_vehicles = validate_positive_int(num_vehicles, "num_vehicles")
    if capacity <= 0:
        raise CuGenOptValidationError(f"capacity must be > 0, got {capacity}")
    stride = n_nodes
    # Popped (not read) so _solver_kwargs never forwards it to the solver.
    max_vehicles = kw.pop("max_vehicles", num_vehicles)

    # Objective: total travel distance over non-empty routes. Solution genes
    # are 0-based customer ids; "+ 1" maps them to distance-matrix nodes.
    # Each route implicitly starts and ends at the depot (node 0).
    obj_code = f"""
    if (idx != 0) return 0.0f;
    float total = 0.0f;
    for (int r = 0; r < {num_vehicles}; r++) {{
        int size = sol.dim2_sizes[r];
        if (size == 0) continue;
        float dist = 0.0f;
        int prev = 0;
        for (int j = 0; j < size; j++) {{
            int node = sol.data[r][j] + 1;
            dist += d_dist[prev * {stride} + node];
            prev = node;
        }}
        dist += d_dist[prev * {stride} + 0];
        total += dist;
    }}
    return total;
    """
    # Penalty: capacity overload weighted x100 and exceeding max_vehicles
    # weighted x1000 per extra route.
    # NOTE(review): the weights look hand-tuned — confirm before changing.
    penalty_code = f"""
    float penalty = 0.0f;
    int active = 0;
    for (int r = 0; r < {num_vehicles}; r++) {{
        int size = sol.dim2_sizes[r];
        if (size == 0) continue;
        active++;
        float load = 0.0f;
        for (int j = 0; j < size; j++)
            load += d_demand[sol.data[r][j]];
        if (load > {capacity}f)
            penalty += (load - {capacity}f) * 100.0f;
    }}
    if (active > {max_vehicles})
        penalty += (float)(active - {max_vehicles}) * 1000.0f;
    return penalty;
    """
    # row_mode="partition": the n customers are split across num_vehicles
    # rows without repetition; cross_row_prob allows moves between routes.
    return compile_and_solve(
        compute_obj=obj_code, compute_penalty=penalty_code,
        data={"d_dist": dist, "d_demand": dem},
        encoding="permutation", dim1=num_vehicles, dim2=64, n=n,
        row_mode="partition", total_elements=n, cross_row_prob=0.3,
        **_solver_kwargs(kw),
    )
|
||||
|
||||
|
||||
# ============================================================
|
||||
# VRPTW
|
||||
# ============================================================
|
||||
|
||||
def solve_vrptw(dist_matrix: np.ndarray, demand: np.ndarray,
                earliest: np.ndarray, latest: np.ndarray,
                service: np.ndarray, capacity: float,
                num_vehicles: int, **kw) -> Dict[str, Any]:
    """Solve VRP with Time Windows.

    Args:
        dist_matrix: (N+1)x(N+1) distance matrix including depot at index 0.
        demand: Customer demands (length N).
        earliest, latest, service: Time window arrays (length N+1, including depot).
        capacity: Vehicle capacity.
        num_vehicles: Number of vehicles.

    Returns:
        Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.
    """
    dist = validate_square_matrix(dist_matrix, "dist_matrix")
    n_nodes = dist.shape[0]
    # Customers are nodes 1..N; the depot occupies index 0. Note demand is
    # customers-only (length N) while the time-window arrays include the
    # depot (length N+1).
    n = n_nodes - 1
    dem = validate_1d(demand, "demand", length=n)
    ear = validate_1d(earliest, "earliest", length=n_nodes)
    lat = validate_1d(latest, "latest", length=n_nodes)
    svc = validate_1d(service, "service", length=n_nodes)
    num_vehicles = validate_positive_int(num_vehicles, "num_vehicles")
    if capacity <= 0:
        raise CuGenOptValidationError(f"capacity must be > 0, got {capacity}")
    stride = n_nodes
    # Popped (not read) so _solver_kwargs never forwards it to the solver.
    max_vehicles = kw.pop("max_vehicles", num_vehicles)

    # Objective: total travel distance, identical to solve_vrp.
    obj_code = f"""
    if (idx != 0) return 0.0f;
    float total = 0.0f;
    for (int r = 0; r < {num_vehicles}; r++) {{
        int size = sol.dim2_sizes[r];
        if (size == 0) continue;
        float dist = 0.0f;
        int prev = 0;
        for (int j = 0; j < size; j++) {{
            int node = sol.data[r][j] + 1;
            dist += d_dist[prev * {stride} + node];
            prev = node;
        }}
        dist += d_dist[prev * {stride} + 0];
        total += dist;
    }}
    return total;
    """
    # Penalty terms: capacity overload (x100), time-window lateness (x50,
    # including the return to the depot), extra vehicles (x1000 each).
    # Travel time is assumed equal to distance; arriving early waits until
    # the window opens (time clamped up to d_earliest).
    # NOTE(review): the weights look hand-tuned — confirm before changing.
    penalty_code = f"""
    float penalty = 0.0f;
    int active = 0;
    for (int r = 0; r < {num_vehicles}; r++) {{
        int size = sol.dim2_sizes[r];
        if (size == 0) continue;
        active++;
        float load = 0.0f;
        for (int j = 0; j < size; j++)
            load += d_demand[sol.data[r][j]];
        if (load > {capacity}f)
            penalty += (load - {capacity}f) * 100.0f;
        float time = 0.0f;
        int prev = 0;
        for (int j = 0; j < size; j++) {{
            int node = sol.data[r][j] + 1;
            time += d_dist[prev * {stride} + node];
            if (time < d_earliest[node]) time = d_earliest[node];
            if (time > d_latest[node])
                penalty += (time - d_latest[node]) * 50.0f;
            time += d_service[node];
            prev = node;
        }}
        float ret = time + d_dist[prev * {stride} + 0];
        if (ret > d_latest[0])
            penalty += (ret - d_latest[0]) * 50.0f;
    }}
    if (active > {max_vehicles})
        penalty += (float)(active - {max_vehicles}) * 1000.0f;
    return penalty;
    """
    # Same partitioned-permutation layout as solve_vrp.
    return compile_and_solve(
        compute_obj=obj_code, compute_penalty=penalty_code,
        data={"d_dist": dist, "d_demand": dem,
              "d_earliest": ear, "d_latest": lat, "d_service": svc},
        encoding="permutation", dim1=num_vehicles, dim2=64, n=n,
        row_mode="partition", total_elements=n, cross_row_prob=0.3,
        **_solver_kwargs(kw),
    )
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Graph Coloring
|
||||
# ============================================================
|
||||
|
||||
_GRAPHCOLOR_OBJ = """
|
||||
if (idx != 0) return 0.0f;
|
||||
int conflicts = 0;
|
||||
int size = sol.dim2_sizes[0];
|
||||
for (int i = 0; i < size; i++)
|
||||
for (int j = i + 1; j < size; j++)
|
||||
if (d_adj[i * _n + j] && sol.data[0][i] == sol.data[0][j])
|
||||
conflicts++;
|
||||
return (float)conflicts;
|
||||
"""
|
||||
|
||||
def solve_graph_color(adj_matrix: np.ndarray, num_colors: int,
|
||||
**kw) -> Dict[str, Any]:
|
||||
"""Solve Graph Coloring.
|
||||
|
||||
Args:
|
||||
adj_matrix: NxN adjacency matrix (int32, 1=edge, 0=no edge).
|
||||
num_colors: Number of colors available.
|
||||
"""
|
||||
adj = validate_square_matrix(adj_matrix, "adj_matrix", dtype=np.int32)
|
||||
n = adj.shape[0]
|
||||
num_colors = validate_positive_int(num_colors, "num_colors")
|
||||
return compile_and_solve(
|
||||
compute_obj=_GRAPHCOLOR_OBJ,
|
||||
int_data={"d_adj": adj},
|
||||
encoding="integer", dim2=64, n=n,
|
||||
value_lower=0, value_upper=num_colors - 1,
|
||||
**_solver_kwargs(kw),
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Bin Packing
|
||||
# ============================================================
|
||||
|
||||
def solve_bin_packing(item_weights: np.ndarray, max_bins: int,
                      bin_capacity: float, **kw) -> Dict[str, Any]:
    """Solve Bin Packing: minimize the number of bins used.

    Args:
        item_weights: 1D array of item weights (float32).
        max_bins: Maximum number of bins.
        bin_capacity: Capacity of each bin.

    Returns:
        Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.
    """
    w = validate_1d(item_weights, "item_weights")
    n = len(w)
    max_bins = validate_positive_int(max_bins, "max_bins")
    if bin_capacity <= 0:
        raise CuGenOptValidationError(f"bin_capacity must be > 0, got {bin_capacity}")

    # Objective: count bins that hold at least one item. O(max_bins * n)
    # scan per evaluation — acceptable since it runs on the GPU.
    obj_code = f"""
    if (idx != 0) return 0.0f;
    int used = 0;
    int size = sol.dim2_sizes[0];
    for (int b = 0; b < {max_bins}; b++) {{
        bool has = false;
        for (int i = 0; i < size; i++)
            if (sol.data[0][i] == b) {{ has = true; break; }}
        if (has) used++;
    }}
    return (float)used;
    """
    # Penalty: sum of per-bin overflow beyond bin_capacity (unweighted).
    penalty_code = f"""
    float penalty = 0.0f;
    int size = sol.dim2_sizes[0];
    for (int b = 0; b < {max_bins}; b++) {{
        float load = 0.0f;
        for (int i = 0; i < size; i++)
            if (sol.data[0][i] == b) load += d_weights[i];
        if (load > {bin_capacity}f)
            penalty += (load - {bin_capacity}f);
    }}
    return penalty;
    """
    # Integer encoding: one bin index per item in [0, max_bins - 1].
    return compile_and_solve(
        compute_obj=obj_code, compute_penalty=penalty_code,
        data={"d_weights": w},
        encoding="integer", dim2=64, n=n,
        value_lower=0, value_upper=max_bins - 1,
        **_solver_kwargs(kw),
    )
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Load Balancing
|
||||
# ============================================================
|
||||
|
||||
def solve_load_balance(proc_times: np.ndarray, num_machines: int,
                       **kw) -> Dict[str, Any]:
    """Solve Load Balancing (minimize makespan).

    Args:
        proc_times: 1D array of task processing times (float32).
        num_machines: Number of machines.

    Returns:
        Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.
    """
    p = validate_1d(proc_times, "proc_times")
    n = len(p)
    num_machines = validate_positive_int(num_machines, "num_machines")

    # Objective: makespan = max per-machine load. num_machines is baked into
    # the CUDA source at JIT time, so `loads` is a fixed-size local array.
    # NOTE(review): very large num_machines would inflate per-thread local
    # memory — presumably bounded in practice; confirm upstream limits.
    obj_code = f"""
    if (idx != 0) return 0.0f;
    float loads[{num_machines}];
    for (int m = 0; m < {num_machines}; m++) loads[m] = 0.0f;
    int size = sol.dim2_sizes[0];
    for (int i = 0; i < size; i++)
        loads[sol.data[0][i]] += d_proc[i];
    float makespan = 0.0f;
    for (int m = 0; m < {num_machines}; m++)
        if (loads[m] > makespan) makespan = loads[m];
    return makespan;
    """
    # Integer encoding: one machine index per task in [0, num_machines - 1].
    return compile_and_solve(
        compute_obj=obj_code,
        data={"d_proc": p},
        encoding="integer", dim2=64, n=n,
        value_lower=0, value_upper=num_machines - 1,
        **_solver_kwargs(kw),
    )
|
||||
|
||||
|
||||
# ============================================================
|
||||
# GPU info (pure Python, no JIT needed)
|
||||
# ============================================================
|
||||
|
||||
def gpu_info() -> Dict[str, Any]:
    """Query GPU device information via the ``nvidia-smi`` CLI.

    Returns:
        Dict with ``device_count`` (0 when no GPU or nvidia-smi is
        unavailable) and, when at least one device is present, the
        ``name``, ``compute_capability``, ``memory`` and
        ``driver_version`` of the first device.
    """
    import subprocess
    info: Dict[str, Any] = {"device_count": 0}
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=name,compute_cap,memory.total,driver_version",
             "--format=csv,noheader"],
            stderr=subprocess.DEVNULL, text=True,
        ).strip()
        # One CSV line per device. Previously this hardcoded
        # device_count = 1 and ignored every line after the first.
        lines = [ln for ln in out.split("\n") if ln.strip()]
        if not lines:
            return info
        info["device_count"] = len(lines)
        # Detailed fields still describe the first device only (backward
        # compatible with the original single-GPU report).
        parts = [p.strip() for p in lines[0].split(",")]
        info["name"] = parts[0]
        info["compute_capability"] = parts[1]
        info["memory"] = parts[2]
        info["driver_version"] = parts[3]
    except Exception:
        # Best-effort probe: missing binary / no driver → zero devices.
        pass
    return info
|
||||
90
python/cugenopt/include/core/cuda_utils.cuh
Normal file
90
python/cugenopt/include/core/cuda_utils.cuh
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* cuda_utils.cuh - CUDA 工具集
|
||||
*
|
||||
* 职责:错误检查、设备信息、随机数工具
|
||||
* 规则:所有 CUDA API 调用都必须用 CUDA_CHECK 包裹
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <curand_kernel.h>
|
||||
|
||||
// ============================================================
|
||||
// 错误检查
|
||||
// ============================================================
|
||||
|
||||
// Abort-on-error wrapper for synchronous CUDA runtime calls.
// Project rule: every CUDA API call must be wrapped in CUDA_CHECK.
#define CUDA_CHECK(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)
|
||||
|
||||
// Call after a kernel launch to surface launch failures
// (cudaGetLastError catches errors the launch itself reported).
#define CUDA_CHECK_LAST() do { \
    cudaError_t err = cudaGetLastError(); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA kernel error at %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)
|
||||
|
||||
// ============================================================
|
||||
// 设备信息
|
||||
// ============================================================
|
||||
|
||||
// Print basic properties of the currently selected CUDA device to stdout.
inline void print_device_info() {
    int device;
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDevice(&device));
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));

    printf("GPU: %s\n", prop.name);
    printf("  SM count:        %d\n", prop.multiProcessorCount);
    printf("  Max threads/SM:  %d\n", prop.maxThreadsPerMultiProcessor);
    printf("  Shared mem/blk:  %zu KB\n", prop.sharedMemPerBlock / 1024);
    printf("  Global mem:      %.1f GB\n", prop.totalGlobalMem / 1e9);
    printf("  Compute cap:     %d.%d\n", prop.major, prop.minor);
}
|
||||
|
||||
// ============================================================
|
||||
// 随机数工具 (Device 端)
|
||||
// ============================================================
|
||||
|
||||
// Initialize one curand state per thread: the seed is shared while the
// thread id selects the subsequence, so streams are independent per thread.
__global__ void init_curand_kernel(curandState* states, unsigned long long seed, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        curand_init(seed, tid, 0, &states[tid]);
    }
}
|
||||
|
||||
// Device-side: random integer in [0, bound).
// NOTE(review): plain modulo has a slight bias whenever bound does not
// divide 2^32, and bound must be > 0 (no guard here). Likely acceptable
// for metaheuristic move selection — confirm if reused elsewhere.
__device__ inline int rand_int(curandState* state, int bound) {
    return curand(state) % bound;
}
|
||||
|
||||
// Device 端:Fisher-Yates shuffle,对 arr[0..n-1] 做随机排列
|
||||
__device__ inline void shuffle(int* arr, int n, curandState* state) {
|
||||
for (int i = n - 1; i > 0; i--) {
|
||||
int j = rand_int(state, i + 1);
|
||||
int tmp = arr[i];
|
||||
arr[i] = arr[j];
|
||||
arr[j] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Kernel 启动参数计算
|
||||
// ============================================================
|
||||
|
||||
// Integer ceiling division: smallest k such that k * b >= a (a, b > 0).
inline int div_ceil(int a, int b) {
    return (a + b - 1) / b;
}

// Grid size (block count) needed to cover n work items.
inline int calc_grid_size(int n, int block_size = 256) {
    return div_ceil(n, block_size);
}
|
||||
141
python/cugenopt/include/core/gpu_cache.cuh
Normal file
141
python/cugenopt/include/core/gpu_cache.cuh
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
/**
|
||||
* gpu_cache.cuh - GPU 全局内存哈希表(通用缓存组件)
|
||||
*
|
||||
* 设计:
|
||||
* - 开放寻址,固定容量(power of 2),线性探测
|
||||
* - key = uint64_t(由 Problem 自行计算 hash)
|
||||
* - value = float(单个指标值)
|
||||
* - 无锁:允许 race condition(缓存语义,偶尔脏读可接受)
|
||||
* - 自带命中/未命中原子计数器
|
||||
*
|
||||
* 用法:
|
||||
* GpuCache cache = GpuCache::allocate(65536); // host
|
||||
* // ... pass cache as Problem member to kernels ...
|
||||
* cache.print_stats(); // host
|
||||
* cache.destroy(); // host
|
||||
*
|
||||
* 参考:scute 项目 LRUCache(key = metric_type + content_hash)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "cuda_utils.cuh"
|
||||
#include <cstdint>
|
||||
|
||||
// ============================================================
|
||||
// 常量
|
||||
// ============================================================
|
||||
|
||||
static constexpr uint64_t CACHE_EMPTY_KEY = 0xFFFFFFFFFFFFFFFFULL;
|
||||
static constexpr int CACHE_MAX_PROBE = 8; // 最大线性探测步数
|
||||
|
||||
// ============================================================
|
||||
// GpuCache 结构体(POD,可安全拷贝到 kernel)
|
||||
// ============================================================
|
||||
|
||||
// GPU global-memory hash table used as a best-effort evaluation cache.
// Open addressing with linear probing, fixed power-of-two capacity.
// Deliberately lock-free: races between threads are tolerated because a
// stale or lost entry only costs a recomputation, never correctness.
struct GpuCache {
    uint64_t* keys;          // device global memory (CACHE_EMPTY_KEY = free slot)
    float* values;           // device global memory
    unsigned int* d_hits;    // atomic hit counter (device)
    unsigned int* d_misses;  // atomic miss counter (device)
    int capacity;            // must be a power of 2
    int mask;                // = capacity - 1, for cheap modulo

    // ---- host-side operations ----

    // Allocate device storage and reset all slots/counters.
    // `cap` must be a power of 2 (mask arithmetic relies on it).
    static GpuCache allocate(int cap = 65536) {
        GpuCache c;
        c.capacity = cap;
        c.mask = cap - 1;
        CUDA_CHECK(cudaMalloc(&c.keys, sizeof(uint64_t) * cap));
        CUDA_CHECK(cudaMalloc(&c.values, sizeof(float) * cap));
        CUDA_CHECK(cudaMalloc(&c.d_hits, sizeof(unsigned int)));
        CUDA_CHECK(cudaMalloc(&c.d_misses, sizeof(unsigned int)));
        c.clear();
        return c;
    }

    // Null-object variant: all pointers null, so is_enabled() is false and
    // kernels can skip cache logic without branching on a separate flag.
    static GpuCache disabled() {
        GpuCache c;
        c.keys = nullptr; c.values = nullptr;
        c.d_hits = nullptr; c.d_misses = nullptr;
        c.capacity = 0; c.mask = 0;
        return c;
    }

    bool is_enabled() const { return keys != nullptr; }

    // Reset every slot to CACHE_EMPTY_KEY (0xFF bytes) and zero the counters.
    void clear() {
        CUDA_CHECK(cudaMemset(keys, 0xFF, sizeof(uint64_t) * capacity));
        CUDA_CHECK(cudaMemset(d_hits, 0, sizeof(unsigned int)));
        CUDA_CHECK(cudaMemset(d_misses, 0, sizeof(unsigned int)));
    }

    // Free device storage; safe to call on a disabled() instance.
    void destroy() {
        if (keys) cudaFree(keys);
        if (values) cudaFree(values);
        if (d_hits) cudaFree(d_hits);
        if (d_misses) cudaFree(d_misses);
        keys = nullptr; values = nullptr;
        d_hits = nullptr; d_misses = nullptr;
    }

    // Copy hit/miss counters back to the host and print a summary.
    void print_stats() const {
        if (!keys) { printf(" Cache: disabled\n"); return; }
        unsigned int h = 0, m = 0;
        CUDA_CHECK(cudaMemcpy(&h, d_hits, sizeof(unsigned int), cudaMemcpyDeviceToHost));
        CUDA_CHECK(cudaMemcpy(&m, d_misses, sizeof(unsigned int), cudaMemcpyDeviceToHost));
        unsigned int total = h + m;
        float rate = total > 0 ? (float)h / total * 100.0f : 0.0f;
        printf(" Cache: %u lookups | %u hits + %u misses | hit rate = %.1f%%\n",
               total, h, m, rate);
        printf(" Cache: capacity = %d entries (%.1f KB)\n",
               capacity, capacity * (sizeof(uint64_t) + sizeof(float)) / 1024.0f);
    }
};
|
||||
|
||||
// ============================================================
|
||||
// Device 函数:哈希 / 查找 / 插入
|
||||
// ============================================================
|
||||
|
||||
/// FNV-1a 哈希:对一段有序 int 序列(如路线中的客户 ID)
|
||||
__device__ inline uint64_t route_hash(const int* data, int len) {
|
||||
uint64_t h = 14695981039346656037ULL; // FNV offset basis
|
||||
for (int i = 0; i < len; i++) {
|
||||
h ^= (uint64_t)(unsigned int)data[i];
|
||||
h *= 1099511628211ULL; // FNV prime
|
||||
}
|
||||
return (h == CACHE_EMPTY_KEY) ? h - 1 : h; // 避免与哨兵值碰撞
|
||||
}
|
||||
|
||||
/// Lookup: returns true and writes the cached value to `out` on a hit.
/// Probes at most CACHE_MAX_PROBE consecutive slots; an empty slot proves
/// the key was never inserted along this probe chain, so we stop early.
__device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out) {
    int slot = (int)(key & (uint64_t)c.mask);
    for (int p = 0; p < CACHE_MAX_PROBE; p++) {
        int idx = (slot + p) & c.mask;
        uint64_t k = c.keys[idx];
        if (k == key) {
            out = c.values[idx];
            return true;
        }
        if (k == CACHE_EMPTY_KEY) return false;  // empty slot → definitely absent
    }
    return false;  // probe budget exhausted
}
|
||||
|
||||
/// Insert a key-value pair; an existing entry with the same key is
/// overwritten. If all CACHE_MAX_PROBE slots are occupied by other keys,
/// the first probed slot is evicted. Writes are intentionally non-atomic:
/// a concurrent reader may observe a mismatched key/value pair, which the
/// cache's best-effort semantics accept (worst case: one wrong/stale hit).
__device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value) {
    int slot = (int)(key & (uint64_t)c.mask);
    for (int p = 0; p < CACHE_MAX_PROBE; p++) {
        int idx = (slot + p) & c.mask;
        uint64_t k = c.keys[idx];
        if (k == CACHE_EMPTY_KEY || k == key) {
            c.keys[idx] = key;
            c.values[idx] = value;
            return;
        }
    }
    // Probe chain full: evict the entry in the first probed slot.
    int idx = slot & c.mask;
    c.keys[idx] = key;
    c.values[idx] = value;
}
|
||||
121
python/cugenopt/include/core/init_heuristic.cuh
Normal file
121
python/cugenopt/include/core/init_heuristic.cuh
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
|
||||
namespace heuristic_init {

// Single-permutation layout: every row receives the same full-length
// permutation `order` (used when rows are independent copies).
template<typename Sol>
static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
                                     int dim1, int dim2) {
    for (int r = 0; r < dim1; r++) {
        sol.dim2_sizes[r] = dim2;
        for (int c = 0; c < dim2; c++)
            sol.data[r][c] = order[c];
    }
    sol.penalty = 0.0f;
    for (int i = 0; i < MAX_OBJ; i++) sol.objectives[i] = 0.0f;
}

// Partition layout: the permutation is split evenly across dim1 rows with
// no repeated elements; the first (total_elements % dim1) rows get one
// extra element.
template<typename Sol>
static void build_partition_from_order(Sol& sol, const std::vector<int>& order,
                                       int dim1, int total_elements) {
    int idx = 0;
    for (int r = 0; r < dim1; r++) {
        int count = total_elements / dim1;
        if (r < total_elements % dim1) count++;
        sol.dim2_sizes[r] = count;
        for (int c = 0; c < count; c++)
            sol.data[r][c] = order[idx++];
    }
    sol.penalty = 0.0f;
    for (int i = 0; i < MAX_OBJ; i++) sol.objectives[i] = 0.0f;
}

// Build heuristic seed solutions from problem matrices: for each matrix,
// emit four candidates ordered by ascending/descending row sums and
// ascending/descending column sums. Permutation encoding only; returns an
// empty vector for any other encoding or degenerate sizes.
template<typename Sol>
std::vector<Sol> build_from_matrices(const HeuristicMatrix* matrices, int num_matrices,
                                     int dim1, int dim2, EncodingType encoding,
                                     bool partition_mode = false, int total_elements = 0) {
    std::vector<Sol> results;
    if (encoding != EncodingType::Permutation) return results;
    int elem_count = partition_mode ? total_elements : dim2;
    if (num_matrices <= 0 || elem_count <= 0) return results;

    auto make_sol = [&](const std::vector<int>& order) {
        Sol sol{};
        if (partition_mode)
            build_partition_from_order(sol, order, dim1, total_elements);
        else
            build_sorted_permutation(sol, order, dim1, dim2);
        return sol;
    };

    for (int m = 0; m < num_matrices; m++) {
        const float* mat = matrices[m].data;
        int N = matrices[m].N;
        if (!mat || N < elem_count) continue;

        std::vector<float> row_sum(N, 0.0f);
        std::vector<float> col_sum(N, 0.0f);
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                row_sum[i] += mat[i * N + j];
                col_sum[j] += mat[i * N + j];
            }

        // For partition mode (e.g. VRPTW) the distance matrix includes the
        // depot at index 0: sort only the customers (indices 1..N-1) and
        // later shift them to 0-based customer ids via to_customer().
        std::vector<int> idx;
        if (partition_mode && N > elem_count) {
            for (int i = 1; i <= elem_count; i++) idx.push_back(i);
        } else {
            idx.resize(elem_count);
            std::iota(idx.begin(), idx.end(), 0);
        }

        auto to_customer = [&](std::vector<int>& order) {
            if (partition_mode && N > elem_count) {
                for (auto& v : order) v -= 1;
            }
        };

        // Candidate 1: row_sum ascending
        {
            auto order = idx;
            std::sort(order.begin(), order.end(),
                      [&](int a, int b) { return row_sum[a] < row_sum[b]; });
            to_customer(order);
            results.push_back(make_sol(order));
        }
        // Candidate 2: row_sum descending
        {
            auto order = idx;
            std::sort(order.begin(), order.end(),
                      [&](int a, int b) { return row_sum[a] > row_sum[b]; });
            to_customer(order);
            results.push_back(make_sol(order));
        }
        // Candidate 3: col_sum ascending
        {
            auto order = idx;
            std::sort(order.begin(), order.end(),
                      [&](int a, int b) { return col_sum[a] < col_sum[b]; });
            to_customer(order);
            results.push_back(make_sol(order));
        }
        // Candidate 4: col_sum descending
        {
            auto order = idx;
            std::sort(order.begin(), order.end(),
                      [&](int a, int b) { return col_sum[a] > col_sum[b]; });
            to_customer(order);
            results.push_back(make_sol(order));
        }
    }
    return results;
}

} // namespace heuristic_init
|
||||
258
python/cugenopt/include/core/init_selection.cuh
Normal file
258
python/cugenopt/include/core/init_selection.cuh
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
/**
|
||||
* init_selection.cuh - 初始解采样择优 + NSGA-II 选择
|
||||
*
|
||||
* Host 端逻辑,在 solver 初始化阶段调用一次。
|
||||
* 从 K × pop_size 个候选解中选出 pop_size 个作为初始种群。
|
||||
*
|
||||
* 选择策略:
|
||||
* 1. 核心目标预留名额(按 importance 分配)
|
||||
* 2. NSGA-II 选择(非支配排序 + 加权拥挤度)
|
||||
* 3. 纯随机保底(多样性)
|
||||
*
|
||||
* 单目标时自动退化为 top-N 排序,无需分支。
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
|
||||
namespace init_sel {
|
||||
|
||||
// ============================================================
|
||||
// 候选解的目标信息(从 GPU 下载后在 host 端使用)
|
||||
// ============================================================
|
||||
// Per-candidate objective snapshot, downloaded from the GPU and used host-side
// during initial-population selection.
struct CandidateInfo {
    int idx;              // original index in the candidate array
    float objs[MAX_OBJ];  // normalized objective values (smaller is better)
    float penalty;        // constraint violation; <= 0 is treated as feasible by dominates()
    int rank;             // non-dominated sort level (0 = Pareto front)
    float crowding;       // crowding distance (filled by weighted_crowding_distance)
    bool selected;        // whether this candidate has already been picked
};
|
||||
|
||||
// ============================================================
|
||||
// 非支配排序(Fast Non-dominated Sort)
|
||||
// ============================================================
|
||||
// 复杂度:O(M × N²),M = 目标数,N = 候选数
|
||||
// 对初始化场景(N ≤ 几千,M ≤ 4)完全可接受
|
||||
|
||||
// Fast non-dominated sort (NSGA-II style).
// Fills `fronts` with candidate indices layer by layer (fronts[0] = Pareto front)
// and writes each candidate's layer into cands[i].rank.
// Complexity: O(M * N^2) with M = num_obj, N = #candidates — acceptable for the
// initialization phase (N up to a few thousand, M <= 4).
inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
                                   int num_obj,
                                   std::vector<std::vector<int>>& fronts) {
    int n = (int)cands.size();
    std::vector<int> dom_count(n, 0);          // how many solutions dominate i
    std::vector<std::vector<int>> dom_set(n);  // which solutions i dominates

    // a dominates b iff a is <= b on all objectives and < on at least one.
    // Penalty takes precedence: feasible dominates infeasible; among infeasible
    // solutions, the smaller penalty dominates.
    auto dominates = [&](int a, int b) -> bool {
        const auto& ca = cands[a];
        const auto& cb = cands[b];
        // penalty handling
        if (ca.penalty <= 0.0f && cb.penalty > 0.0f) return true;
        if (ca.penalty > 0.0f && cb.penalty <= 0.0f) return false;
        if (ca.penalty > 0.0f && cb.penalty > 0.0f) return ca.penalty < cb.penalty;

        bool all_leq = true;
        bool any_lt = false;
        for (int m = 0; m < num_obj; m++) {
            if (ca.objs[m] > cb.objs[m]) { all_leq = false; break; }
            if (ca.objs[m] < cb.objs[m]) any_lt = true;
        }
        return all_leq && any_lt;
    };

    // Compute pairwise domination relations.
    for (int i = 0; i < n; i++) {
        for (int j = i + 1; j < n; j++) {
            if (dominates(i, j)) {
                dom_set[i].push_back(j);
                dom_count[j]++;
            } else if (dominates(j, i)) {
                dom_set[j].push_back(i);
                dom_count[i]++;
            }
        }
    }

    // Peel off fronts: layer 0 = candidates dominated by nobody.
    fronts.clear();
    std::vector<int> current_front;
    for (int i = 0; i < n; i++) {
        if (dom_count[i] == 0) {
            cands[i].rank = 0;
            current_front.push_back(i);
        }
    }

    int front_idx = 0;
    while (!current_front.empty()) {
        fronts.push_back(current_front);
        std::vector<int> next_front;
        for (int i : current_front) {
            for (int j : dom_set[i]) {
                // Removing front members frees their dominated solutions.
                dom_count[j]--;
                if (dom_count[j] == 0) {
                    cands[j].rank = front_idx + 1;
                    next_front.push_back(j);
                }
            }
        }
        current_front = next_front;
        front_idx++;
    }
}
|
||||
|
||||
// ============================================================
|
||||
// 加权拥挤度距离
|
||||
// ============================================================
|
||||
// 标准拥挤度 + importance 加权:核心目标维度上的间距贡献更大
|
||||
|
||||
// Weighted crowding distance for one front.
// Standard NSGA-II crowding, with each objective's gap contribution scaled by
// importance[m] so that spread along core objectives counts for more.
// Writes the result into cands[i].crowding for every i in `front`.
inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
                                       const std::vector<int>& front,
                                       int num_obj,
                                       const float* importance) {
    int n = (int)front.size();
    if (n <= 2) {
        for (int i : front) cands[i].crowding = 1e18f;  // boundary-only front: keep all
        return;
    }

    for (int i : front) cands[i].crowding = 0.0f;

    std::vector<int> sorted_idx(front.begin(), front.end());

    for (int m = 0; m < num_obj; m++) {
        // Sort the front by objective m.
        std::sort(sorted_idx.begin(), sorted_idx.end(),
                  [&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });

        float range = cands[sorted_idx[n-1]].objs[m] - cands[sorted_idx[0]].objs[m];
        if (range < 1e-12f) continue;  // objective has no discriminating power

        // Boundary solutions get effectively infinite distance (always kept).
        cands[sorted_idx[0]].crowding += 1e18f;
        cands[sorted_idx[n-1]].crowding += 1e18f;

        // Interior solutions: neighbor gap, normalized and importance-weighted.
        float w = importance[m];
        for (int i = 1; i < n - 1; i++) {
            float gap = cands[sorted_idx[i+1]].objs[m] - cands[sorted_idx[i-1]].objs[m];
            cands[sorted_idx[i]].crowding += w * (gap / range);
        }
    }
}
|
||||
|
||||
// ============================================================
|
||||
// 主选择函数:从 N 个候选中选出 target 个
|
||||
// ============================================================
|
||||
// 返回被选中的候选索引
|
||||
|
||||
// Main selection: pick (target - num_reserved_random) candidates from `cands`;
// the caller fills the remaining num_reserved_random slots with pure-random
// solutions for diversity.
// Stage 1 reserves per-objective quotas (importance-weighted), stage 2 fills the
// rest via NSGA-II (non-dominated sort + weighted crowding truncation).
// Marks picks via cands[i].selected and returns their indices.
inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
                                     int num_obj,
                                     const float* importance,
                                     int target,
                                     int num_reserved_random) {
    // --- 1. Reserved quotas for core objectives ---
    int num_reserve_total = target - num_reserved_random;
    // Reserve importance[m] * 30% of the slots per objective; the rest go to NSGA-II.
    float reserve_ratio = 0.3f;

    std::vector<int> selected;
    selected.reserve(target);

    // For each objective, take the top candidates by that objective alone.
    for (int m = 0; m < num_obj; m++) {
        int quota = (int)(num_reserve_total * importance[m] * reserve_ratio);
        if (quota < 1 && num_obj > 1) quota = 1;  // at least one per objective

        // Sort by objective m (smaller is better).
        std::vector<int> by_obj(cands.size());
        for (int i = 0; i < (int)cands.size(); i++) by_obj[i] = i;
        std::sort(by_obj.begin(), by_obj.end(),
                  [&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });

        int added = 0;
        for (int i = 0; i < (int)by_obj.size() && added < quota; i++) {
            int idx = by_obj[i];
            if (!cands[idx].selected) {
                cands[idx].selected = true;
                selected.push_back(idx);
                added++;
            }
        }
    }

    // --- 2. NSGA-II fills the remaining slots ---
    int remaining = target - num_reserved_random - (int)selected.size();

    if (remaining > 0) {
        // Non-dominated sorting.
        std::vector<std::vector<int>> fronts;
        fast_nondominated_sort(cands, num_obj, fronts);

        for (auto& front : fronts) {
            if (remaining <= 0) break;

            // Skip candidates already taken by the reserved quotas.
            std::vector<int> available;
            for (int i : front) {
                if (!cands[i].selected) available.push_back(i);
            }

            if ((int)available.size() <= remaining) {
                // The whole front fits.
                for (int i : available) {
                    cands[i].selected = true;
                    selected.push_back(i);
                    remaining--;
                }
            } else {
                // Front must be truncated: keep the best-spread by weighted crowding.
                weighted_crowding_distance(cands, available, num_obj, importance);
                std::sort(available.begin(), available.end(),
                          [&](int a, int b) { return cands[a].crowding > cands[b].crowding; });
                for (int i = 0; i < remaining; i++) {
                    cands[available[i]].selected = true;
                    selected.push_back(available[i]);
                }
                remaining = 0;
            }
        }
    }

    return selected;
}
|
||||
|
||||
// ============================================================
|
||||
// 单目标快速路径:直接按标量排序取 top
|
||||
// ============================================================
|
||||
// Single-objective fast path: rank candidates by one scalar key and take the top
// (target - num_reserved_random). Feasible candidates (penalty <= 0) always come
// before infeasible ones; infeasible candidates order by ascending penalty and
// feasible ones by ascending objs[0] (already normalized, smaller is better).
// Marks the picks via cands[i].selected and returns their indices.
inline std::vector<int> top_n_select(std::vector<CandidateInfo>& cands,
                                     int target,
                                     int num_reserved_random) {
    const int quota = target - num_reserved_random;

    std::vector<int> order;
    order.reserve(cands.size());
    for (int i = 0; i < (int)cands.size(); i++) order.push_back(i);

    auto better = [&](int lhs, int rhs) {
        const CandidateInfo& a = cands[lhs];
        const CandidateInfo& b = cands[rhs];
        const bool a_infeasible = a.penalty > 0.0f;
        const bool b_infeasible = b.penalty > 0.0f;
        if (a_infeasible != b_infeasible) return b_infeasible;  // feasible first
        if (a_infeasible) return a.penalty < b.penalty;         // less violation first
        return a.objs[0] < b.objs[0];                           // better objective first
    };
    std::sort(order.begin(), order.end(), better);

    const int take = quota < (int)order.size() ? quota : (int)order.size();
    std::vector<int> picked;
    picked.reserve(quota);
    for (int i = 0; i < take; i++) {
        const int c = order[i];
        cands[c].selected = true;
        picked.push_back(c);
    }
    return picked;
}
|
||||
|
||||
} // namespace init_sel
|
||||
1229
python/cugenopt/include/core/operators.cuh
Normal file
1229
python/cugenopt/include/core/operators.cuh
Normal file
File diff suppressed because it is too large
Load diff
212
python/cugenopt/include/core/population.cuh
Normal file
212
python/cugenopt/include/core/population.cuh
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
/**
|
||||
* population.cuh - 种群管理
|
||||
*
|
||||
* v2.0: Block 级架构
|
||||
* - RNG 数组大小 = pop_size * block_size(每个 block 内每个线程独立 RNG)
|
||||
* - 初始化 kernel 保持 1-thread-per-solution(初始化只做一次,不需要并行)
|
||||
* - find_best_kernel 保持单线程(种群规模不大)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
// ============================================================
|
||||
// Device 端 Kernel(模板化)
|
||||
// ============================================================
|
||||
|
||||
// One thread per solution: fill each row with the identity permutation
// 0..dim2_default-1, then Fisher-Yates shuffle it with the thread's RNG.
template<typename Sol>
__global__ void init_permutation_kernel(Sol* pop, int pop_size,
                                        int dim1, int dim2_default,
                                        curandState* rng_states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;
    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    for (int r = 0; r < dim1; r++) {
        sol.dim2_sizes[r] = dim2_default;
        for (int c = 0; c < dim2_default; c++) sol.data[r][c] = c;
        shuffle(sol.data[r], dim2_default, rng);
    }
    sol.penalty = 0.0f;
}
|
||||
|
||||
// One thread per solution: fill every cell with an independent random bit (0/1).
template<typename Sol>
__global__ void init_binary_kernel(Sol* pop, int pop_size,
                                   int dim1, int dim2_default,
                                   curandState* rng_states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;
    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    for (int r = 0; r < dim1; r++) {
        sol.dim2_sizes[r] = dim2_default;
        for (int c = 0; c < dim2_default; c++) sol.data[r][c] = curand(rng) % 2;
    }
    sol.penalty = 0.0f;
}
|
||||
|
||||
// One thread per solution: fill every cell with a uniform random integer in
// [lb, ub] (inclusive). NOTE(review): `curand(rng) % range` carries a slight
// modulo bias; presumably acceptable for initialization.
template<typename Sol>
__global__ void init_integer_kernel(Sol* pop, int pop_size,
                                    int dim1, int dim2_default,
                                    int lb, int ub,
                                    curandState* rng_states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;
    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    int range = ub - lb + 1;
    for (int r = 0; r < dim1; r++) {
        sol.dim2_sizes[r] = dim2_default;
        for (int c = 0; c < dim2_default; c++)
            sol.data[r][c] = lb + (curand(rng) % range);
    }
    sol.penalty = 0.0f;
}
|
||||
|
||||
// ============================================================
|
||||
// 多重集排列初始化 — 每个值 [0, N) 重复 R 次,总长度 N*R
|
||||
// ============================================================
|
||||
// 用于 JSP 工序排列编码:N=num_jobs, R=num_ops,值 j 出现 R 次表示工件 j
|
||||
|
||||
// Multiset-permutation initialization: each value in [0, num_values) appears
// repeat_count times, total row length num_values * repeat_count, then shuffled.
// Used for JSP operation-sequence encoding: value j occurring repeat_count times
// stands for job j's operations.
template<typename Sol>
__global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
                                          int dim1, int num_values, int repeat_count,
                                          curandState* rng_states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;
    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    int total = num_values * repeat_count;
    for (int r = 0; r < dim1; r++) {
        sol.dim2_sizes[r] = total;
        int idx = 0;
        for (int v = 0; v < num_values; v++)
            for (int k = 0; k < repeat_count; k++)
                sol.data[r][idx++] = v;
        shuffle(sol.data[r], total, rng);
    }
    sol.penalty = 0.0f;
}
|
||||
|
||||
// ============================================================
|
||||
// 分区初始化 — 元素 {0..total_elements-1} 不重复分配到 dim1 行
|
||||
// ============================================================
|
||||
|
||||
// Partition initialization: distribute elements {0..total_elements-1}, each
// exactly once, across dim1 rows with near-equal sizes.
// Trick: row 0's buffer is used as scratch holding the full shuffled sequence;
// row 0 then keeps its own prefix in place (no copy needed, dim2_sizes[0] caps
// it), and rows r > 0 copy their slice out of that scratch.
template<typename Sol>
__global__ void init_partition_kernel(Sol* pop, int pop_size,
                                      int dim1, int total_elements,
                                      curandState* rng_states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;
    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];

    // Build and shuffle the full element sequence in row 0's buffer.
    for (int i = 0; i < total_elements; i++) sol.data[0][i] = i;
    shuffle(sol.data[0], total_elements, rng);

    int idx = 0;
    for (int r = 0; r < dim1; r++) {
        // First (total_elements % dim1) rows get one extra element.
        int count = total_elements / dim1;
        if (r < total_elements % dim1) count++;
        sol.dim2_sizes[r] = count;
        if (r > 0) {
            // Row 0 already holds its own slice at offset 0; only copy for r > 0.
            for (int c = 0; c < count; c++)
                sol.data[r][c] = sol.data[0][idx + c];
        }
        idx += count;
    }

    sol.penalty = 0.0f;
}
|
||||
|
||||
// Linear scan for the best solution index, run by a single thread
// (population sizes are small enough that parallel reduction isn't worth it).
template<typename Sol>
__global__ void find_best_kernel(const Sol* pop, int pop_size,
                                 ObjConfig oc, int* best_idx) {
    if (threadIdx.x != 0 || blockIdx.x != 0) return;
    int best = 0;
    for (int i = 1; i < pop_size; i++)
        if (is_better(pop[i], pop[best], oc)) best = i;
    *best_idx = best;
}
|
||||
|
||||
// ============================================================
|
||||
// Host 端 RAII 类(模板化)
|
||||
// ============================================================
|
||||
|
||||
// Host-side RAII owner of the device population and its RNG states.
// v2.0 block-level architecture: the RNG array holds pop_size * block_size
// states (one independent RNG per thread within each block).
// Move-only: device pointers must have a single owner.
template<typename Sol>
class Population {
public:
    Sol* d_solutions = nullptr;        // device array of pop_size solutions
    curandState* d_rng_states = nullptr;  // device RNG states, size = pop_size * block_size
    int size = 0;                      // population size
    int rng_count = 0;                 // total number of RNG states

    Population() = default;

    // Allocate device storage. block_size = threads per block in the
    // block-level architecture; RNG array size = pop_size * block_size.
    void allocate(int pop_size, int block_size = 128) {
        size = pop_size;
        rng_count = pop_size * block_size;
        CUDA_CHECK(cudaMalloc(&d_solutions, sizeof(Sol) * size));
        CUDA_CHECK(cudaMalloc(&d_rng_states, sizeof(curandState) * rng_count));
    }

    // Seed all rng_count curand states. block_size here is only the launch
    // configuration of the seeding kernel, independent of allocate()'s.
    void init_rng(unsigned seed, int block_size = 256) {
        int grid = calc_grid_size(rng_count, block_size);
        init_curand_kernel<<<grid, block_size>>>(d_rng_states, seed, rng_count);
        CUDA_CHECK_LAST();
    }

    // Launch the initialization kernel matching the problem's row mode /
    // encoding. One thread per solution (init runs once; no need for more).
    void init_population(const ProblemConfig& cfg, int block_size = 256) {
        int grid = calc_grid_size(size, block_size);

        if (cfg.row_mode == RowMode::Partition) {
            init_partition_kernel<<<grid, block_size>>>(
                d_solutions, size, cfg.dim1, cfg.total_elements, d_rng_states);
        } else if (cfg.encoding == EncodingType::Permutation && cfg.perm_repeat_count > 1) {
            // Multiset permutation (e.g. JSP): dim2_default = num_values * repeat.
            int num_values = cfg.dim2_default / cfg.perm_repeat_count;
            init_multiset_perm_kernel<<<grid, block_size>>>(
                d_solutions, size, cfg.dim1, num_values, cfg.perm_repeat_count, d_rng_states);
        } else {
            switch (cfg.encoding) {
                case EncodingType::Permutation:
                    init_permutation_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default, d_rng_states);
                    break;
                case EncodingType::Binary:
                    init_binary_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default, d_rng_states);
                    break;
                case EncodingType::Integer:
                    init_integer_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default,
                        cfg.value_lower_bound, cfg.value_upper_bound,
                        d_rng_states);
                    break;
            }
        }
        CUDA_CHECK_LAST();
    }

    // Synchronously copy one solution from device to host.
    Sol download_solution(int idx) const {
        Sol h_sol;
        CUDA_CHECK(cudaMemcpy(&h_sol, d_solutions + idx, sizeof(Sol), cudaMemcpyDeviceToHost));
        return h_sol;
    }

    // No CUDA_CHECK in the destructor: freeing must not throw/abort during unwind.
    ~Population() {
        if (d_solutions) cudaFree(d_solutions);
        if (d_rng_states) cudaFree(d_rng_states);
    }

    Population(const Population&) = delete;
    Population& operator=(const Population&) = delete;
    // Move transfers ownership and nulls out the source.
    Population(Population&& o) noexcept
        : d_solutions(o.d_solutions), d_rng_states(o.d_rng_states),
          size(o.size), rng_count(o.rng_count) {
        o.d_solutions = nullptr; o.d_rng_states = nullptr;
        o.size = 0; o.rng_count = 0;
    }
};
|
||||
125
python/cugenopt/include/core/relation_matrix.cuh
Normal file
125
python/cugenopt/include/core/relation_matrix.cuh
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
/**
|
||||
* relation_matrix.cuh - G/O 关系矩阵管理
|
||||
*
|
||||
* G[i][j]: 分组倾向(元素 i 和 j 应在同一行的倾向,对称)
|
||||
* O[i][j]: 排序倾向(元素 i 应排在 j 前面的倾向,不对称)
|
||||
*
|
||||
* 更新来源:历史最优解统计
|
||||
* 每当 host 端获取到当前 best 解,扫描所有元素对关系:
|
||||
* - 同行 → G[i][j] 增强
|
||||
* - i 在 j 前 → O[i][j] 增强
|
||||
* 使用 EMA 衰减:M[i][j] = α * M[i][j] + (1-α) * signal
|
||||
*
|
||||
* 生命周期:
|
||||
* 1. relation_matrix_create(N) — 分配 host/device 内存,初始化为 0
|
||||
* 2. relation_matrix_update(rm, sol, dim1) — 从一个解更新 G/O(host 端)
|
||||
* 3. relation_matrix_upload(rm) — 上传 h_G/h_O 到 d_G/d_O
|
||||
* 4. relation_matrix_destroy(rm) — 释放内存
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include <cstring>
|
||||
|
||||
// ============================================================
|
||||
// 创建 / 销毁
|
||||
// ============================================================
|
||||
|
||||
// Allocate host + device storage for the G/O relation matrices (N x N each,
// row-major) and zero them.
//
// Fix: the original allocated with `new float[N * N]`, multiplying in `int`,
// which overflows for N > 46340 even though the byte count was already computed
// in size_t. Compute one size_t element count and derive everything from it.
//
// decay: EMA coefficient alpha used by relation_matrix_update (default 0.95).
// Pair with relation_matrix_destroy() to release.
inline RelationMatrix relation_matrix_create(int N, float decay = 0.95f) {
    RelationMatrix rm;
    rm.N = N;
    rm.decay = decay;
    rm.update_count = 0;

    size_t count = (size_t)N * N;           // element count, overflow-safe
    size_t bytes = count * sizeof(float);

    rm.h_G = new float[count];
    rm.h_O = new float[count];
    memset(rm.h_G, 0, bytes);
    memset(rm.h_O, 0, bytes);

    CUDA_CHECK(cudaMalloc(&rm.d_G, bytes));
    CUDA_CHECK(cudaMalloc(&rm.d_O, bytes));
    CUDA_CHECK(cudaMemset(rm.d_G, 0, bytes));
    CUDA_CHECK(cudaMemset(rm.d_O, 0, bytes));

    return rm;
}
|
||||
|
||||
// Release everything relation_matrix_create allocated and reset the handle so
// a stale RelationMatrix cannot be used accidentally.
inline void relation_matrix_destroy(RelationMatrix& rm) {
    CUDA_CHECK(cudaFree(rm.d_G));
    CUDA_CHECK(cudaFree(rm.d_O));
    delete[] rm.h_G;
    delete[] rm.h_O;
    rm.d_G = rm.d_O = nullptr;
    rm.h_G = rm.h_O = nullptr;
    rm.N = 0;
}
|
||||
|
||||
// ============================================================
|
||||
// 从一个解更新 G/O(host 端)
|
||||
// ============================================================
|
||||
// sol: 当前最优解(已下载到 host)
|
||||
// dim1: 实际使用的行数
|
||||
//
|
||||
// 逻辑:
|
||||
// 对 sol 中每对元素 (val_a, val_b):
|
||||
// 如果在同一行 → G[val_a][val_b] 增强
|
||||
// 如果 val_a 在 val_b 前面 → O[val_a][val_b] 增强
|
||||
//
|
||||
// 注意:元素值 val 必须在 [0, N) 范围内才有意义
|
||||
// 对于 partition 编码(VRP),元素值就是客户编号
|
||||
// 对于单行排列(TSP),元素值就是城市编号
|
||||
|
||||
// Update G/O from one solution (host side).
// sol:  current best solution, already downloaded to host
// dim1: number of rows actually in use
//
// For every element pair (val_a, val_b) within the same row of sol:
//   same row           -> reinforce G[val_a][val_b] (and symmetrically G[val_b][val_a])
//   val_a before val_b -> reinforce O[val_a][val_b] (directional, not symmetric)
// EMA update: M = alpha * M, then reinforced entries get += (1 - alpha) * signal.
//
// Element values must lie in [0, N) to be counted; out-of-range values are
// skipped (for partition encoding the values are customer ids, for a
// single-row permutation they are city ids).
template<typename Sol>
void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
    int N = rm.N;
    float alpha = rm.decay;
    float signal_strength = 1.0f;

    // Decay all existing entries first (the EMA "forget" step).
    for (int i = 0; i < N * N; i++) {
        rm.h_G[i] *= alpha;
        rm.h_O[i] *= alpha;
    }

    // Scan element-pair relations in the solution.
    for (int r = 0; r < dim1; r++) {
        int sz = sol.dim2_sizes[r];
        for (int c1 = 0; c1 < sz; c1++) {
            int val_a = sol.data[r][c1];
            if (val_a < 0 || val_a >= N) continue;

            for (int c2 = c1 + 1; c2 < sz; c2++) {
                int val_b = sol.data[r][c2];
                if (val_b < 0 || val_b >= N) continue;

                // Same row -> reinforce G (symmetric).
                rm.h_G[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
                rm.h_G[val_b * N + val_a] += (1.0f - alpha) * signal_strength;

                // val_a precedes val_b -> reinforce O[val_a][val_b].
                rm.h_O[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
            }
        }
    }

    // Clamp to [0, 1] (reinforcement can briefly push entries above 1).
    for (int i = 0; i < N * N; i++) {
        if (rm.h_G[i] > 1.0f) rm.h_G[i] = 1.0f;
        if (rm.h_O[i] > 1.0f) rm.h_O[i] = 1.0f;
    }

    rm.update_count++;
}
|
||||
|
||||
// ============================================================
|
||||
// 上传到 GPU
|
||||
// ============================================================
|
||||
|
||||
// Push the host-side G/O matrices to their device mirrors (synchronous copies).
inline void relation_matrix_upload(const RelationMatrix& rm) {
    const size_t matrix_bytes = sizeof(float) * (size_t)rm.N * rm.N;
    CUDA_CHECK(cudaMemcpy(rm.d_G, rm.h_G, matrix_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(rm.d_O, rm.h_O, matrix_bytes, cudaMemcpyHostToDevice));
}
|
||||
1530
python/cugenopt/include/core/solver.cuh
Normal file
1530
python/cugenopt/include/core/solver.cuh
Normal file
File diff suppressed because it is too large
Load diff
721
python/cugenopt/include/core/types.cuh
Normal file
721
python/cugenopt/include/core/types.cuh
Normal file
|
|
@ -0,0 +1,721 @@
|
|||
/**
|
||||
* types.cuh - 核心类型定义
|
||||
*
|
||||
* 包含:编码类型、Solution 模板、ProblemConfig/SolverConfig、
|
||||
* SeqRegistry(AOS 序列级权重)、KStepConfig(多步执行)、
|
||||
* RelationMatrix(G/O 关系矩阵)、ProblemBase(CRTP 基类)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// ============================================================
|
||||
// 编译时常量
|
||||
// ============================================================
|
||||
constexpr int MAX_OBJ = 4;   // at most 4 objectives (16 bytes; not worth templating)
constexpr int MAX_SEQ = 32;  // max sequences (~16 built-in + <=8 custom operators, with headroom)
constexpr int MAX_K = 3;     // max multi-step depth (K = 1, 2, 3)
// AOS weight bounds (after normalization)
constexpr float AOS_WEIGHT_FLOOR = 0.05f;  // minimum weight (guarantees exploration)
constexpr float AOS_WEIGHT_CAP = 0.35f;    // maximum weight (prevents winner-take-all)
|
||||
|
||||
// ============================================================
|
||||
// 枚举类型
|
||||
// ============================================================
|
||||
|
||||
// Solution encoding schemes.
enum class EncodingType {
    Permutation,  // permutation: elements are unique
    Binary,       // 0-1: flip is the primary operator
    Integer       // bounded integers
};

// How rows of a solution are organized.
enum class RowMode {
    Single,     // dim1=1, one row (TSP/QAP/Knapsack and most problems)
    Fixed,      // dim1>1, equal immutable row lengths (JSP-Int/Schedule; SPLIT/MERGE forbidden)
    Partition   // dim1>1, elements partitioned across rows, variable lengths (CVRP/VRPTW)
};

// Optimization direction per objective.
enum class ObjDir {
    Minimize,
    Maximize
};

// Multi-objective comparison mode.
enum class CompareMode {
    Weighted,       // weighted sum: sum(weight[i] * obj[i]), smaller is better
    Lexicographic   // compare objective by objective in priority order
};

// Island-model migration strategies.
enum class MigrateStrategy {
    Ring,   // ring: each island's best -> neighbor's worst (slow spread, high diversity)
    TopN,   // global top-N round-robin distribution (fast spread, strong convergence)
    Hybrid  // both: top-N replaces the worst + ring replaces the second worst
};
|
||||
|
||||
|
||||
// ============================================================
|
||||
// SeqID — 统一的 OperationSequence 编号
|
||||
// ============================================================
|
||||
// 每个 SeqID 对应一种具体的搜索操作(原子或多步)
|
||||
// AOS 权重跟踪粒度 = SeqID(每个序列独立权重)
|
||||
//
|
||||
// 命名规则:SEQ_{编码}_{操作名}
|
||||
// 跨编码共享的行级操作统一编号
|
||||
|
||||
// SeqID — unified OperationSequence numbering.
// Each SeqID names one concrete search operation (atomic or multi-step);
// AOS weight tracking granularity = SeqID (one independent weight each).
// Naming: SEQ_{encoding}_{operation}; row-level operations shared across
// encodings use one common numbering.
namespace seq {

// --- Permutation, intra-row (element level) ---
constexpr int SEQ_PERM_SWAP = 0;      // swap two positions
constexpr int SEQ_PERM_REVERSE = 1;   // 2-opt (reverse an interval)
constexpr int SEQ_PERM_INSERT = 2;    // insert (move to a new position)
constexpr int SEQ_PERM_3OPT = 3;      // 3-opt (cut 3 edges, reconnect)

// --- Permutation, intra-row (segment level) ---
constexpr int SEQ_PERM_OR_OPT = 4;    // or-opt (move k consecutive elements)

// --- Permutation, intra-row (composite) ---
constexpr int SEQ_PERM_DOUBLE_SWAP = 30;  // two consecutive swaps (same row)
constexpr int SEQ_PERM_TRIPLE_SWAP = 31;  // three consecutive swaps (same row)

// --- Permutation, cross-row (element level) ---
constexpr int SEQ_PERM_CROSS_RELOCATE = 5;  // move one element to another row
constexpr int SEQ_PERM_CROSS_SWAP = 6;      // swap one element between rows

// --- Permutation, cross-row (segment level) ---
constexpr int SEQ_PERM_SEG_RELOCATE = 7;    // move a segment to another row
constexpr int SEQ_PERM_SEG_SWAP = 8;        // swap segments between rows (2-opt*)
constexpr int SEQ_PERM_CROSS_EXCHANGE = 9;  // exchange segments (order-preserving)

// --- Binary, intra-row (element level) ---
constexpr int SEQ_BIN_FLIP = 0;   // flip one bit
constexpr int SEQ_BIN_SWAP = 1;   // swap two bits

// --- Binary, intra-row (segment level) ---
constexpr int SEQ_BIN_SEG_FLIP = 2;  // flip k consecutive bits
constexpr int SEQ_BIN_K_FLIP = 3;    // flip k random bits at once

// --- Binary, cross-row ---
constexpr int SEQ_BIN_CROSS_SWAP = 4;      // swap one bit between two rows
constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5;  // swap one segment between two rows

// --- Shared: row level (encoding-agnostic) ---
constexpr int SEQ_ROW_SWAP = 10;     // swap two rows
constexpr int SEQ_ROW_REVERSE = 11;  // reverse the row order
constexpr int SEQ_ROW_SPLIT = 12;    // split one row into two
constexpr int SEQ_ROW_MERGE = 13;    // merge two rows

// --- Special ---
constexpr int SEQ_PERTURBATION = 14;  // perturbation (multi-step, irreversible)

// --- Integer, intra-row (element level) ---
constexpr int SEQ_INT_RANDOM_RESET = 0;  // reset one random position to a value in [lb, ub]
constexpr int SEQ_INT_DELTA = 1;         // one random position +/- k (clamped to [lb, ub])
constexpr int SEQ_INT_SWAP = 2;          // swap the values at two positions

// --- Integer, intra-row (segment level) ---
constexpr int SEQ_INT_SEG_RESET = 3;  // reset k consecutive positions
constexpr int SEQ_INT_K_DELTA = 4;    // k random positions each +/- 1

// --- Integer, cross-row ---
constexpr int SEQ_INT_CROSS_SWAP = 5;  // swap one position between two rows

// --- LNS (large neighborhood search) ---
constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20;  // shuffle a consecutive segment
constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21;  // shuffle random scattered positions
constexpr int SEQ_LNS_GUIDED_REBUILD = 22;   // rebuild guided by the relation matrix

} // namespace seq
|
||||
|
||||
// ============================================================
|
||||
// RelationMatrix — G/O 关系矩阵(GPU global memory)
|
||||
// ============================================================
|
||||
// G[i][j]: 元素 i 和 j 的分组倾向(对称,越大越倾向同组)
|
||||
// O[i][j]: 元素 i 排在 j 前面的倾向(不对称)
|
||||
// 存储为一维数组 [N * N],行优先
|
||||
// 小规模 N<200 直接 Dense,P2 再做稀疏化
|
||||
//
|
||||
// 更新时机:host 端,每个 batch 间隙
|
||||
// 使用时机:kernel 中 SEQ_LNS_GUIDED_REBUILD 读取
|
||||
|
||||
// G/O relation matrices (dense, row-major, N x N each).
// G[i][j]: grouping affinity of elements i and j (symmetric; larger = more
//          likely same group). O[i][j]: tendency of i to precede j (asymmetric).
// Updated on the host between batches; read on the GPU by SEQ_LNS_GUIDED_REBUILD.
struct RelationMatrix {
    float* d_G;        // G matrix on the GPU [N * N]
    float* d_O;        // O matrix on the GPU [N * N]
    float* h_G;        // G matrix on the host [N * N] (updated, then uploaded)
    float* h_O;        // O matrix on the host [N * N]
    int N;             // total number of elements
    float decay;       // EMA decay coefficient alpha (default 0.95)
    int update_count;  // number of updates so far (cold-start detection)
};
|
||||
|
||||
// ============================================================
|
||||
// SeqRegistry — 运行时可用序列注册表
|
||||
// ============================================================
|
||||
// 根据 EncodingType 和 dim1 自动确定哪些序列可用
|
||||
// 传到 GPU 供 sample_sequence() 使用
|
||||
|
||||
// Operator category, used by constraint-directed weight adjustment.
enum class SeqCategory : int {
    InRow = 0,     // intra-row operators (swap, reverse, insert, ...)
    CrossRow = 1,  // cross-row operators (cross_relocate, cross_swap, seg_relocate, ...)
    RowLevel = 2,  // row-level operators (row_swap, row_reverse, split, merge)
    LNS = 3,       // large neighborhood search
};

// Runtime registry of available sequences, derived from EncodingType and dim1,
// and uploaded to the GPU for sample_sequence().
struct SeqRegistry {
    int ids[MAX_SEQ];                 // SeqIDs of the available sequences
    int count;                        // number of available sequences
    float weights[MAX_SEQ];           // current weight per sequence (normalized for sampling)
    float max_w[MAX_SEQ];             // per-sequence weight cap (0 = use the global cap)
    SeqCategory categories[MAX_SEQ];  // category per sequence (for constraint-directed logic)
};
|
||||
|
||||
// ============================================================
|
||||
// KStepConfig — 多步执行的步数选择配置
|
||||
// ============================================================
|
||||
// K=1: 单步(当前行为),K=2/3: 连续执行多个序列后再评估
|
||||
// 两层权重体系的第一层
|
||||
//
|
||||
// 自适应策略:
|
||||
// - 初始 K=1 权重很大(保守),K>1 权重小
|
||||
// - K>1 带来改进 → 增大该 K 的权重
|
||||
// - 长时间无改进 → 重置/增大 K>1 权重(跳出局部最优)
|
||||
|
||||
// Multi-step execution configuration (first layer of the two-layer weights).
// K=1: single step; K=2/3: run several sequences before evaluating.
// Adaptive policy: start with K=1 dominant; grow K>1 weights when they improve;
// reset/boost K>1 weights after prolonged stagnation to escape local optima.
struct KStepConfig {
    float weights[MAX_K];   // sampling weights for K = 1, 2, 3 (normalized)
    int stagnation_count;   // consecutive batches without improvement
    int stagnation_limit;   // threshold that triggers a reset (default 5 batches)
};
|
||||
|
||||
// 构建默认 K 步配置
|
||||
// Build the default K-step configuration: heavily favor single-step moves at
// first, with a small budget for 2- and 3-step exploration.
// Fix: dropped the stray ';' that followed the function body (a vacuous empty
// declaration that trips -Wextra-semi / -pedantic).
inline KStepConfig build_kstep_config() {
    KStepConfig kc;
    kc.weights[0] = 0.80f;  // K=1: dominant initially (conservative)
    kc.weights[1] = 0.15f;  // K=2: light exploration
    kc.weights[2] = 0.05f;  // K=3: minimal exploration
    kc.stagnation_count = 0;
    kc.stagnation_limit = 5;
    return kc;
}
|
||||
|
||||
// ============================================================
|
||||
// ProblemProfile — 基于结构特征推断的问题画像
|
||||
// ============================================================
|
||||
// 第一层:纯结构推断(不感知语义),用于驱动算子注册和初始权重
|
||||
// 未来第二层:可扩展更细粒度的画像(如多属性、高约束等)
|
||||
|
||||
// Problem profile inferred from pure structure (no semantics): first layer of
// classification, drives operator registration and initial weights.
enum class ScaleClass { Small, Medium, Large };
enum class StructClass { SingleSeq, MultiFixed, MultiPartition };

struct ProblemProfile {
    EncodingType encoding;  // solution encoding
    ScaleClass scale;       // size class (from row length, see classify_problem)
    StructClass structure;  // row structure class
    float cross_row_prob;   // probability of cross-row moves (copied from config)
};
|
||||
|
||||
// classify_problem() 定义在 ProblemConfig 之后
|
||||
|
||||
// ============================================================
|
||||
// 权重预设 — 由 ScaleClass 驱动
|
||||
// ============================================================
|
||||
|
||||
// Initial operator-weight preset, chosen by ScaleClass (see get_weight_preset).
struct WeightPreset {
    float w_cubic;      // weight for cubic-cost operators
    float w_quadratic;  // weight for quadratic-cost operators
    float w_lns;        // weight for LNS operators
    float lns_cap;      // weight cap for LNS operators
};
|
||||
|
||||
// Map a problem scale class to its operator-weight preset
// {w_cubic, w_quadratic, w_lns, lns_cap}. Larger problems shift weight away
// from expensive operators. Any unrecognized value falls back to Small.
inline WeightPreset get_weight_preset(ScaleClass scale) {
    if (scale == ScaleClass::Medium) return { 0.30f, 0.70f, 0.004f, 0.01f };
    if (scale == ScaleClass::Large)  return { 0.05f, 0.30f, 0.001f, 0.01f };
    return { 0.50f, 0.80f, 0.006f, 0.01f };  // Small, and the fallback
}
|
||||
|
||||
// classify_problem() 和 build_seq_registry() 定义在 ProblemConfig 之后
|
||||
|
||||
// ============================================================
|
||||
// Solution<D1, D2> — 解的模板化表示
|
||||
// ============================================================
|
||||
// D1: 行数上限 (TSP=1, VRP≤16, Schedule≤8)
|
||||
// D2: 每行列数上限 (TSP≤64, 背包≤32)
|
||||
// 每个 Problem 选择最小够用的 D1/D2,编译器生成紧凑的结构
|
||||
|
||||
// Solution<D1, D2> — templated solution layout.
// D1: max rows (TSP=1, VRP<=16, Schedule<=8); D2: max columns per row
// (TSP<=64, knapsack<=32). Each problem picks the smallest sufficient D1/D2 so
// the compiler emits a compact struct.
template<int D1, int D2>
struct Solution {
    static constexpr int DIM1 = D1;  // compile-time row cap
    static constexpr int DIM2 = D2;  // compile-time column cap
    int data[D1][D2];                // D1*D2*4 bytes of payload
    int dim2_sizes[D1];              // actual length of each row (D1*4 bytes)
    float objectives[MAX_OBJ];       // 16 bytes (fixed)
    float penalty;                   // constraint violation (4 bytes)
};
|
||||
|
||||
// ============================================================
|
||||
// ProblemConfig — 问题的运行时元信息
|
||||
// ============================================================
|
||||
|
||||
// Runtime metadata describing the problem instance.
struct ProblemConfig {
    EncodingType encoding;
    int dim1;                 // rows actually used (<= D1)
    int dim2_default;         // columns actually used (<= D2)
    int num_objectives;
    ObjDir obj_dirs[MAX_OBJ];
    float obj_weights[MAX_OBJ];  // weights in Weighted mode
    // Multi-objective comparison
    CompareMode compare_mode = CompareMode::Weighted;
    int obj_priority[MAX_OBJ] = {0, 1, 2, 3};  // comparison order (indices) in Lexicographic mode
    float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f};  // lexicographic tolerance: diff <= tol counts as equal
    int value_lower_bound;    // Integer encoding: inclusive lower bound
    int value_upper_bound;    // Integer encoding: inclusive upper bound
    // v3.4: unified row modes
    RowMode row_mode = RowMode::Single;  // row mode (Single/Fixed/Partition)
    float cross_row_prob = 0.0f;         // probability of cross-row moves (0 = intra-row only)
    int total_elements = 0;              // total element count in Partition mode
    int perm_repeat_count = 1;           // repeats per value in a permutation (1 = plain, >1 = multiset)
};
|
||||
|
||||
// ============================================================
|
||||
// SolverConfig — 求解器参数
|
||||
// ============================================================
|
||||
|
||||
// Solver parameters.
struct SolverConfig {
    int pop_size = 0;            // population size (0 = auto-fit to GPU parallelism)
    int max_gen = 1000;
    float mutation_rate = 0.1f;
    unsigned seed = 42;
    bool verbose = true;
    int print_every = 100;
    // Island-model parameters
    int num_islands = 1;         // 0 = adaptive, 1 = pure hill-climbing (no islands), >1 = island model
    int migrate_interval = 100;  // generations between migrations
    MigrateStrategy migrate_strategy = MigrateStrategy::Hybrid;
    // Simulated-annealing parameters
    float sa_temp_init = 0.0f;   // initial temperature (0 = SA disabled, pure hill-climbing)
    float sa_alpha = 0.998f;     // cooling rate (multiplied in each generation)
    // v1.0: crossover
    float crossover_rate = 0.1f; // per-generation probability of crossover (vs mutation)
    // v2.0: adaptive operator selection
    bool use_aos = false;        // enable AOS (update operator weights between batches)
    float aos_weight_floor = AOS_WEIGHT_FLOOR;  // runtime-overridable floor
    float aos_weight_cap = AOS_WEIGHT_CAP;      // runtime-overridable cap
    // v2.1: initial-solution strategy
    int init_oversample = 4;         // oversampling factor (1 = no sampling, pure random)
    float init_random_ratio = 0.3f;  // share of pure-random solutions (diversity floor)
    // v3.0: usability
    float time_limit_sec = 0.0f;  // wall-clock limit in seconds (0 = run to max_gen)
    int stagnation_limit = 0;     // batches without improvement before reheat (0 = disabled)
    float reheat_ratio = 0.5f;    // fraction of initial temperature restored on reheat
    // v3.5: CUDA Graph
    bool use_cuda_graph = false;  // enable CUDA Graph (cuts kernel-launch overhead)
    // v3.6: AOS update frequency
    int aos_update_interval = 10; // batches between AOS weight updates (fewer cudaMemcpy syncs)
    // v4.0: constraint-directed + phased search
    bool use_constraint_directed = false;  // adjust cross-row weights from the penalty ratio
    bool use_phased_search = false;        // adjust global floor/cap with progress
    // Phased-search thresholds (three phases)
    float phase_explore_end = 0.30f;   // end of the exploration phase (progress fraction)
    float phase_refine_start = 0.70f;  // start of the refinement phase (progress fraction)
    // Constraint-directed parameter
    float constraint_boost_max = 2.5f; // max cap-boost factor for cross-row ops under high constraint pressure
};
|
||||
|
||||
// ============================================================
|
||||
// classify_problem — 从 ProblemConfig 推断问题画像
|
||||
// ============================================================
|
||||
|
||||
// Derive a ProblemProfile (encoding / scale / structure / cross-row prob)
// from a ProblemConfig.
inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
    ProblemProfile prof;
    prof.encoding = pcfg.encoding;
    prof.cross_row_prob = pcfg.cross_row_prob;

    // Scale is bucketed by the default row length.
    const int len = pcfg.dim2_default;
    prof.scale = (len <= 100) ? ScaleClass::Small
               : (len <= 250) ? ScaleClass::Medium
                              : ScaleClass::Large;

    // Structure: a single row is a plain sequence; multiple rows are either
    // a partition (variable row lengths) or fixed-length rows.
    if (pcfg.dim1 > 1) {
        prof.structure = (pcfg.row_mode == RowMode::Partition)
                             ? StructClass::MultiPartition
                             : StructClass::MultiFixed;
    } else {
        prof.structure = StructClass::SingleSeq;
    }
    return prof;
}
|
||||
|
||||
// ============================================================
|
||||
// build_seq_registry — 由 ProblemProfile 驱动的算子注册
|
||||
// ============================================================
|
||||
|
||||
// Build the sequence-operator registry from a ProblemProfile.
// Registers encoding-specific in-row operators, optional cross-row operators
// (scaled by cross_row_prob), shared row-level operators, and (for permutations)
// LNS operators. Initial weights are normalized to a probability distribution.
inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
    SeqRegistry reg;
    reg.count = 0;
    for (int i = 0; i < MAX_SEQ; i++) {
        reg.ids[i] = -1; reg.weights[i] = 0.0f;
        reg.max_w[i] = 0.0f; reg.categories[i] = SeqCategory::InRow;
    }

    // Append an operator entry: id, initial weight, category, optional weight cap.
    auto add = [&](int id, float w, SeqCategory cat, float cap = 0.0f) {
        if (reg.count >= MAX_SEQ) return;
        reg.ids[reg.count] = id;
        reg.weights[reg.count] = w;
        reg.max_w[reg.count] = cap;
        reg.categories[reg.count] = cat;
        reg.count++;
    };

    WeightPreset wp = get_weight_preset(prof.scale);
    bool multi_row = (prof.structure != StructClass::SingleSeq);
    float cr = prof.cross_row_prob;

    // Row-level operators are identical for every encoding; previously this
    // registration block was copy-pasted into all three encoding branches.
    auto add_row_level = [&]() {
        if (!multi_row) return;
        add(seq::SEQ_ROW_SWAP, 0.3f, SeqCategory::RowLevel);
        add(seq::SEQ_ROW_REVERSE, 0.2f, SeqCategory::RowLevel);
        if (prof.structure == StructClass::MultiPartition) {
            // Split/merge only apply when row lengths may change.
            add(seq::SEQ_ROW_SPLIT, 0.2f, SeqCategory::RowLevel);
            add(seq::SEQ_ROW_MERGE, 0.2f, SeqCategory::RowLevel);
        }
    };

    if (prof.encoding == EncodingType::Permutation) {
        add(seq::SEQ_PERM_SWAP, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_REVERSE, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_INSERT, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_DOUBLE_SWAP, 0.5f, SeqCategory::InRow);
        add(seq::SEQ_PERM_TRIPLE_SWAP, 0.3f, SeqCategory::InRow);

        // Heavier neighborhoods use scale-dependent preset weights.
        add(seq::SEQ_PERM_3OPT, wp.w_cubic, SeqCategory::InRow);
        add(seq::SEQ_PERM_OR_OPT, wp.w_quadratic, SeqCategory::InRow);

        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_PERM_CROSS_RELOCATE, 0.6f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_CROSS_SWAP, 0.6f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_SEG_RELOCATE, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_SEG_SWAP, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_CROSS_EXCHANGE, 0.4f * cr, SeqCategory::CrossRow);
        }
        add_row_level();
        // LNS operators are weight-capped so AOS cannot let them dominate.
        add(seq::SEQ_LNS_SEGMENT_SHUFFLE, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
        add(seq::SEQ_LNS_SCATTER_SHUFFLE, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
        add(seq::SEQ_LNS_GUIDED_REBUILD, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
    }
    else if (prof.encoding == EncodingType::Binary) {
        add(seq::SEQ_BIN_FLIP, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_BIN_SWAP, 0.8f, SeqCategory::InRow);
        add(seq::SEQ_BIN_SEG_FLIP, 0.6f, SeqCategory::InRow);
        add(seq::SEQ_BIN_K_FLIP, 0.6f, SeqCategory::InRow);
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_BIN_CROSS_SWAP, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_BIN_SEG_CROSS_SWAP, 0.4f * cr, SeqCategory::CrossRow);
        }
        add_row_level();
    }
    else if (prof.encoding == EncodingType::Integer) {
        add(seq::SEQ_INT_RANDOM_RESET, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_INT_DELTA, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_INT_SWAP, 0.8f, SeqCategory::InRow);
        add(seq::SEQ_INT_SEG_RESET, 0.6f, SeqCategory::InRow);
        add(seq::SEQ_INT_K_DELTA, 0.6f, SeqCategory::InRow);
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_INT_CROSS_SWAP, 0.5f * cr, SeqCategory::CrossRow);
        }
        add_row_level();
    }

    // Normalize initial weights so they sum to 1.
    float sum = 0.0f;
    for (int i = 0; i < reg.count; i++) sum += reg.weights[i];
    if (sum > 0.0f) {
        for (int i = 0; i < reg.count; i++) reg.weights[i] /= sum;
    }
    return reg;
}
|
||||
|
||||
// ============================================================
|
||||
// ObjConfig — 传到 GPU 的目标比较配置(紧凑结构)
|
||||
// ============================================================
|
||||
|
||||
struct ObjConfig {
    int num_obj;               // number of active objectives
    CompareMode mode;          // Weighted or Lexicographic
    ObjDir dirs[MAX_OBJ];      // optimization direction per objective
    float weights[MAX_OBJ];    // weights used in Weighted mode
    int priority[MAX_OBJ];     // comparison order used in Lexicographic mode
    float tolerance[MAX_OBJ];  // per-objective tolerance used in Lexicographic mode
};
|
||||
|
||||
// Build the compact GPU-side ObjConfig from a full ProblemConfig (host side).
// Copies every MAX_OBJ slot verbatim, whether or not it is active.
inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
    ObjConfig oc;
    oc.num_obj = pcfg.num_objectives;
    oc.mode = pcfg.compare_mode;
    for (int slot = 0; slot < MAX_OBJ; ++slot) {
        oc.dirs[slot]      = pcfg.obj_dirs[slot];
        oc.weights[slot]   = pcfg.obj_weights[slot];
        oc.priority[slot]  = pcfg.obj_priority[slot];
        oc.tolerance[slot] = pcfg.obj_tolerance[slot];
    }
    return oc;
}
|
||||
|
||||
// ============================================================
|
||||
// SolveResult — solve() 的返回值
|
||||
// ============================================================
|
||||
|
||||
// Why the solver stopped: generation budget, wall-clock limit, or stagnation.
enum class StopReason { MaxGen, TimeLimit, Stagnation };

// Return value of solve().
template<typename Sol>
struct SolveResult {
    Sol best_solution;                           // best solution found
    float elapsed_ms = 0.0f;                     // wall-clock time spent
    int generations = 0;                         // generations actually executed
    StopReason stop_reason = StopReason::MaxGen; // why the run ended
};
|
||||
|
||||
// ============================================================
|
||||
// 目标重要性映射 — 统一 Weighted / Lexicographic 的重要性度量
|
||||
// ============================================================
|
||||
// Map objectives to a normalized importance vector (sums to 1 when possible).
// Used by initial-population selection (NSGA-II weighted crowding distance +
// reserved slots for core objectives).
//   Weighted:      importance[i] ∝ weights[i]
//   Lexicographic: importance[i] ∝ 0.5^priority[i]
//                  → first priority ≈ 57%, second ≈ 29%, third ≈ 14%

inline void compute_importance(const ObjConfig& oc, float* importance) {
    const bool weighted = (oc.mode == CompareMode::Weighted);
    float total = 0.0f;
    for (int i = 0; i < oc.num_obj; i++) {
        float v;
        if (weighted) {
            v = oc.weights[i];
        } else {
            v = 1.0f;
            for (int r = oc.priority[i]; r > 0; r--) v *= 0.5f; // 0.5^rank
        }
        importance[i] = v;
        total += v;
    }
    if (total <= 0.0f) return;
    for (int i = 0; i < oc.num_obj; i++)
        importance[i] /= total;
}
|
||||
|
||||
// ============================================================
|
||||
// 比较工具 — 支持 Weighted / Lexicographic
|
||||
// ============================================================
|
||||
|
||||
// Canonicalize an objective value to "smaller is better":
// maximization objectives flip sign.
__device__ __host__ inline float normalize_obj(float val, ObjDir dir) {
    if (dir == ObjDir::Maximize) return -val;
    return val;
}
|
||||
|
||||
// Core comparison: is solution a strictly better than solution b?
template<typename Sol>
__device__ inline bool is_better(const Sol& a, const Sol& b,
                                 const ObjConfig& oc) {
    // Penalty dominates: a feasible solution always beats an infeasible one,
    // and among two infeasible solutions the smaller penalty wins.
    if (a.penalty <= 0.0f && b.penalty > 0.0f) return true;
    if (a.penalty > 0.0f && b.penalty <= 0.0f) return false;
    if (a.penalty > 0.0f && b.penalty > 0.0f) return a.penalty < b.penalty;

    if (oc.mode == CompareMode::Weighted) {
        // Weighted sum of direction-normalized objectives (normalize_obj flips
        // Maximize objectives so "smaller is better" holds throughout).
        float sum_a = 0.0f, sum_b = 0.0f;
        for (int i = 0; i < oc.num_obj; i++) {
            float na = normalize_obj(a.objectives[i], oc.dirs[i]);
            float nb = normalize_obj(b.objectives[i], oc.dirs[i]);
            sum_a += oc.weights[i] * na;
            sum_b += oc.weights[i] * nb;
        }
        return sum_a < sum_b;
    } else {
        // Lexicographic: compare objective by objective in priority order.
        for (int p = 0; p < oc.num_obj; p++) {
            int idx = oc.priority[p];
            float va = normalize_obj(a.objectives[idx], oc.dirs[idx]);
            float vb = normalize_obj(b.objectives[idx], oc.dirs[idx]);
            float diff = va - vb;
            if (diff < -oc.tolerance[idx]) return true;   // a clearly better
            if (diff > oc.tolerance[idx]) return false;   // b clearly better
            // Within tolerance: treated as equal → compare the next objective.
        }
        return false; // equal (within tolerance) on every objective
    }
}
|
||||
|
||||
// Scalarize a solution for the SA acceptance probability: smaller is better.
// Weighted mode sums the direction-normalized objectives by weight;
// lexicographic mode uses the top-priority objective only.
template<typename Sol>
__device__ __host__ inline float scalar_objective(const Sol& sol,
                                                  const ObjConfig& oc) {
    if (oc.mode != CompareMode::Weighted) {
        const int top = oc.priority[0];
        return normalize_obj(sol.objectives[top], oc.dirs[top]);
    }
    float acc = 0.0f;
    for (int i = 0; i < oc.num_obj; i++)
        acc += oc.weights[i] * normalize_obj(sol.objectives[i], oc.dirs[i]);
    return acc;
}
|
||||
|
||||
// Lightweight comparison over raw float[] objective arrays (avoids copying a
// whole Sol). Mirrors the feasible-solution path of is_better() — callers are
// expected to have handled penalties already.
__device__ inline bool obj_is_better(const float* new_objs, const float* old_objs,
                                     const ObjConfig& oc) {
    if (oc.mode == CompareMode::Weighted) {
        // Weighted sum of direction-normalized objectives; smaller wins.
        float sum_new = 0.0f, sum_old = 0.0f;
        for (int i = 0; i < oc.num_obj; i++) {
            sum_new += oc.weights[i] * normalize_obj(new_objs[i], oc.dirs[i]);
            sum_old += oc.weights[i] * normalize_obj(old_objs[i], oc.dirs[i]);
        }
        return sum_new < sum_old;
    } else {
        // Lexicographic with per-objective tolerance, in priority order.
        for (int p = 0; p < oc.num_obj; p++) {
            int idx = oc.priority[p];
            float va = normalize_obj(new_objs[idx], oc.dirs[idx]);
            float vb = normalize_obj(old_objs[idx], oc.dirs[idx]);
            float diff = va - vb;
            if (diff < -oc.tolerance[idx]) return true;
            if (diff > oc.tolerance[idx]) return false;
        }
        return false; // equal within tolerance on all objectives
    }
}
|
||||
|
||||
// Lightweight scalarization over a raw float[] objective array: smaller is
// better. Weighted mode returns the weighted normalized sum; lexicographic
// mode returns the top-priority objective.
__device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig& oc) {
    if (oc.mode != CompareMode::Weighted) {
        const int top = oc.priority[0];
        return normalize_obj(objs[top], oc.dirs[top]);
    }
    float acc = 0.0f;
    for (int i = 0; i < oc.num_obj; i++)
        acc += oc.weights[i] * normalize_obj(objs[i], oc.dirs[i]);
    return acc;
}
|
||||
|
||||
// ============================================================
|
||||
// AOSStats — 自适应算子选择统计(每个 block 一份)
|
||||
// ============================================================
|
||||
// v3.0: granularity went from 3 tiers to MAX_SEQ individual sequences.
// Records each sequence's usage and improvement counts; after each batch the
// host aggregates these and updates the SeqRegistry weights.

struct AOSStats {
    // Operator-tier statistics (second tier)
    int usage[MAX_SEQ];        // how many times each sequence was applied
    int improvement[MAX_SEQ];  // how many times each sequence improved (delta < 0 and accepted)
    // K-step-tier statistics (first tier)
    int k_usage[MAX_K];        // usage counts for K=1,2,3
    int k_improvement[MAX_K];  // improvement counts for K=1,2,3
};
|
||||
|
||||
// ============================================================
|
||||
// ObjDef — 单个目标的定义(编译期常量)
|
||||
// ============================================================
|
||||
|
||||
struct ObjDef {
    ObjDir dir;      // optimization direction
    float weight;    // weight used in Weighted mode
    float tolerance; // tolerance used in Lexicographic mode
};
|
||||
|
||||
// ============================================================
|
||||
// HeuristicMatrix — 启发式初始解构造用的数据矩阵描述
|
||||
// ============================================================
|
||||
|
||||
// Describes a host-side data matrix used for heuristic initial-solution construction.
struct HeuristicMatrix {
    const float* data; // host-side N*N matrix
    int N;             // dimension
};
|
||||
|
||||
// ============================================================
|
||||
// ProblemBase<Derived, D1, D2> — CRTP base class
//
// The user derives from this base and provides:
//   static constexpr ObjDef OBJ_DEFS[] = {...};         — objective metadata
//   __device__ float compute_obj(int idx, ...) const;   — objective dispatch
//   __device__ float compute_penalty(...) const;
//
// Convention: OBJ_DEFS and compute_obj are written side by side; case N maps
// to OBJ_DEFS[N]. NUM_OBJ is deduced from sizeof(OBJ_DEFS) — no manual upkeep.
//
// The base class automatically provides:
//   evaluate(sol)        — walks the objective list, calling compute_obj
//   fill_obj_config(cfg) — fills a ProblemConfig from OBJ_DEFS
//   obj_config()         — builds an ObjConfig directly
// ============================================================

template<typename Derived, int D1_, int D2_>
struct ProblemBase {
    static constexpr int D1 = D1_;
    static constexpr int D2 = D2_;
    using Sol = Solution<D1, D2>;

    // NUM_OBJ is deduced from the OBJ_DEFS array.
    static constexpr int NUM_OBJ = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);

    // Automatic evaluation: compute every objective, then the penalty.
    __device__ void evaluate(Sol& sol) const {
        const auto& self = static_cast<const Derived&>(*this);
        constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
        for (int i = 0; i < n; i++)
            sol.objectives[i] = self.compute_obj(i, sol);
        sol.penalty = self.compute_penalty(sol);
    }

    // Fill the objective section of a ProblemConfig from OBJ_DEFS.
    void fill_obj_config(ProblemConfig& cfg) const {
        constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
        cfg.num_objectives = n;
        for (int i = 0; i < n; i++) {
            cfg.obj_dirs[i] = Derived::OBJ_DEFS[i].dir;
            cfg.obj_weights[i] = Derived::OBJ_DEFS[i].weight;
            cfg.obj_tolerance[i] = Derived::OBJ_DEFS[i].tolerance;
            cfg.obj_priority[i] = i; // declaration order is the priority order
        }
    }

    // Build an ObjConfig directly (for the solver).
    ObjConfig obj_config() const {
        ProblemConfig pcfg;
        fill_obj_config(pcfg);
        return make_obj_config(pcfg);
    }

    // Per-block hot working-set size in global memory (bytes).
    // Used by auto pop_size to estimate L2 cache pressure.
    // Default = shared_mem_bytes() (when the data fits in smem the gmem
    // working set effectively does not matter).
    // Subclasses override this when shared_mem_bytes() returns 0 (data does
    // not fit in smem) to report the real data size, e.g. n*n*sizeof(float)
    // for a distance matrix.
    size_t working_set_bytes() const {
        return static_cast<const Derived&>(*this).shared_mem_bytes();
    }

    // Optional: initialize the G/O relation matrices (prior knowledge for GUIDED_REBUILD).
    // G[i*N+j]: grouping affinity of elements i and j (symmetric, in [0,1]; larger = same group)
    // O[i*N+j]: tendency of element i to precede j (asymmetric, in [0,1])
    // Default: no prior (all zeros); the search accumulates them via EMA from
    // historically good solutions.
    // Example override: spatially close elements get high G and O.
    void init_relation_matrix(float* h_G, float* h_O, int N) const {
        (void)h_G; (void)h_O; (void)N; // default: do nothing (keep all zeros)
    }

    // Optional: expose host-side data matrices for heuristic initial-solution
    // construction. Default returns 0 (none); subclasses override, fill `out`,
    // and return the actual count.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        (void)out; (void)max_count;
        return 0;
    }
};
|
||||
114
python/cugenopt/include/problems/assignment.cuh
Normal file
114
python/cugenopt/include/problems/assignment.cuh
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
/**
 * assignment.cuh - Assignment problem
 *
 * Inherits ProblemBase and uses the ObjDef objective-registration mechanism.
 */
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
    const float* d_cost;  // device cost matrix [n*n] (repointed into smem by load_shared)
    const float* h_cost;  // host-side cost matrix (used by init_relation_matrix)
    int n;                // problem size (n agents x n tasks)

    // ---- Objective computation ----
    // Total assignment cost: position i holds the task assigned to agent i.
    __device__ float calc_total_cost(const Sol& sol) const {
        float total = 0.0f;
        const int* assign = sol.data[0];
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            total += d_cost[i * n + assign[i]];
        return total;
    }

    // ---- Objective definitions (OBJ_DEFS and compute_obj must stay in sync) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_cost(sol); // OBJ_DEFS[0]
            default: return 0.0f;
        }
    }

    // Permutation encoding is always feasible — no penalty.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // ---- shared-memory interface ----
    static constexpr size_t SMEM_LIMIT = 48 * 1024;

    // Stage the cost matrix in smem only if it fits in the 48 KiB budget;
    // 0 signals "keep it in global memory".
    size_t shared_mem_bytes() const {
        size_t need = (size_t)n * n * sizeof(float);
        return need <= SMEM_LIMIT ? need : 0;
    }

    // Hot working set is the full cost matrix (overrides the base default,
    // which would report 0 when the matrix does not fit in smem).
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }

    // Cooperatively copy the cost matrix into shared memory and repoint d_cost.
    // NOTE(review): assumes the caller issues __syncthreads() before the staged
    // matrix is read — confirm at the call site.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sc = reinterpret_cast<float*>(smem);
        int total = n * n;
        for (int i = tid; i < total; i += bsz) sc[i] = d_cost[i];
        d_cost = sc;
    }

    // Cost prior: tasks j and k preferred by similar agents get a high G value.
    // O matrix: similarity also nudges ordering slightly (smaller factor).
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_cost || N != n) return;
        // For each task, build its cost column; cosine similarity between
        // task columns drives G. Simplification: column-vector correlation.
        float max_c = 0.0f;
        for (int i = 0; i < N * N; i++)
            if (h_cost[i] > max_c) max_c = h_cost[i];
        if (max_c <= 0.0f) return;

        for (int j = 0; j < N; j++)
            for (int k = 0; k < N; k++) {
                if (j == k) continue;
                // G: the more similar two tasks' cost vectors, the more likely
                // they are interchangeable.
                float dot = 0.0f, nj = 0.0f, nk = 0.0f;
                for (int i = 0; i < N; i++) {
                    float cj = h_cost[i * N + j] / max_c;
                    float ck = h_cost[i * N + k] / max_c;
                    dot += cj * ck;
                    nj += cj * cj;
                    nk += ck * ck;
                }
                float denom = sqrtf(nj) * sqrtf(nk);
                float sim = (denom > 1e-6f) ? dot / denom : 0.0f;
                G[j * N + k] = sim * 0.2f;
                O[j * N + k] = sim * 0.05f;
            }
    }

    // Allocate device memory and upload the cost matrix.
    static AssignmentProblem create(const float* hc, int n) {
        AssignmentProblem prob;
        prob.n = n;
        prob.h_cost = hc;
        float* dc;
        CUDA_CHECK(cudaMalloc(&dc, sizeof(float)*n*n));
        CUDA_CHECK(cudaMemcpy(dc, hc, sizeof(float)*n*n, cudaMemcpyHostToDevice));
        prob.d_cost = dc;
        return prob;
    }

    void destroy() {
        if (d_cost) { cudaFree(const_cast<float*>(d_cost)); d_cost = nullptr; }
        h_cost = nullptr;
    }
};
|
||||
97
python/cugenopt/include/problems/bin_packing.cuh
Normal file
97
python/cugenopt/include/problems/bin_packing.cuh
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
/**
 * bin_packing.cuh - One-dimensional bin packing (Integer encoding + constraint)
 *
 * N items with weights w[i] are packed into at most B bins of capacity C.
 * Decision variable: data[0][i] ∈ [0, B-1], the bin item i is placed in.
 * Objective: minimize the number of bins used.
 * Constraint: each bin's total weight must not exceed C; the excess becomes penalty.
 *
 * Validation instance: 8 items, weights=[7,5,3,4,6,2,8,1], C=10, optimum = 4 bins
 *   bin0={7,3}=10, bin1={5,4,1}=10, bin2={6,2}=8, bin3={8}=8
 */
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
struct BinPackingProblem : ProblemBase<BinPackingProblem, 1, 64> {
    // Capacity of the fixed-size per-thread bin-tracking arrays used below.
    // Previously the arrays were hard-coded to 32 elements while every loop
    // and guard was bounded by max_bins, so max_bins > 32 indexed past the
    // arrays (undefined behavior). Bin ids beyond the cap are now ignored.
    static constexpr int BIN_TRACK_CAP = 32;

    const float* d_weights; // item weights [n] (repointed into smem by load_shared)
    int n;                  // number of items
    int max_bins;           // maximum number of bins B
    float capacity;         // bin capacity C

    // Number of distinct bins actually used by the assignment.
    __device__ float calc_bins_used(const Sol& sol) const {
        const int nb = (max_bins < BIN_TRACK_CAP) ? max_bins : BIN_TRACK_CAP;
        bool used[BIN_TRACK_CAP] = {};
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int b = sol.data[0][i];
            if (b >= 0 && b < nb) used[b] = true;
        }
        int count = 0;
        for (int b = 0; b < nb; b++)
            if (used[b]) count++;
        return (float)count;
    }

    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_bins_used
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_bins_used(sol);
            default: return 0.0f;
        }
    }

    // Capacity constraint: any overload of a bin is penalized at 10x the excess.
    __device__ float compute_penalty(const Sol& sol) const {
        const int nb = (max_bins < BIN_TRACK_CAP) ? max_bins : BIN_TRACK_CAP;
        float penalty = 0.0f;
        float load[BIN_TRACK_CAP] = {};
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int b = sol.data[0][i];
            if (b >= 0 && b < nb)
                load[b] += d_weights[i];
        }
        for (int b = 0; b < nb; b++) {
            float over = load[b] - capacity;
            if (over > 0.0f) penalty += over * 10.0f;
        }
        return penalty;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = 1; cfg.dim2_default = n;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = max_bins - 1;
        fill_obj_config(cfg);
        return cfg;
    }

    size_t shared_mem_bytes() const {
        return (size_t)n * sizeof(float);
    }

    // Stage the weight vector into shared memory and repoint d_weights at it.
    // NOTE(review): assumes the caller issues __syncthreads() before the staged
    // weights are read — confirm at the call site.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sw = reinterpret_cast<float*>(smem);
        for (int i = tid; i < n; i += bsz) sw[i] = d_weights[i];
        d_weights = sw;
    }

    // Allocate device memory and upload the weight vector.
    static BinPackingProblem create(const float* h_weights, int n,
                                    int max_bins, float capacity) {
        BinPackingProblem prob;
        prob.n = n; prob.max_bins = max_bins; prob.capacity = capacity;
        float* dw;
        CUDA_CHECK(cudaMalloc(&dw, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(dw, h_weights, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_weights = dw;
        return prob;
    }

    void destroy() {
        if (d_weights) cudaFree(const_cast<float*>(d_weights));
        d_weights = nullptr;
    }
};
|
||||
79
python/cugenopt/include/problems/graph_color.cuh
Normal file
79
python/cugenopt/include/problems/graph_color.cuh
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
/**
 * graph_color.cuh - Graph coloring (Integer encoding)
 *
 * Color a graph of N nodes with k colors.
 * Decision variable: data[0][i] ∈ [0, k-1], the color of node i.
 * Objective: minimize the number of conflicting edges
 * (edges whose endpoints share a color).
 *
 * Validation instance: Petersen graph (10 nodes, 15 edges,
 * chromatic number 3, optimal conflicts = 0)
 */
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
struct GraphColorProblem : ProblemBase<GraphColorProblem, 1, 64> {
    const int* d_adj; // adjacency matrix [N*N] (1 = adjacent, 0 = not adjacent)
    int n;            // number of nodes
    int k;            // number of colors

    // Count conflicting edges: adjacent node pairs assigned the same color.
    __device__ float calc_conflicts(const Sol& sol) const {
        int conflicts = 0;
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            for (int j = i + 1; j < size; j++)
                if (d_adj[i * n + j] && sol.data[0][i] == sol.data[0][j])
                    conflicts++;
        return (float)conflicts;
    }

    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_conflicts
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_conflicts(sol);
            default: return 0.0f;
        }
    }

    // Conflicts are modeled in the objective, not as a penalty.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = 1; cfg.dim2_default = n;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = k - 1;
        fill_obj_config(cfg);
        return cfg;
    }

    size_t shared_mem_bytes() const {
        return (size_t)n * n * sizeof(int);
    }

    // Stage the adjacency matrix into shared memory and repoint d_adj at it.
    // NOTE(review): assumes the caller issues __syncthreads() before the staged
    // matrix is read — confirm at the call site.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int* sa = reinterpret_cast<int*>(smem);
        int total = n * n;
        for (int i = tid; i < total; i += bsz) sa[i] = d_adj[i];
        d_adj = sa;
    }

    // Allocate device memory and upload the adjacency matrix.
    static GraphColorProblem create(const int* h_adj, int n, int k) {
        GraphColorProblem prob;
        prob.n = n; prob.k = k;
        int* da;
        CUDA_CHECK(cudaMalloc(&da, sizeof(int) * n * n));
        CUDA_CHECK(cudaMemcpy(da, h_adj, sizeof(int) * n * n, cudaMemcpyHostToDevice));
        prob.d_adj = da;
        return prob;
    }

    void destroy() {
        if (d_adj) cudaFree(const_cast<int*>(d_adj));
        d_adj = nullptr;
    }
};
|
||||
271
python/cugenopt/include/problems/jsp.cuh
Normal file
271
python/cugenopt/include/problems/jsp.cuh
Normal file
|
|
@ -0,0 +1,271 @@
|
|||
/**
 * jsp.cuh - Job Shop Scheduling Problem
 *
 * J jobs, each with O operations; every operation specifies a machine and a duration.
 *
 * === Encoding A: Integer, multi-row (timetable encoding) ===
 * JSPProblem: data[j][i] = start time of job j's i-th operation
 *   dim1 = num_jobs, dim2_default = num_ops
 *   row_mode = Fixed (ROW_SPLIT / ROW_MERGE disallowed)
 *   Each row is one job's fixed operation sequence; row length is immutable.
 *
 * === Encoding B: Permutation multiset (operation-sequence encoding) ===
 * JSPPermProblem: data[0][k] = job id (0..J-1), length J*O
 *   Value j appears O times. Scanning left to right, the t-th occurrence of
 *   value j denotes job j's t-th operation.
 *   dim1 = 1, dim2_default = J*O, perm_repeat_count = O
 *   Standard permutation operators (swap/reverse/insert) naturally preserve
 *   the multiset structure.
 *
 * Objective: minimize makespan (maximum completion time over all jobs).
 * Constraints:
 *   (a) operation order: a job's operations must run in sequence
 *   (b) machine conflicts: a machine processes at most one operation at a time
 *
 * Validation instance: custom 3 jobs x 3 machines (3x3), optimal makespan = 12
 */
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
// ============================================================
// Encoding A: Integer, multi-row (timetable encoding)
// ============================================================
|
||||
|
||||
struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
    const int* d_machine;    // machine required by each operation [J*O]
    const float* d_duration; // operation durations [J*O]
    int num_jobs;            // number of jobs J
    int num_ops;             // operations per job O
    int num_machines;        // number of machines M
    int time_horizon;        // upper bound on start times

    // Makespan = max over jobs of (start of last operation + its duration).
    __device__ float calc_makespan(const Sol& sol) const {
        float makespan = 0.0f;
        for (int j = 0; j < num_jobs; j++) {
            int last = num_ops - 1;
            float end = (float)sol.data[j][last] + d_duration[j * num_ops + last];
            if (end > makespan) makespan = end;
        }
        return makespan;
    }

    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_makespan
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_makespan(sol);
            default: return 0.0f;
        }
    }

    // Penalize constraint violations at 10x the violation amount.
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;

        // (a) operation-order constraint: within a job, each operation must
        // start after the previous one has finished.
        for (int j = 0; j < num_jobs; j++) {
            for (int i = 1; i < num_ops; i++) {
                float prev_end = (float)sol.data[j][i-1] + d_duration[j * num_ops + (i-1)];
                float curr_start = (float)sol.data[j][i];
                if (curr_start < prev_end)
                    penalty += (prev_end - curr_start) * 10.0f;
            }
        }

        // (b) machine-conflict constraint: any temporal overlap of two
        // operations on the same machine is penalized. Pairwise O(total^2)
        // scan over all operations.
        int total = num_jobs * num_ops;
        for (int a = 0; a < total; a++) {
            int ja = a / num_ops, ia = a % num_ops;
            int m_a = d_machine[a];
            float s_a = (float)sol.data[ja][ia];
            float e_a = s_a + d_duration[a];
            for (int b = a + 1; b < total; b++) {
                if (d_machine[b] != m_a) continue;
                int jb = b / num_ops, ib = b % num_ops;
                float s_b = (float)sol.data[jb][ib];
                float e_b = s_b + d_duration[b];
                float overlap = fminf(e_a, e_b) - fmaxf(s_a, s_b);
                if (overlap > 0.0f)
                    penalty += overlap * 10.0f;
            }
        }

        return penalty;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = num_jobs;
        cfg.dim2_default = num_ops;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = time_horizon - 1;
        cfg.row_mode = RowMode::Fixed; // rows are fixed-length operation sequences
        fill_obj_config(cfg);
        return cfg;
    }

    // Machine table (int) plus duration table (float), both of length J*O.
    size_t shared_mem_bytes() const {
        int total = num_jobs * num_ops;
        return (size_t)total * (sizeof(int) + sizeof(float));
    }

    // Stage both tables into shared memory (machine ints first, then duration
    // floats) and repoint the device pointers at them.
    // NOTE(review): assumes the caller issues __syncthreads() before the staged
    // tables are read — confirm at the call site.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int total = num_jobs * num_ops;
        int* sm = reinterpret_cast<int*>(smem);
        for (int i = tid; i < total; i += bsz) sm[i] = d_machine[i];
        d_machine = sm;

        float* sd = reinterpret_cast<float*>(sm + total);
        for (int i = tid; i < total; i += bsz) sd[i] = d_duration[i];
        d_duration = sd;
    }

    // Allocate device buffers and upload the machine/duration tables.
    static JSPProblem create(const int* h_machine, const float* h_duration,
                             int num_jobs, int num_ops, int num_machines,
                             int time_horizon) {
        JSPProblem prob;
        prob.num_jobs = num_jobs;
        prob.num_ops = num_ops;
        prob.num_machines = num_machines;
        prob.time_horizon = time_horizon;

        int total = num_jobs * num_ops;
        int* dm;
        CUDA_CHECK(cudaMalloc(&dm, sizeof(int) * total));
        CUDA_CHECK(cudaMemcpy(dm, h_machine, sizeof(int) * total, cudaMemcpyHostToDevice));
        prob.d_machine = dm;

        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * total));
        CUDA_CHECK(cudaMemcpy(dd, h_duration, sizeof(float) * total, cudaMemcpyHostToDevice));
        prob.d_duration = dd;

        return prob;
    }

    void destroy() {
        if (d_machine) { cudaFree(const_cast<int*>(d_machine)); d_machine = nullptr; }
        if (d_duration) { cudaFree(const_cast<float*>(d_duration)); d_duration = nullptr; }
    }
};
|
||||
|
||||
// ============================================================
// Encoding B: Permutation multiset (operation-sequence encoding)
// ============================================================
// data[0] is a permutation of length J*O over values [0, J); each value
// appears O times. Scanning left to right, the t-th occurrence of value j
// schedules job j's t-th operation.
// Greedy decoding places each operation at its earliest feasible time
// (job precedence satisfied + machine idle).
||||
|
||||
struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
    const int* d_machine;    // machine required by each operation [J*O]
    const float* d_duration; // processing time of each operation [J*O]
    int num_jobs;
    int num_ops;
    int num_machines;

    // Capacity of the fixed-size per-thread decode buffers below.
    // BUG FIX: the original decode indexed 8-element local arrays with
    // unchecked num_jobs / num_machines (and unchecked machine ids from
    // d_machine), corrupting the stack for instances exceeding 8.
    static constexpr int MAX_JOBS = 8;
    static constexpr int MAX_MACHINES = 8;

    // Greedy decode: scan the permutation left to right; the t-th occurrence
    // of value j schedules job j's t-th operation at the earliest feasible
    // time (job-order precedence + machine availability). Returns the 1e9f
    // infeasibility sentinel for malformed solutions or instances that do
    // not fit the fixed decode buffers.
    __device__ float decode_and_makespan(const Sol& sol) const {
        // Guard the fixed-size stack buffers (previously unchecked).
        if (num_jobs > MAX_JOBS || num_machines > MAX_MACHINES) return 1e9f;

        int total = num_jobs * num_ops;
        int size = sol.dim2_sizes[0];
        if (size < total) return 1e9f;

        float job_avail[MAX_JOBS];      // earliest start of each job's next op
        float mach_avail[MAX_MACHINES]; // earliest idle time of each machine
        int job_next_op[MAX_JOBS];      // next unscheduled op index per job

        for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; }
        for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f;

        float makespan = 0.0f;
        for (int k = 0; k < total; k++) {
            int j = sol.data[0][k];
            if (j < 0 || j >= num_jobs) return 1e9f;
            int op = job_next_op[j];
            if (op >= num_ops) continue; // this job is fully scheduled

            int flat = j * num_ops + op;
            int m = d_machine[flat];
            // Guard against corrupt instance data (protects mach_avail[]).
            if (m < 0 || m >= num_machines) return 1e9f;
            float dur = d_duration[flat];

            // earliest start = max(job predecessor finish, machine idle time)
            float start = fmaxf(job_avail[j], mach_avail[m]);
            float end = start + dur;

            job_avail[j] = end;
            mach_avail[m] = end;
            job_next_op[j] = op + 1;

            if (end > makespan) makespan = end;
        }

        return makespan;
    }

    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: makespan
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return decode_and_makespan(sol);
            default: return 0.0f;
        }
    }

    // Greedy decoding always produces a feasible schedule, so no penalty.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = num_jobs * num_ops;
        cfg.perm_repeat_count = num_ops; // each job id appears num_ops times
        fill_obj_config(cfg);
        return cfg;
    }

    // Both instance tables (machine ids + durations) cached in shared memory.
    size_t shared_mem_bytes() const {
        int total = num_jobs * num_ops;
        return (size_t)total * (sizeof(int) + sizeof(float));
    }

    // Cooperatively stage both tables into shared memory, then rebind the
    // device pointers so subsequent reads hit the shared copies.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int total = num_jobs * num_ops;
        int* sm = reinterpret_cast<int*>(smem);
        for (int i = tid; i < total; i += bsz) sm[i] = d_machine[i];
        d_machine = sm;

        float* sd = reinterpret_cast<float*>(sm + total);
        for (int i = tid; i < total; i += bsz) sd[i] = d_duration[i];
        d_duration = sd;
    }

    // Upload instance data to the GPU and build a ready-to-use problem.
    static JSPPermProblem create(const int* h_machine, const float* h_duration,
                                 int num_jobs, int num_ops, int num_machines) {
        JSPPermProblem prob;
        prob.num_jobs = num_jobs;
        prob.num_ops = num_ops;
        prob.num_machines = num_machines;

        int total = num_jobs * num_ops;
        int* dm;
        CUDA_CHECK(cudaMalloc(&dm, sizeof(int) * total));
        CUDA_CHECK(cudaMemcpy(dm, h_machine, sizeof(int) * total, cudaMemcpyHostToDevice));
        prob.d_machine = dm;

        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * total));
        CUDA_CHECK(cudaMemcpy(dd, h_duration, sizeof(float) * total, cudaMemcpyHostToDevice));
        prob.d_duration = dd;

        return prob;
    }

    // Free the device arrays; safe to call more than once.
    void destroy() {
        if (d_machine) { cudaFree(const_cast<int*>(d_machine)); d_machine = nullptr; }
        if (d_duration) { cudaFree(const_cast<float*>(d_duration)); d_duration = nullptr; }
    }
};
|
||||
88
python/cugenopt/include/problems/knapsack.cuh
Normal file
88
python/cugenopt/include/problems/knapsack.cuh
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
/**
|
||||
* knapsack.cuh - 0-1 背包问题
|
||||
*
|
||||
* 继承 ProblemBase,使用 ObjDef 目标注册机制
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
    // Instance data (d_weights holds item weights, not objective weights).
    const float* d_weights;
    const float* d_values;
    float capacity;
    int n;

    // ---- objective: total value of the selected items ----
    __device__ float calc_total_value(const Sol& sol) const {
        const int* picks = sol.data[0];
        const int count = sol.dim2_sizes[0];
        float value_sum = 0.0f;
        for (int item = 0; item < count; item++) {
            if (picks[item]) value_sum += d_values[item];
        }
        return value_sum;
    }

    // ---- objective registration (OBJ_DEFS and compute_obj stay in sync) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Maximize, 1.0f, 0.0f}, // case 0: calc_total_value
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        if (idx == 0) return calc_total_value(sol); // OBJ_DEFS[0]
        return 0.0f;
    }

    // Penalty: amount by which the selected weight exceeds the capacity
    // (zero when the knapsack constraint holds).
    __device__ float compute_penalty(const Sol& sol) const {
        const int* picks = sol.data[0];
        const int count = sol.dim2_sizes[0];
        float weight_sum = 0.0f;
        for (int item = 0; item < count; item++) {
            if (picks[item]) weight_sum += d_weights[item];
        }
        const float excess = weight_sum - capacity;
        return (excess > 0.0f) ? excess : 0.0f;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Binary;
        cfg.dim1 = 1;
        cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // ---- shared memory interface: weights + values, n floats each ----
    size_t shared_mem_bytes() const {
        return 2 * (size_t)n * sizeof(float);
    }

    // Stage both arrays into shared memory, then rebind the device pointers
    // so subsequent reads hit the staged copies.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* shared_w = reinterpret_cast<float*>(smem);
        float* shared_v = shared_w + n;
        for (int i = tid; i < n; i += bsz) {
            shared_w[i] = d_weights[i];
            shared_v[i] = d_values[i];
        }
        d_weights = shared_w;
        d_values = shared_v;
    }

    // Copy host-side weights/values to the GPU and build the problem.
    static KnapsackProblem create(const float* hw, const float* hv, int n, float cap) {
        KnapsackProblem prob;
        prob.n = n;
        prob.capacity = cap;
        float* dev_w;
        float* dev_v;
        CUDA_CHECK(cudaMalloc(&dev_w, sizeof(float) * n));
        CUDA_CHECK(cudaMalloc(&dev_v, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(dev_w, hw, sizeof(float) * n, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dev_v, hv, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_weights = dev_w;
        prob.d_values = dev_v;
        return prob;
    }

    // Free the device arrays; safe to call repeatedly.
    void destroy() {
        if (d_weights) cudaFree(const_cast<float*>(d_weights));
        if (d_values) cudaFree(const_cast<float*>(d_values));
        d_weights = nullptr;
        d_values = nullptr;
    }
};
|
||||
83
python/cugenopt/include/problems/load_balance.cuh
Normal file
83
python/cugenopt/include/problems/load_balance.cuh
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
* load_balance.cuh - 离散负载均衡问题(Integer 编码验证)
|
||||
*
|
||||
* N 个任务分配到 M 台机器,每个任务有一个处理时间 p[i]。
|
||||
* 决策变量:data[0][i] ∈ [0, M-1],表示任务 i 分配到哪台机器。
|
||||
* 目标:最小化 makespan(最大机器负载)。
|
||||
*
|
||||
* 已知 NP-hard(等价于 multiprocessor scheduling / load balancing)。
|
||||
* LPT(最长处理时间优先)贪心可得 4/3 近似。
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
    const float* d_proc_time; // per-task processing time [N]
    int n;                    // number of tasks
    int m;                    // number of machines

    // Capacity of the fixed-size per-thread load accumulator below.
    // BUG FIX: the original wrote load[machine] for any machine in [0, m)
    // and read load[0..m), overflowing the 32-slot stack buffer whenever
    // m > 32 despite the "at most 32 machines" comment.
    static constexpr int MAX_MACHINES = 32;

    // Makespan = maximum total processing time assigned to any machine.
    // Returns the 1e9f infeasibility sentinel when the instance has more
    // machines than the fixed buffer supports, instead of corrupting memory.
    __device__ float calc_makespan(const Sol& sol) const {
        if (m > MAX_MACHINES) return 1e9f;
        float load[MAX_MACHINES] = {};
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int machine = sol.data[0][i];
            // out-of-range assignments are ignored (handled as no-ops)
            if (machine >= 0 && machine < m)
                load[machine] += d_proc_time[i];
        }
        float max_load = 0.0f;
        for (int j = 0; j < m; j++)
            if (load[j] > max_load) max_load = load[j];
        return max_load;
    }

    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: makespan
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_makespan(sol);
            default: return 0.0f;
        }
    }

    // Unconstrained: any task -> machine assignment is legal.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = 1; cfg.dim2_default = n;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = m - 1; // machine ids are 0..m-1
        fill_obj_config(cfg);
        return cfg;
    }

    // Processing-time table cached in shared memory.
    size_t shared_mem_bytes() const {
        return (size_t)n * sizeof(float);
    }

    // Stage processing times into shared memory and rebind d_proc_time.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sp = reinterpret_cast<float*>(smem);
        for (int i = tid; i < n; i += bsz) sp[i] = d_proc_time[i];
        d_proc_time = sp;
    }

    // Upload processing times to the GPU and build the problem.
    static LoadBalanceProblem create(const float* h_proc_time, int n, int m) {
        LoadBalanceProblem prob;
        prob.n = n; prob.m = m;
        float* dp;
        CUDA_CHECK(cudaMalloc(&dp, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(dp, h_proc_time, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_proc_time = dp;
        return prob;
    }

    // Free the device array; safe to call twice.
    void destroy() {
        if (d_proc_time) cudaFree(const_cast<float*>(d_proc_time));
        d_proc_time = nullptr;
    }
};
|
||||
84
python/cugenopt/include/problems/qap.cuh
Normal file
84
python/cugenopt/include/problems/qap.cuh
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
/**
|
||||
* qap.cuh - 二次分配问题 (Quadratic Assignment Problem)
|
||||
*
|
||||
* N 个设施分配到 N 个位置(排列编码)。
|
||||
* 决策变量:data[0][i] = 设施 i 分配到的位置。
|
||||
* 目标:Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
|
||||
*
|
||||
* 验证实例:自定义 5x5
|
||||
* flow: 设施间的物流量
|
||||
* dist: 位置间的距离
|
||||
* 已知最优 = 58
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
    const float* d_flow; // facility-to-facility flow matrix [N*N]
    const float* d_dist; // location-to-location distance matrix [N*N]
    int n;

    // cost = sum_{i,j} flow[i][j] * dist[loc(i)][loc(j)],
    // where loc(i) = data[0][i] is the location assigned to facility i.
    __device__ float calc_cost(const Sol& sol) const {
        const int* assignment = sol.data[0];
        const int size = sol.dim2_sizes[0];
        float cost = 0.0f;
        for (int i = 0; i < size; i++) {
            const int loc_i = assignment[i];
            for (int j = 0; j < size; j++)
                cost += d_flow[i * n + j] * d_dist[loc_i * n + assignment[j]];
        }
        return cost;
    }

    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_cost
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        if (idx == 0) return calc_cost(sol); // OBJ_DEFS[0]
        return 0.0f;
    }

    // No constraints beyond the permutation encoding itself.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // Both n*n matrices cached in shared memory.
    size_t shared_mem_bytes() const {
        return 2 * (size_t)n * n * sizeof(float);
    }

    // Stage flow + distance into shared memory, then rebind the pointers.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* shared_flow = reinterpret_cast<float*>(smem);
        float* shared_dist = shared_flow + n * n;
        const int cells = n * n;
        for (int i = tid; i < cells; i += bsz) {
            shared_flow[i] = d_flow[i];
            shared_dist[i] = d_dist[i];
        }
        d_flow = shared_flow;
        d_dist = shared_dist;
    }

    // Upload both matrices to the GPU and build the problem.
    static QAPProblem create(const float* h_flow, const float* h_dist, int n) {
        QAPProblem prob;
        prob.n = n;
        float* dev_flow;
        float* dev_dist;
        CUDA_CHECK(cudaMalloc(&dev_flow, sizeof(float) * n * n));
        CUDA_CHECK(cudaMalloc(&dev_dist, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dev_flow, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dev_dist, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_flow = dev_flow;
        prob.d_dist = dev_dist;
        return prob;
    }

    // Free both device matrices; safe to call repeatedly.
    void destroy() {
        if (d_flow) cudaFree(const_cast<float*>(d_flow));
        if (d_dist) cudaFree(const_cast<float*>(d_dist));
        d_flow = nullptr;
        d_dist = nullptr;
    }
};
|
||||
101
python/cugenopt/include/problems/schedule.cuh
Normal file
101
python/cugenopt/include/problems/schedule.cuh
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
/**
|
||||
* schedule.cuh - 排班问题
|
||||
*
|
||||
* 继承 ProblemBase,使用 ObjDef 目标注册机制
|
||||
* 2 个目标:总成本(min)+ 不公平度(min,权重更高)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
    // Cost of assigning employee e on day d, row-major [days * emps].
    const float* d_cost;
    int days, emps, required;

    // ---- objective computation ----
    // Total cost over all assigned (day, employee) shifts.
    __device__ float calc_total_cost(const Sol& sol) const {
        float total = 0.0f;
        for (int d = 0; d < days; d++)
            for (int e = 0; e < emps; e++)
                if (sol.data[d][e]) total += d_cost[d * emps + e];
        return total;
    }

    // Unfairness = spread (max - min) of per-employee workday counts.
    __device__ float calc_unfairness(const Sol& sol) const {
        int workdays[D2]; // D2 = 16 from the base template args; assumes emps <= D2 — TODO confirm
        for (int e = 0; e < emps; e++) workdays[e] = 0;
        for (int d = 0; d < days; d++)
            for (int e = 0; e < emps; e++)
                if (sol.data[d][e]) workdays[e]++;
        int max_w = 0, min_w = days;
        for (int e = 0; e < emps; e++) {
            if (workdays[e] > max_w) max_w = workdays[e];
            if (workdays[e] < min_w) min_w = workdays[e];
        }
        return (float)(max_w - min_w);
    }

    // ---- objective registration (OBJ_DEFS and compute_obj must match 1:1) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
        {ObjDir::Minimize, 5.0f, 0.0f}, // case 1: calc_unfairness (weighted higher)
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_cost(sol); // OBJ_DEFS[0]
            case 1: return calc_unfairness(sol); // OBJ_DEFS[1]
            default: return 0.0f;
        }
    }

    // Penalty: per-day absolute deviation of staffed headcount from `required`.
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        for (int d = 0; d < days; d++) {
            int count = 0;
            for (int e = 0; e < emps; e++)
                if (sol.data[d][e]) count++;
            int diff = count - required;
            penalty += (diff > 0) ? (float)diff : (float)(-diff); // |count - required|
        }
        return penalty;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Binary;
        cfg.dim1 = days; cfg.dim2_default = emps;
        cfg.row_mode = RowMode::Fixed;
        fill_obj_config(cfg);
        return cfg;
    }

    // Default full re-evaluation from the base class — no evaluate_move override needed.

    // ---- shared memory interface ----
    size_t shared_mem_bytes() const {
        return (size_t)days * emps * sizeof(float);
    }

    // Stage the cost matrix into shared memory and rebind d_cost to it.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sc = reinterpret_cast<float*>(smem);
        int total = days * emps;
        for (int i = tid; i < total; i += bsz) sc[i] = d_cost[i];
        d_cost = sc;
    }

    // Upload the host cost matrix to the GPU and build the problem.
    static ScheduleProblem create(const float* hc, int days, int emps, int req) {
        ScheduleProblem prob;
        prob.days = days; prob.emps = emps; prob.required = req;
        float* dc;
        CUDA_CHECK(cudaMalloc(&dc, sizeof(float)*days*emps));
        CUDA_CHECK(cudaMemcpy(dc, hc, sizeof(float)*days*emps, cudaMemcpyHostToDevice));
        prob.d_cost = dc;
        return prob;
    }

    // Free the device cost matrix; safe to call twice.
    void destroy() {
        if (d_cost) { cudaFree(const_cast<float*>(d_cost)); d_cost = nullptr; }
    }
};
|
||||
110
python/cugenopt/include/problems/tsp.cuh
Normal file
110
python/cugenopt/include/problems/tsp.cuh
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
/**
|
||||
* tsp.cuh - TSP 问题定义
|
||||
*
|
||||
* 继承 ProblemBase,使用 ObjDef 目标注册机制
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
    // Problem data
    const float* d_dist;   // device-side distance matrix [n*n]
    const float* h_dist;   // host-side distance matrix (kept for init_relation_matrix / heuristics)
    int n;

    // ---- objective computation ----
    // Total length of the closed tour in data[0] (wraps from last city back to first).
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        const int* route = sol.data[0];
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            total += d_dist[route[i] * n + route[(i + 1) % size]];
        return total;
    }

    // ---- objective registration (OBJ_DEFS and compute_obj must match 1:1) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol); // OBJ_DEFS[0]
            default: return 0.0f;
        }
    }

    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f; // TSP is unconstrained
    }

    // ---- config (encoding/dimensions; objective part filled by the base) ----
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // ---- shared memory interface ----
    static constexpr size_t SMEM_LIMIT = 48 * 1024;

    // Request shared memory for the full distance matrix only when it fits
    // under SMEM_LIMIT; otherwise 0 (matrix stays in global memory).
    size_t shared_mem_bytes() const {
        size_t need = (size_t)n * n * sizeof(float);
        return need <= SMEM_LIMIT ? need : 0;
    }

    // Actual footprint of the distance matrix, regardless of placement.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }

    // Cooperatively copy the distance matrix into shared memory, then
    // rebind d_dist so subsequent reads hit the shared copy.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sd = reinterpret_cast<float*>(smem);
        int total = n * n;
        for (int i = tid; i < total; i += bsz)
            sd[i] = d_dist[i];
        d_dist = sd;
    }

    // Distance prior: closer city pairs get higher G/O scores.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        float max_d = 0.0f; // max distance, used for normalization
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                if (h_dist[i * N + j] > max_d) max_d = h_dist[i * N + j];
        if (max_d <= 0.0f) return; // degenerate matrix: leave priors untouched
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                float proximity = 1.0f - h_dist[i * N + j] / max_d;
                G[i * N + j] = proximity * 0.3f; // modest initial signal, leaves room for EMA updates
                O[i * N + j] = proximity * 0.1f; // symmetric, direction-neutral hint
            }
    }

    // Expose the host distance matrix as a single heuristic matrix.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (max_count < 1 || !h_dist) return 0;
        out[0] = {h_dist, n};
        return 1;
    }

    // Upload the distance matrix to the GPU; also retains the caller's
    // host pointer (not owned) for the heuristic hooks above.
    static TSPProblem create(const float* h_dist_ptr, int n) {
        TSPProblem prob;
        prob.n = n;
        prob.h_dist = h_dist_ptr;
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_dist = dd;
        return prob;
    }

    // Free the device matrix and drop the (non-owned) host pointer.
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        h_dist = nullptr;
    }
};
|
||||
107
python/cugenopt/include/problems/tsp_large.cuh
Normal file
107
python/cugenopt/include/problems/tsp_large.cuh
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
/**
|
||||
* tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市)
|
||||
*
|
||||
* 继承 ProblemBase,逻辑与 tsp.cuh 一致,仅 D2 上限不同
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
    const float* d_dist; // device distance matrix [n*n]
    const float* h_dist; // host distance matrix (kept for priors/heuristics)
    int n;

    // ---- objective: total length of the closed tour ----
    __device__ float calc_total_distance(const Sol& sol) const {
        const int* tour = sol.data[0];
        const int len = sol.dim2_sizes[0];
        float sum = 0.0f;
        for (int pos = 0; pos < len; pos++) {
            const int from = tour[pos];
            const int to = tour[(pos + 1) % len]; // wrap back to the start
            sum += d_dist[from * n + to];
        }
        return sum;
    }

    // ---- objective registration ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        if (idx == 0) return calc_total_distance(sol); // OBJ_DEFS[0]
        return 0.0f;
    }

    // Unconstrained problem.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    static constexpr size_t SMEM_LIMIT = 48 * 1024;

    // Use shared memory for the distance matrix only when it fits.
    size_t shared_mem_bytes() const {
        const size_t bytes = (size_t)n * n * sizeof(float);
        return (bytes <= SMEM_LIMIT) ? bytes : 0;
    }

    // Footprint of the distance matrix, whether or not it lands in smem.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }

    // Stage the matrix into shared memory, then rebind d_dist to it.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* staged = reinterpret_cast<float*>(smem);
        const int cells = n * n;
        for (int i = tid; i < cells; i += bsz) staged[i] = d_dist[i];
        d_dist = staged;
    }

    // Distance prior: closer city pairs receive higher G/O scores.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        float max_d = 0.0f;
        const int cells = N * N;
        for (int idx = 0; idx < cells; idx++)
            if (h_dist[idx] > max_d) max_d = h_dist[idx];
        if (max_d <= 0.0f) return;
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                const float proximity = 1.0f - h_dist[i * N + j] / max_d;
                G[i * N + j] = proximity * 0.3f;
                O[i * N + j] = proximity * 0.1f;
            }
        }
    }

    // Expose the host distance matrix as a single heuristic matrix.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (!h_dist || max_count < 1) return 0;
        out[0] = {h_dist, n};
        return 1;
    }

    // Upload the distance matrix to the GPU; retains the host pointer (not owned).
    static TSPLargeProblem create(const float* h_dist_ptr, int n) {
        TSPLargeProblem prob;
        prob.n = n;
        prob.h_dist = h_dist_ptr;
        float* dev_mat;
        CUDA_CHECK(cudaMalloc(&dev_mat, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dev_mat, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_dist = dev_mat;
        return prob;
    }

    // Free the device matrix and drop the (non-owned) host pointer.
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        h_dist = nullptr;
    }
};
|
||||
99
python/cugenopt/include/problems/tsp_xlarge.cuh
Normal file
99
python/cugenopt/include/problems/tsp_xlarge.cuh
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
/**
|
||||
* tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市)
|
||||
*
|
||||
* 继承 ProblemBase,逻辑与 tsp_large.cuh 一致,D2=512
|
||||
* 注意:距离矩阵 512×512×4B = 1MB,远超 48KB shared memory
|
||||
* 因此 shared_mem_bytes() 返回 0,距离矩阵留在 global memory
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
|
||||
struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
    const float* d_dist; // device-side distance matrix [n*n]
    const float* h_dist; // host-side distance matrix (for init_relation_matrix / heuristics)
    int n;

    // Total length of the closed tour in data[0] (wraps last -> first).
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        const int* route = sol.data[0];
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            total += d_dist[route[i] * n + route[(i + 1) % size]];
        return total;
    }

    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol);
            default: return 0.0f;
        }
    }

    // TSP is unconstrained.
    __device__ float compute_penalty(const Sol& sol) const { return 0.0f; }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }

    // Distance matrix is too large for shared memory (512*512*4B = 1MB >> 48KB),
    // so it stays in global memory and load_shared is a no-op.
    size_t shared_mem_bytes() const { return 0; }
    __device__ void load_shared(char*, int, int) {}

    // Actual footprint of the distance matrix.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }

    // Initialize the G/O priors from the distance matrix: closer -> higher score.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        // find the max distance for normalization
        float max_d = 0.0f;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                if (h_dist[i * N + j] > max_d) max_d = h_dist[i * N + j];
        if (max_d <= 0.0f) return; // degenerate matrix: leave priors untouched

        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                // close pair -> high G (strong grouping tendency)
                float proximity = 1.0f - h_dist[i * N + j] / max_d;
                G[i * N + j] = proximity * 0.3f; // modest initial signal, leaves room for EMA
                // close pair -> a little O signal too (symmetric, direction-neutral)
                O[i * N + j] = proximity * 0.1f;
            }
        }
    }

    // Expose the host distance matrix as a single heuristic matrix.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (max_count < 1 || !h_dist) return 0;
        out[0] = {h_dist, n};
        return 1;
    }

    // Upload the distance matrix to the GPU; retains the host pointer (not owned).
    static TSPXLargeProblem create(const float* h_dist_ptr, int n) {
        TSPXLargeProblem prob;
        prob.n = n;
        prob.h_dist = h_dist_ptr; // keep the host pointer
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_dist = dd;
        return prob;
    }

    // Free the device matrix and drop the (non-owned) host pointer.
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        h_dist = nullptr;
    }
};
|
||||
184
python/cugenopt/include/problems/vrp.cuh
Normal file
184
python/cugenopt/include/problems/vrp.cuh
Normal file
|
|
@ -0,0 +1,184 @@
|
|||
/**
|
||||
* vrp.cuh - 容量约束车辆路径问题 (CVRP)
|
||||
*
|
||||
* 继承 ProblemBase,使用 ObjDef 目标注册机制
|
||||
* 多行编码(D1=K 条路线,分区初始化 + 跨行算子)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
#include "operators.cuh"
|
||||
#include "gpu_cache.cuh"
|
||||
|
||||
struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
    // GPU data
    const float* d_dist;   // distance matrix [(n+1)*(n+1)], row 0 / col 0 is the depot
    const float* d_demand; // per-customer demand [n]
    const float* h_dist;   // host distance matrix incl. depot (for init_relation_matrix)
    int n;                 // number of customers (excluding depot)
    int stride;            // row stride of d_dist = n + 1
    float capacity;
    int num_vehicles;
    int max_vehicles;
    GpuCache cache;        // optional route-distance memo cache (disabled by default)

    // ---- objective computation ----
    // Length of one route: depot -> customers in order -> depot.
    // Customer ids in `route` are 0-based; +1 maps them past the depot row.
    __device__ float compute_route_dist(const int* route, int size) const {
        if (size == 0) return 0.0f;
        float dist = 0.0f;
        int prev = 0; // start at the depot
        for (int j = 0; j < size; j++) {
            int node = route[j] + 1;
            dist += d_dist[prev * stride + node];
            prev = node;
        }
        dist += d_dist[prev * stride + 0]; // return to depot
        return dist;
    }

    // Route distance with optional memoization: hash the route, look it up
    // in the GPU cache, and only recompute (and insert) on a miss.
    // Hit/miss counters are bumped atomically for stats.
    __device__ float eval_route(const int* route, int size) const {
        if (size == 0) return 0.0f;
        if (!cache.keys) return compute_route_dist(route, size); // cache disabled

        uint64_t key = route_hash(route, size);
        float dist;
        if (cache_lookup(cache, key, dist)) {
            atomicAdd(cache.d_hits, 1);
            return dist;
        }
        dist = compute_route_dist(route, size);
        cache_insert(cache, key, dist);
        atomicAdd(cache.d_misses, 1);
        return dist;
    }

    // Total distance over all vehicle routes.
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        for (int r = 0; r < num_vehicles; r++)
            total += eval_route(sol.data[r], sol.dim2_sizes[r]);
        return total;
    }

    // ---- objective registration (OBJ_DEFS and compute_obj must match 1:1) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol); // OBJ_DEFS[0]
            default: return 0.0f;
        }
    }

    // Penalty: capacity overflow per route (x100) plus an extra x1000 term
    // for each route beyond the allowed number of active vehicles.
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        int active = 0; // number of non-empty routes
        for (int r = 0; r < num_vehicles; r++) {
            int size = sol.dim2_sizes[r];
            if (size == 0) continue;
            active++;
            float load = 0.0f;
            for (int j = 0; j < size; j++)
                load += d_demand[sol.data[r][j]];
            if (load > capacity)
                penalty += (load - capacity) * 100.0f;
        }
        if (active > max_vehicles)
            penalty += (float)(active - max_vehicles) * 1000.0f;
        return penalty;
    }

    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = num_vehicles;
        cfg.dim2_default = 0; // row sizes come from the partition, not a fixed width
        fill_obj_config(cfg);
        cfg.cross_row_prob = 0.3f;          // enable cross-route operators
        cfg.row_mode = RowMode::Partition;  // customers partitioned across routes
        cfg.total_elements = n;
        return cfg;
    }

    // ---- shared memory interface ----
    static constexpr size_t SMEM_LIMIT = 48 * 1024;

    // Request smem for distance matrix + demands only when both fit.
    size_t shared_mem_bytes() const {
        size_t dist_bytes = (size_t)stride * stride * sizeof(float);
        size_t demand_bytes = (size_t)n * sizeof(float);
        size_t total = dist_bytes + demand_bytes;
        return total <= SMEM_LIMIT ? total : 0;
    }

    size_t working_set_bytes() const {
        return (size_t)stride * stride * sizeof(float) + (size_t)n * sizeof(float);
    }

    // Stage distances then demands into shared memory and rebind the pointers.
    // NOTE(review): assumes the framework only calls this when
    // shared_mem_bytes() returned non-zero — confirm at the call site.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sd = reinterpret_cast<float*>(smem);
        int dist_size = stride * stride;
        for (int i = tid; i < dist_size; i += bsz) sd[i] = d_dist[i];
        d_dist = sd;
        float* sdem = sd + dist_size;
        for (int i = tid; i < n; i += bsz) sdem[i] = d_demand[i];
        d_demand = sdem;
    }

    // Turn on the route-distance memo cache with the given capacity.
    void enable_cache(int cap = 65536) { cache = GpuCache::allocate(cap); }
    void print_cache_stats() const { cache.print_stats(); }

    // Distance prior: nearby customer pairs get higher G/O scores.
    // Note: h_dist includes the depot (stride x stride); element i in [0, n)
    // corresponds to node i+1, so indexing skips the depot row/column.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        float max_d = 0.0f;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                float d = h_dist[(i + 1) * stride + (j + 1)]; // skip depot
                if (d > max_d) max_d = d;
            }
        if (max_d <= 0.0f) return; // degenerate matrix: leave priors untouched
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                float d = h_dist[(i + 1) * stride + (j + 1)];
                float proximity = 1.0f - d / max_d;
                G[i * N + j] = proximity * 0.3f;
                O[i * N + j] = proximity * 0.1f;
            }
    }

    // Upload distances (incl. depot) and demands to the GPU; cache starts
    // disabled; the host distance pointer is retained (not owned).
    static VRPProblem create(const float* h_dist_ptr, const float* h_demand,
                             int n, float capacity,
                             int num_vehicles, int max_vehicles) {
        VRPProblem prob;
        prob.n = n;
        prob.stride = n + 1;
        prob.capacity = capacity;
        prob.num_vehicles = num_vehicles;
        prob.max_vehicles = max_vehicles;
        prob.cache = GpuCache::disabled();
        prob.h_dist = h_dist_ptr;

        int n_nodes = n + 1; // customers + depot
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes));
        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n_nodes * n_nodes, cudaMemcpyHostToDevice));
        prob.d_dist = dd;

        float* ddem;
        CUDA_CHECK(cudaMalloc(&ddem, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(ddem, h_demand, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_demand = ddem;

        return prob;
    }

    // Free device arrays and the cache; safe to call more than once.
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        if (d_demand) { cudaFree(const_cast<float*>(d_demand)); d_demand = nullptr; }
        h_dist = nullptr;
        cache.destroy();
    }
};
|
||||
192
python/cugenopt/include/problems/vrptw.cuh
Normal file
192
python/cugenopt/include/problems/vrptw.cuh
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
/**
 * vrptw.cuh - Vehicle Routing Problem with Time Windows (VRPTW)
 *
 * Extends CVRP with time-window constraints.
 * Encoding: multi-row permutation partition (same as CVRP);
 *           data[r][j] = j-th customer of route r.
 * Objective: minimize total distance.
 * Constraints: (a) vehicle capacity, (b) time windows (arrival must be
 *              <= latest; arriving early waits until earliest).
 *
 * Validation instance: 8 customers / 3 vehicles, hand-crafted coordinates
 * and time windows guaranteeing a known feasible solution.
 */

#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"

struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
    const float* d_dist;      // distance matrix [(n+1)*(n+1)] (depot included, index 0)
    const float* d_demand;    // customer demands [n]
    const float* d_earliest;  // earliest service times [n+1] (depot included)
    const float* d_latest;    // latest service times [n+1] (depot included)
    const float* d_service;   // service durations [n+1] (depot included)
    int n;                    // number of customers (depot excluded)
    int stride;               // n + 1, row stride of the distance matrix
    float capacity;           // vehicle capacity
    int num_vehicles;         // number of route rows in a solution
    int max_vehicles;         // max routes allowed to be non-empty

    // Distance of one route: depot -> customers (0-based, shifted by +1
    // to matrix indices) -> back to depot. Empty routes cost 0.
    __device__ float compute_route_dist(const int* route, int size) const {
        if (size == 0) return 0.0f;
        float dist = 0.0f;
        int prev = 0;                 // start at the depot
        for (int j = 0; j < size; j++) {
            int node = route[j] + 1;  // customer index -> matrix index
            dist += d_dist[prev * stride + node];
            prev = node;
        }
        dist += d_dist[prev * stride + 0];  // return leg to the depot
        return dist;
    }

    // Sum of all route distances in the solution.
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        for (int r = 0; r < num_vehicles; r++)
            total += compute_route_dist(sol.data[r], sol.dim2_sizes[r]);
        return total;
    }

    // Single objective: minimize total distance (weight 1).
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol);
            default: return 0.0f;
        }
    }

    // Constraint-violation penalty:
    //  - capacity overflow per route        (x100)
    //  - time-window lateness per visit     (x50)
    //  - late return to the depot           (x50)
    //  - too many non-empty routes          (x1000 per extra route)
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        int active = 0;
        for (int r = 0; r < num_vehicles; r++) {
            int size = sol.dim2_sizes[r];
            if (size == 0) continue;
            active++;

            // Capacity constraint
            float load = 0.0f;
            for (int j = 0; j < size; j++)
                load += d_demand[sol.data[r][j]];
            if (load > capacity)
                penalty += (load - capacity) * 100.0f;

            // Time-window constraint: simulate driving the route
            float time = 0.0f;
            int prev = 0;
            for (int j = 0; j < size; j++) {
                int node = sol.data[r][j] + 1;
                float travel = d_dist[prev * stride + node];
                time += travel;
                // Early arrival waits until the window opens
                if (time < d_earliest[node])
                    time = d_earliest[node];
                // Late arrival is penalized (service still proceeds)
                if (time > d_latest[node])
                    penalty += (time - d_latest[node]) * 50.0f;
                time += d_service[node];
                prev = node;
            }
            // Depot's own closing time applies to the return leg
            float return_time = time + d_dist[prev * stride + 0];
            if (return_time > d_latest[0])
                penalty += (return_time - d_latest[0]) * 50.0f;
        }
        if (active > max_vehicles)
            penalty += (float)(active - max_vehicles) * 1000.0f;
        return penalty;
    }

    // Solver-facing configuration: partitioned multi-row permutation over
    // n customers, with cross-row moves enabled.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = num_vehicles;
        cfg.dim2_default = 0;      // row sizes come from the partition, not a fixed width
        fill_obj_config(cfg);
        cfg.cross_row_prob = 0.3f;
        cfg.row_mode = RowMode::Partition;
        cfg.total_elements = n;
        return cfg;
    }

    static constexpr size_t SMEM_LIMIT = 48 * 1024;  // portable shared-memory budget

    // Shared-memory request: distance matrix plus the four auxiliary
    // arrays. (n+1)*4 floats slightly over-covers demand(n) +
    // earliest/latest/service(n+1 each). Returns 0 when it won't fit,
    // which disables the shared path.
    size_t shared_mem_bytes() const {
        size_t dist_bytes = (size_t)stride * stride * sizeof(float);
        size_t aux_bytes = (size_t)(n + 1) * 4 * sizeof(float); // demand(n) + earliest/latest/service(n+1 each)
        size_t total = dist_bytes + aux_bytes;
        return total <= SMEM_LIMIT ? total : 0;
    }

    // Bytes of problem data touched per evaluation (used for sizing).
    size_t working_set_bytes() const {
        return (size_t)stride * stride * sizeof(float) + (size_t)(n + 1) * 4 * sizeof(float);
    }

    // Cooperative copy of all problem arrays into shared memory, then
    // rebind the device pointers to the shared copies. NOTE(review):
    // no __syncthreads() here — presumably the caller synchronizes after
    // this returns; confirm against the framework's load_shared contract.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sd = reinterpret_cast<float*>(smem);
        int dist_size = stride * stride;
        for (int i = tid; i < dist_size; i += bsz) sd[i] = d_dist[i];
        d_dist = sd;

        float* sdem = sd + dist_size;
        for (int i = tid; i < n; i += bsz) sdem[i] = d_demand[i];
        d_demand = sdem;

        float* se = sdem + n;
        int nn = n + 1;
        for (int i = tid; i < nn; i += bsz) se[i] = d_earliest[i];
        d_earliest = se;

        float* sl = se + nn;
        for (int i = tid; i < nn; i += bsz) sl[i] = d_latest[i];
        d_latest = sl;

        float* ss = sl + nn;
        for (int i = tid; i < nn; i += bsz) ss[i] = d_service[i];
        d_service = ss;
    }

    // Host-side factory: uploads all five arrays to the GPU.
    // Host pointers are only read during this call and need not outlive it.
    static VRPTWProblem create(const float* h_dist, const float* h_demand,
                               const float* h_earliest, const float* h_latest,
                               const float* h_service,
                               int n, float capacity,
                               int num_vehicles, int max_vehicles) {
        VRPTWProblem prob;
        prob.n = n;
        prob.stride = n + 1;
        prob.capacity = capacity;
        prob.num_vehicles = num_vehicles;
        prob.max_vehicles = max_vehicles;

        int nn = n + 1;
        float *dd, *ddem, *de, *dl, *ds;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * nn * nn));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * nn * nn, cudaMemcpyHostToDevice));
        prob.d_dist = dd;

        CUDA_CHECK(cudaMalloc(&ddem, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(ddem, h_demand, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_demand = ddem;

        CUDA_CHECK(cudaMalloc(&de, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(de, h_earliest, sizeof(float) * nn, cudaMemcpyHostToDevice));
        prob.d_earliest = de;

        CUDA_CHECK(cudaMalloc(&dl, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(dl, h_latest, sizeof(float) * nn, cudaMemcpyHostToDevice));
        prob.d_latest = dl;

        CUDA_CHECK(cudaMalloc(&ds, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(ds, h_service, sizeof(float) * nn, cudaMemcpyHostToDevice));
        prob.d_service = ds;

        return prob;
    }

    // Free all device allocations. Idempotent (pointers nulled after free).
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        if (d_demand) { cudaFree(const_cast<float*>(d_demand)); d_demand = nullptr; }
        if (d_earliest) { cudaFree(const_cast<float*>(d_earliest)); d_earliest = nullptr; }
        if (d_latest) { cudaFree(const_cast<float*>(d_latest)); d_latest = nullptr; }
        if (d_service) { cudaFree(const_cast<float*>(d_service)); d_service = nullptr; }
    }
};
|
||||
599
python/cugenopt/jit.py
Normal file
599
python/cugenopt/jit.py
Normal file
|
|
@ -0,0 +1,599 @@
|
|||
"""
|
||||
JIT compiler for custom cuGenOpt problems.
|
||||
|
||||
Workflow:
|
||||
1. User provides CUDA code snippets (compute_obj, compute_penalty) + data arrays
|
||||
2. Python fills the .cu template with user code
|
||||
3. nvcc compiles to executable (cached by content hash)
|
||||
4. subprocess runs executable, parses JSON output
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import struct
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cugenopt.validation import (
|
||||
CuGenOptCompileError,
|
||||
CuGenOptValidationError,
|
||||
validate_cuda_snippet,
|
||||
validate_data_dict,
|
||||
validate_encoding,
|
||||
validate_positive_int,
|
||||
)
|
||||
|
||||
_TEMPLATE_PATH = Path(__file__).parent / "jit_template.cu"
|
||||
_CACHE_DIR = Path.home() / ".cache" / "cugenopt" / "jit"
|
||||
|
||||
|
||||
def _detect_framework_root() -> Path:
|
||||
"""Find the cuGenOpt framework root (contains core/types.cuh).
|
||||
|
||||
Search order:
|
||||
1. Bundled headers inside the installed package (pip install wheel)
|
||||
2. Development layout (source tree)
|
||||
3. CUGENOPT_ROOT env var
|
||||
"""
|
||||
pkg_dir = Path(__file__).parent # cugenopt/
|
||||
|
||||
# 1. Bundled headers (wheel layout: cugenopt/include/core/types.cuh)
|
||||
bundled = pkg_dir / "include"
|
||||
if (bundled / "core" / "types.cuh").exists():
|
||||
return bundled
|
||||
|
||||
# 2. Development layout (python/../prototype)
|
||||
dev_root = pkg_dir.parent # python/
|
||||
for rel in ["../prototype", "../../prototype"]:
|
||||
c = (dev_root / rel).resolve()
|
||||
if (c / "core" / "types.cuh").exists():
|
||||
return c
|
||||
|
||||
# 3. CUGENOPT_ROOT env var
|
||||
env_root = os.environ.get("CUGENOPT_ROOT")
|
||||
if env_root:
|
||||
p = Path(env_root) / "prototype"
|
||||
if (p / "core" / "types.cuh").exists():
|
||||
return p
|
||||
|
||||
raise RuntimeError(
|
||||
"Cannot find cuGenOpt framework headers. "
|
||||
"Set CUGENOPT_ROOT env var to the generic_solver directory, "
|
||||
"or reinstall: pip install cugenopt"
|
||||
)
|
||||
|
||||
|
||||
def _detect_cuda_arch() -> str:
|
||||
"""Detect GPU compute capability via nvidia-smi."""
|
||||
try:
|
||||
out = subprocess.check_output(
|
||||
["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
|
||||
stderr=subprocess.DEVNULL, text=True
|
||||
).strip().split("\n")[0]
|
||||
major, minor = out.strip().split(".")
|
||||
return f"sm_{major}{minor}"
|
||||
except Exception:
|
||||
return "sm_75"
|
||||
|
||||
|
||||
def _is_windows() -> bool:
|
||||
return os.name == "nt"
|
||||
|
||||
|
||||
def _nvcc_name() -> str:
    """Platform-specific nvcc executable file name."""
    suffix = ".exe" if _is_windows() else ""
    return "nvcc" + suffix
|
||||
|
||||
|
||||
def _find_nvcc() -> str:
    """Locate the nvcc compiler binary.

    Search order: system PATH, pip-installed CUDA wheels, then well-known
    CUDA Toolkit install locations.

    Raises:
        RuntimeError: when no candidate exists.
    """
    exe = _nvcc_name()

    # 1. System PATH
    on_path = shutil.which(exe)
    if on_path:
        return on_path

    # 2. pip-installed nvidia-cuda-nvcc / nvidia-cuda-nvcc-cu12
    import sys
    import site
    bases = [sys.prefix, *site.getsitepackages(), site.getusersitepackages()]
    rel_paths = [
        os.path.join("nvidia", "cuda_nvcc", "bin", exe),
        os.path.join("nvidia", "cu13", "bin", exe),
        os.path.join("nvidia", "cu12", "bin", exe),
        os.path.join("bin", exe),
    ]
    if _is_windows():
        rel_paths.append(os.path.join("Scripts", exe))
        rel_paths.append(os.path.join("Library", "bin", exe))
    for base in bases:
        for rel in rel_paths:
            candidate = os.path.join(base, rel)
            if os.path.isfile(candidate):
                return candidate

    # 3. Common system-wide install locations
    if _is_windows():
        env_candidate = os.path.join(os.environ.get("CUDA_PATH", ""),
                                     "bin", "nvcc.exe")
        if os.path.isfile(env_candidate):
            return env_candidate
        for ver in ["12.4", "12.6", "12.0", "11.8"]:
            candidate = os.path.join(
                "C:\\", "Program Files", "NVIDIA GPU Computing Toolkit",
                "CUDA", f"v{ver}", "bin", "nvcc.exe")
            if os.path.isfile(candidate):
                return candidate
    else:
        for candidate in [
            "/usr/local/cuda/bin/nvcc",
            "/usr/local/cuda-12.4/bin/nvcc",
            "/usr/local/cuda-12.6/bin/nvcc",
            "/usr/local/cuda-12.0/bin/nvcc",
            "/usr/local/cuda-11.8/bin/nvcc",
        ]:
            if os.path.isfile(candidate):
                return candidate

    raise RuntimeError(
        "nvcc not found. Install the CUDA compiler:\n"
        "  pip install nvidia-cuda-nvcc-cu12\n"
        "Or install CUDA Toolkit: https://developer.nvidia.com/cuda-downloads"
    )
|
||||
|
||||
|
||||
def _content_hash(source: str) -> str:
|
||||
return hashlib.sha256(source.encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
def _fallback_compile_without_bad_ops(
    custom_operators: list,
    encoding: str,
    template: str,
    replacements: dict,
    base_cmd: list,
    fw_root: Path,
    original_stderr: str,
) -> tuple:
    """Retry a failed compilation with all custom operators disabled.

    When the build that includes user-supplied operators fails, re-render the
    template with the custom-operator placeholders blanked and compile that
    baseline instead (reusing the content-hash cache). On success, warn which
    operators were disabled; if even the baseline fails, the custom operators
    were not the cause, so surface the ORIGINAL error.

    Args:
        custom_operators: operators present in the failing build (only their
            names are used, for the warning message).
        encoding: solution encoding name. Currently unused; kept for
            interface stability with callers.
        template: raw .cu template text.
        replacements: full placeholder -> text map from the failing build.
        base_cmd: nvcc command prefix (everything before input/-o/output).
        fw_root: framework header root. Currently unused; kept for
            interface stability with callers.
        original_stderr: stderr of the failed compilation.

    Returns:
        (source, exe_path, cu_path) of the successful baseline build.

    Raises:
        CuGenOptCompileError: if the baseline (no custom ops) also fails.
    """
    import warnings

    # Re-render the template with custom-operator hooks blanked out.
    baseline_replacements = dict(replacements)
    baseline_replacements["{{CUSTOM_OP_DEFINES}}"] = ""
    baseline_replacements["{{CUSTOM_OP_SWITCH}}"] = ""
    baseline_replacements["{{CUSTOM_OP_REGISTRY}}"] = ""

    baseline_source = template
    for key, val in baseline_replacements.items():
        baseline_source = baseline_source.replace(key, val)

    # Same content-hash caching scheme as the main compile path.
    h = _content_hash(baseline_source)
    cu_path = _CACHE_DIR / f"custom_{h}.cu"
    exe_suffix = ".exe" if _is_windows() else ""
    exe_path = _CACHE_DIR / f"custom_{h}{exe_suffix}"

    if not exe_path.exists():
        cu_path.write_text(baseline_source)
        cmd = base_cmd + [str(cu_path), "-o", str(exe_path)]
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode != 0:
            # Baseline fails too: the custom operators were not the problem.
            raise CuGenOptCompileError(original_stderr, str(cu_path))

    disabled_names = [op.name for op in custom_operators]
    warnings.warn(
        f"Custom operator(s) caused compilation failure. "
        f"Disabled: {', '.join(disabled_names)}. "
        f"Solving with built-in operators only.\n"
        f"Fix your operator code and retry. "
        f"Original error:\n{original_stderr[:500]}",
        RuntimeWarning,
        stacklevel=4,
    )
    return baseline_source, exe_path, cu_path
|
||||
|
||||
|
||||
def _build_custom_op_defines(switch_block: str, registry_block: str) -> str:
|
||||
"""Build #define to enable custom operator hooks in framework headers."""
|
||||
return "#define CUGENOPT_HAS_CUSTOM_OPS"
|
||||
|
||||
|
||||
def _write_binary_float(path: str, arr: np.ndarray):
|
||||
arr = np.ascontiguousarray(arr, dtype=np.float32)
|
||||
with open(path, "wb") as f:
|
||||
f.write(arr.tobytes())
|
||||
|
||||
|
||||
def _write_binary_int(path: str, arr: np.ndarray):
|
||||
arr = np.ascontiguousarray(arr, dtype=np.int32)
|
||||
with open(path, "wb") as f:
|
||||
f.write(arr.tobytes())
|
||||
|
||||
|
||||
class DataArray:
    """Metadata for one host array shipped to the JIT-compiled executable.

    Attributes:
        name: field name used in the generated CUDA struct.
        data: the underlying numpy array.
        dtype: C element type in generated code, "float" or "int".
        size: total element count of ``data``.
    """

    def __init__(self, name: str, data: np.ndarray, dtype: str = "float"):
        self.name, self.data, self.dtype = name, data, dtype
        self.size = data.size
|
||||
|
||||
|
||||
def _build_encoding_str(encoding: str) -> str:
|
||||
mapping = {
|
||||
"permutation": "EncodingType::Permutation",
|
||||
"binary": "EncodingType::Binary",
|
||||
"integer": "EncodingType::Integer",
|
||||
}
|
||||
return mapping.get(encoding.lower(), "EncodingType::Permutation")
|
||||
|
||||
|
||||
def _build_row_mode_str(row_mode: str) -> str:
|
||||
mapping = {
|
||||
"single": "RowMode::Single",
|
||||
"fixed": "RowMode::Fixed",
|
||||
"partition": "RowMode::Partition",
|
||||
}
|
||||
return mapping.get(row_mode.lower(), "RowMode::Single")
|
||||
|
||||
|
||||
def _build_obj_dir_str(direction: str) -> str:
|
||||
return "ObjDir::Maximize" if direction.lower().startswith("max") else "ObjDir::Minimize"
|
||||
|
||||
|
||||
def compile_and_solve(
    compute_obj: str,
    compute_penalty: str = "return 0.0f;",
    data: Optional[Dict[str, np.ndarray]] = None,
    int_data: Optional[Dict[str, np.ndarray]] = None,
    encoding: str = "permutation",
    dim1: int = 1,
    dim2: int = 64,
    n: Optional[int] = None,
    row_mode: str = "single",
    total_elements: int = 0,
    cross_row_prob: float = 0.0,
    perm_repeat_count: int = 1,
    value_lower: int = 0,
    value_upper: int = 1,
    objectives: Optional[List[Tuple[str, float]]] = None,
    shared_mem: Optional[str] = None,
    load_shared: Optional[str] = None,
    pop_size: int = 0,
    max_gen: int = 1000,
    time_limit: float = 0.0,
    seed: int = 42,
    use_aos: bool = False,
    sa_temp_init: float = 0.0,
    verbose: bool = False,
    framework_root: Optional[str] = None,
    cuda_arch: Optional[str] = None,
    custom_operators: Optional[List] = None,
) -> Dict[str, Any]:
    """
    JIT-compile and solve a custom optimization problem.

    Args:
        compute_obj: CUDA code for the compute_obj function body.
            Available variables: idx (objective index), sol (const Sol&),
            and any data fields you declared.
        compute_penalty: CUDA code for compute_penalty body.
        data: Dict of name -> numpy float32 array for problem data.
        int_data: Dict of name -> numpy int32 array for problem data.
        encoding: "permutation", "binary", or "integer".
        dim1: Number of rows in solution (1 for most problems).
        dim2: Max columns per row.
        n: Problem size (number of elements). If None, inferred from data.
        row_mode: "single", "fixed", or "partition".
        total_elements: For partition mode, total elements across all rows.
        cross_row_prob: Probability of cross-row operations.
        perm_repeat_count: For multiset permutation (JSP-style).
        value_lower, value_upper: Bounds for integer encoding.
        objectives: List of (direction, weight) tuples. Default: [("minimize", 1.0)].
        shared_mem: Expression for shared_mem_bytes() return value.
            Use '_n' for problem size. E.g. "(size_t)_n * _n * sizeof(float)"
        load_shared: CUDA code for load_shared body.
        pop_size, max_gen, time_limit, seed, use_aos, sa_temp_init, verbose:
            Solver configuration parameters.
        framework_root: Path to cuGenOpt framework. Auto-detected if None.
        cuda_arch: CUDA architecture (e.g. "sm_75"). Auto-detected if None.
        custom_operators: Optional list of CustomOperator instances to
            compile into the solver (disabled with a warning if they break
            the build).

    Returns:
        Dict with keys: objective, penalty, solution, elapsed_ms, generations,
        stop_reason, objectives.

    Raises:
        CuGenOptValidationError: on invalid snippets/parameters/data.
        CuGenOptCompileError: when nvcc rejects the generated source.
        RuntimeError: when nvcc/headers are missing, the executable fails,
            or no JSON result is produced.
    """
    if data is None:
        data = {}
    if int_data is None:
        int_data = {}
    if objectives is None:
        objectives = [("minimize", 1.0)]

    # --- Input validation ---
    compute_obj = validate_cuda_snippet(compute_obj, "compute_obj")
    if compute_penalty != "return 0.0f;":
        # The default body is trusted; only user-supplied bodies are vetted.
        compute_penalty = validate_cuda_snippet(compute_penalty, "compute_penalty")
    encoding = validate_encoding(encoding)
    dim1 = validate_positive_int(dim1, "dim1")
    dim2 = validate_positive_int(dim2, "dim2")
    max_gen = validate_positive_int(max_gen, "max_gen")
    if data:
        data = validate_data_dict(data, "float")
    if int_data:
        int_data = validate_data_dict(int_data, "int")

    # Infer n from data if not provided (first dimension of the first array).
    if n is None:
        for arr in data.values():
            if arr.ndim == 2:
                n = arr.shape[0]
                break
            elif arr.ndim == 1:
                n = arr.shape[0]
                break
        if n is None:
            n = dim2
    n = validate_positive_int(n, "n")

    # Framework root (bundled headers / dev tree / CUGENOPT_ROOT)
    fw_root = Path(framework_root) if framework_root else _detect_framework_root()

    # Read template
    template = _TEMPLATE_PATH.read_text()

    # Build struct data-field declarations and the upload metadata.
    all_data = []
    data_fields_lines = []
    for name, arr in data.items():
        all_data.append(DataArray(name, arr, "float"))
        data_fields_lines.append(f"    const float* {name};")
    for name, arr in int_data.items():
        all_data.append(DataArray(name, arr, "int"))
        data_fields_lines.append(f"    const int* {name};")
    data_fields = "\n".join(data_fields_lines) if data_fields_lines else "    // no data fields"

    # Build OBJ_DEFS initializer list
    obj_defs_parts = []
    for direction, weight in objectives:
        obj_defs_parts.append(f"{{{_build_obj_dir_str(direction)}, {weight}f, 0.0f}}")
    obj_defs = ", ".join(obj_defs_parts)

    # Build shared_mem_bytes() body: default requests room for every data
    # array, capped at the 48 KiB portable limit (0 disables shared memory).
    if shared_mem is None:
        total_bytes_parts = []
        for da in all_data:
            elem_size = "sizeof(float)" if da.dtype == "float" else "sizeof(int)"
            total_bytes_parts.append(f"(size_t){da.size} * {elem_size}")
        if total_bytes_parts:
            total_expr = " + ".join(total_bytes_parts)
            shared_mem_expr = f"size_t need = {total_expr};\n        return (need <= 48 * 1024) ? need : 0;"
        else:
            shared_mem_expr = "return 0;"
    else:
        shared_mem_expr = f"size_t need = {shared_mem};\n        return (need <= 48 * 1024) ? need : 0;"

    # Build load_shared() body: copy each array into shared memory laid out
    # back-to-back, then rebind the struct pointer to the shared copy.
    if load_shared is None:
        load_lines = []
        for i, da in enumerate(all_data):
            ctype = "float" if da.dtype == "float" else "int"
            ptr_name = f"s_{da.name}"
            if i == 0:
                load_lines.append(f"        {ctype}* {ptr_name} = reinterpret_cast<{ctype}*>(smem);")
            else:
                prev = all_data[i - 1]
                prev_ptr = f"s_{prev.name}"
                load_lines.append(f"        {ctype}* {ptr_name} = reinterpret_cast<{ctype}*>({prev_ptr} + {prev.size});")
            load_lines.append(f"        for (int i = tid; i < {da.size}; i += bsz) {ptr_name}[i] = {da.name}[i];")
            load_lines.append(f"        {da.name} = {ptr_name};")
        load_shared_body = "\n".join(load_lines) if load_lines else "        // no data to load"
    else:
        load_shared_body = load_shared

    # Build destroy() body: free every device allocation.
    destroy_lines = []
    for da in all_data:
        ctype = "float" if da.dtype == "float" else "int"
        destroy_lines.append(f"        if ({da.name}) cudaFree(const_cast<{ctype}*>({da.name}));")
    destroy_body = "\n".join(destroy_lines) if destroy_lines else "        // nothing to free"

    # Build data load body (main function: read binary files, cudaMalloc, cudaMemcpy)
    data_load_lines = []
    for da in all_data:
        if da.dtype == "float":
            data_load_lines.append(f'    snprintf(path, sizeof(path), "%s/{da.name}.bin", data_dir);')
            data_load_lines.append(f'    float* h_{da.name} = read_binary_floats(path, {da.size});')
            data_load_lines.append(f'    float* d_{da.name}; CUDA_CHECK(cudaMalloc(&d_{da.name}, sizeof(float) * {da.size}));')
            data_load_lines.append(f'    CUDA_CHECK(cudaMemcpy(d_{da.name}, h_{da.name}, sizeof(float) * {da.size}, cudaMemcpyHostToDevice));')
            data_load_lines.append(f'    prob.{da.name} = d_{da.name};')
            data_load_lines.append(f'    delete[] h_{da.name};')
        else:
            data_load_lines.append(f'    snprintf(path, sizeof(path), "%s/{da.name}.bin", data_dir);')
            data_load_lines.append(f'    int* h_{da.name} = read_binary_ints(path, {da.size});')
            data_load_lines.append(f'    int* d_{da.name}; CUDA_CHECK(cudaMalloc(&d_{da.name}, sizeof(int) * {da.size}));')
            data_load_lines.append(f'    CUDA_CHECK(cudaMemcpy(d_{da.name}, h_{da.name}, sizeof(int) * {da.size}, cudaMemcpyHostToDevice));')
            data_load_lines.append(f'    prob.{da.name} = d_{da.name};')
            data_load_lines.append(f'    delete[] h_{da.name};')
    data_load_body = "\n".join(data_load_lines) if data_load_lines else "    // no data to load"

    # Build solver configuration assignments
    config_lines = [
        f"    cfg.pop_size = {pop_size};",
        f"    cfg.max_gen = {max_gen};",
        f"    cfg.seed = {seed};",
        f"    cfg.verbose = {'true' if verbose else 'false'};",
        f"    cfg.use_aos = {'true' if use_aos else 'false'};",
    ]
    if time_limit > 0:
        config_lines.append(f"    cfg.time_limit_sec = {time_limit}f;")
    if sa_temp_init > 0:
        config_lines.append(f"    cfg.sa_temp_init = {sa_temp_init}f;")
    solver_config = "\n".join(config_lines)

    # Single-row solutions use the full width n; partition mode sizes rows
    # dynamically and defaults total_elements to n when unspecified.
    dim2_default = n if row_mode.lower() == "single" else 0
    if total_elements == 0 and row_mode.lower() == "partition":
        total_elements = n

    # Process custom operators (generates switch cases + registry calls)
    custom_op_defines_block = ""
    custom_op_switch = ""
    custom_op_registry = ""
    if custom_operators:
        from cugenopt.operators import generate_custom_operator_cuda
        switch_block, registry_block, filtered = generate_custom_operator_cuda(
            custom_operators, encoding
        )
        if filtered:
            custom_op_switch = switch_block
            custom_op_registry = registry_block
            custom_op_defines_block = _build_custom_op_defines(switch_block, registry_block)

    # Fill template
    source = template
    replacements = {
        "{{D1}}": str(dim1),
        "{{D2}}": str(dim2),
        "{{DATA_FIELDS}}": data_fields,
        "{{OBJ_DEFS}}": obj_defs,
        "{{COMPUTE_OBJ}}": compute_obj,
        "{{COMPUTE_PENALTY}}": compute_penalty,
        "{{ENCODING}}": _build_encoding_str(encoding),
        "{{DIM1}}": str(dim1),
        "{{DIM2_DEFAULT}}": str(dim2_default),
        "{{ROW_MODE}}": _build_row_mode_str(row_mode),
        "{{TOTAL_ELEMENTS}}": str(total_elements),
        "{{CROSS_ROW_PROB}}": f"{cross_row_prob}f",
        "{{PERM_REPEAT_COUNT}}": str(perm_repeat_count),
        "{{VALUE_LOWER}}": str(value_lower),
        "{{VALUE_UPPER}}": str(value_upper),
        "{{SHARED_MEM_EXPR}}": shared_mem_expr,
        "{{LOAD_SHARED_BODY}}": load_shared_body,
        "{{DESTROY_BODY}}": destroy_body,
        "{{DATA_LOAD_BODY}}": data_load_body,
        "{{SOLVER_CONFIG}}": solver_config,
        "{{NUM_OBJ}}": str(len(objectives)),
        "{{CUSTOM_OP_DEFINES}}": custom_op_defines_block,
        "{{CUSTOM_OP_SWITCH}}": custom_op_switch,
        "{{CUSTOM_OP_REGISTRY}}": custom_op_registry,
    }
    for key, val in replacements.items():
        source = source.replace(key, val)

    # Content-hash caching: identical generated source -> reuse executable.
    h = _content_hash(source)
    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
    cu_path = _CACHE_DIR / f"custom_{h}.cu"
    exe_suffix = ".exe" if _is_windows() else ""
    exe_path = _CACHE_DIR / f"custom_{h}{exe_suffix}"

    # Compile if needed
    if not exe_path.exists():
        cu_path.write_text(source)

        if cuda_arch is None:
            cuda_arch = _detect_cuda_arch()
        nvcc = _find_nvcc()

        cmd = [
            nvcc, "-O2", "-std=c++17", "--extended-lambda", "--expt-relaxed-constexpr",
            f"-arch={cuda_arch}",
            f"-I{fw_root}",
            f"-I{fw_root / 'core'}",
            f"-I{fw_root / 'problems'}",
            str(cu_path), "-o", str(exe_path),
        ]
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode != 0:
            if custom_operators and len(custom_operators) > 0:
                # Custom operators may be the culprit: retry without them.
                source, exe_path, cu_path = _fallback_compile_without_bad_ops(
                    custom_operators, encoding, template, replacements,
                    cmd[:-3], fw_root, proc.stderr,
                )
            else:
                raise CuGenOptCompileError(proc.stderr, str(cu_path))

    # Write data to a temp dir and run the compiled solver on it.
    with tempfile.TemporaryDirectory(prefix="cugenopt_") as tmpdir:
        # Write n
        with open(os.path.join(tmpdir, "n.bin"), "wb") as f:
            f.write(struct.pack("i", n))

        # Write data arrays
        for da in all_data:
            bin_path = os.path.join(tmpdir, f"{da.name}.bin")
            if da.dtype == "float":
                _write_binary_float(bin_path, da.data)
            else:
                _write_binary_int(bin_path, da.data)

        # Run (generous timeout: compile cache warmup + solver time budget)
        proc = subprocess.run(
            [str(exe_path), tmpdir],
            capture_output=True, text=True, timeout=max(300, time_limit * 3 + 60)
        )
        if proc.returncode != 0:
            raise RuntimeError(f"Execution failed:\n{proc.stderr}")

    # Parse JSON output (find the last line that starts with '{')
    output_lines = proc.stdout.strip().split("\n")
    json_line = None
    for line in reversed(output_lines):
        line = line.strip()
        if line.startswith("{"):
            json_line = line
            break

    if json_line is None:
        raise RuntimeError(
            f"No JSON output found.\nstdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
        )

    result = json.loads(json_line)

    # Convert solution lists to numpy arrays
    if "solution" in result:
        result["solution"] = [np.array(row, dtype=np.int32) for row in result["solution"]]

    return result
|
||||
|
||||
|
||||
def clear_cache():
    """Remove all cached JIT compilations (generated .cu files and executables).

    Deletes the entire per-user cache directory; the next solve for each
    problem type will recompile from scratch (~8s).
    """
    if _CACHE_DIR.exists():
        shutil.rmtree(_CACHE_DIR)
    print(f"Cleared JIT cache: {_CACHE_DIR}")
|
||||
204
python/cugenopt/jit_template.cu
Normal file
204
python/cugenopt/jit_template.cu
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
/**
|
||||
* JIT template — auto-generated by cugenopt.solve_custom()
|
||||
*
|
||||
* Placeholders (replaced by Python at runtime):
|
||||
* {{D1}}, {{D2}} — Solution dimensions
|
||||
* {{ENCODING}} — EncodingType enum value
|
||||
* {{ROW_MODE}} — RowMode enum value
|
||||
* {{DIM1}}, {{DIM2_DEFAULT}}, {{TOTAL_ELEMENTS}}
|
||||
* {{VALUE_LOWER}}, {{VALUE_UPPER}}
|
||||
* {{CROSS_ROW_PROB}}, {{PERM_REPEAT_COUNT}}
|
||||
* {{OBJ_DEFS}} — ObjDef array initializer
|
||||
* {{DATA_FIELDS}} — struct field declarations
|
||||
* {{COMPUTE_OBJ}} — user's compute_obj body
|
||||
* {{COMPUTE_PENALTY}} — user's compute_penalty body
|
||||
* {{SHARED_MEM_EXPR}} — shared_mem_bytes() return expression
|
||||
* {{LOAD_SHARED_BODY}} — load_shared() body
|
||||
* {{DESTROY_BODY}} — destroy() body
|
||||
* {{DATA_LOAD_BODY}} — main() data loading code
|
||||
* {{SOLVER_CONFIG}} — SolverConfig field assignments
|
||||
* {{NUM_OBJ}} — number of objectives
|
||||
* {{CUSTOM_OP_DEFINES}} — #define CUGENOPT_HAS_CUSTOM_OPS (or empty)
|
||||
* {{CUSTOM_OP_SWITCH}} — switch cases for execute_custom_op (or empty)
|
||||
* {{CUSTOM_OP_REGISTRY}} — add() calls for register_custom_operators (or empty)
|
||||
*/
|
||||
|
||||
{{CUSTOM_OP_DEFINES}}
|
||||
|
||||
#include "types.cuh"
|
||||
#include "cuda_utils.cuh"
|
||||
|
||||
// Forward declaration: the framework headers included below reference the
// problem type through the custom-operator hook before it is defined.
struct CustomProblem;

#ifdef CUGENOPT_HAS_CUSTOM_OPS
// Hook declared before solver.cuh so the framework can call it; the
// definition (with the generated switch) follows after CustomProblem.
template<typename Sol>
__device__ inline bool execute_custom_op(int seq_id, Sol& sol, int dim1,
                                         EncodingType encoding, curandState* rng,
                                         int val_lb, int val_ub,
                                         const void* prob_data);
#endif
|
||||
|
||||
#include "solver.cuh"
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
// The generated problem type. All {{...}} placeholders are substituted by
// cugenopt.jit.compile_and_solve() before this file reaches nvcc.
struct CustomProblem : ProblemBase<CustomProblem, {{D1}}, {{D2}}> {
{{DATA_FIELDS}}
    int _n;   // problem size, read from n.bin by main()

    // One {direction, weight, ...} entry per objective.
    static constexpr ObjDef OBJ_DEFS[] = {
        {{OBJ_DEFS}}
    };

    // User-supplied objective body; `idx` selects which objective.
    __device__ float compute_obj(int idx, const Sol& sol) const {
        {{COMPUTE_OBJ}}
    }

    // User-supplied constraint-violation penalty body.
    __device__ float compute_penalty(const Sol& sol) const {
        {{COMPUTE_PENALTY}}
    }

    // Encoding / dimension / objective configuration, all baked in at
    // generation time.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = {{ENCODING}};
        cfg.dim1 = {{DIM1}};
        cfg.dim2_default = {{DIM2_DEFAULT}};
        cfg.row_mode = {{ROW_MODE}};
        cfg.total_elements = {{TOTAL_ELEMENTS}};
        cfg.cross_row_prob = {{CROSS_ROW_PROB}};
        cfg.perm_repeat_count = {{PERM_REPEAT_COUNT}};
        cfg.value_lower_bound = {{VALUE_LOWER}};
        cfg.value_upper_bound = {{VALUE_UPPER}};
        fill_obj_config(cfg);
        return cfg;
    }

    // Dynamic shared-memory request in bytes; 0 disables the shared path.
    size_t shared_mem_bytes() const {
        {{SHARED_MEM_EXPR}}
    }

    // Cooperative copy of problem data into shared memory.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        {{LOAD_SHARED_BODY}}
    }

    // Free device allocations made by main()'s data-loading code.
    void destroy() {
        {{DESTROY_BODY}}
    }
};
|
||||
|
||||
#ifdef CUGENOPT_HAS_CUSTOM_OPS
// Dispatch a JIT-injected custom operator by sequence id.
// Returns true when the operator modified `sol`, false for a no-op or an
// unknown id. The cases are generated from {{CUSTOM_OP_SWITCH}}; each case
// body is a user snippet that is required to `return` a bool itself.
template<typename Sol>
__device__ inline bool execute_custom_op(int seq_id, Sol& sol, int dim1,
                                         EncodingType encoding, curandState* rng,
                                         int val_lb, int val_ub,
                                         const void* prob_data) {
    // prob_data is always the CustomProblem instance in this template.
    const CustomProblem* prob = static_cast<const CustomProblem*>(prob_data);
    switch (seq_id) {
        {{CUSTOM_OP_SWITCH}}
        default:
            return false;
    }
}
#endif
|
||||
|
||||
#ifdef CUGENOPT_HAS_CUSTOM_OPS
// Register the JIT-injected custom operators with the sequence registry and
// normalize their initial weights into a probability distribution.
inline void register_custom_operators(SeqRegistry& reg) {
    // Append one entry (id, initial weight, weight cap); extra operators are
    // silently dropped once the registry holds MAX_SEQ entries.
    auto add = [&](int id, float w, float cap) {
        if (reg.count >= MAX_SEQ) return;
        reg.ids[reg.count] = id;
        reg.weights[reg.count] = w;
        reg.max_w[reg.count] = cap;
        reg.count++;
    };
    {{CUSTOM_OP_REGISTRY}}
    // Normalize so registered weights sum to 1 (skip if all were zero).
    float sum = 0.0f;
    for (int i = 0; i < reg.count; i++) sum += reg.weights[i];
    if (sum > 0.0f) {
        for (int i = 0; i < reg.count; i++) reg.weights[i] /= sum;
    }
}
#endif
|
||||
|
||||
// Read exactly `count` float32 values from a raw binary file.
// Any I/O failure terminates the process with a diagnostic; on success the
// caller owns the returned buffer (delete[]).
static float* read_binary_floats(const char* path, int count) {
    FILE* fp = fopen(path, "rb");
    if (!fp) {
        fprintf(stderr, "Cannot open %s\n", path);
        exit(1);
    }
    float* out = new float[count];
    size_t got = fread(out, sizeof(float), count, fp);
    fclose(fp);
    if ((int)got != count) {
        fprintf(stderr, "Short read %s: %d/%d\n", path, (int)got, count);
        exit(1);
    }
    return out;
}
|
||||
|
||||
// Read exactly `count` int32 values from a raw binary file.
// Any I/O failure terminates the process with a diagnostic; on success the
// caller owns the returned buffer (delete[]).
static int* read_binary_ints(const char* path, int count) {
    FILE* fp = fopen(path, "rb");
    if (!fp) {
        fprintf(stderr, "Cannot open %s\n", path);
        exit(1);
    }
    int* out = new int[count];
    size_t got = fread(out, sizeof(int), count, fp);
    fclose(fp);
    if ((int)got != count) {
        fprintf(stderr, "Short read %s: %d/%d\n", path, (int)got, count);
        exit(1);
    }
    return out;
}
|
||||
|
||||
// Entry point of the JIT-compiled standalone solver binary.
// Usage: <exe> <data_dir>, where data_dir holds n.bin plus the generated user
// data arrays. The result is printed as one JSON object on stdout.
int main(int argc, char** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <data_dir>\n", argv[0]);
        return 1;
    }
    const char* data_dir = argv[1];
    char path[512];

    // Read problem size. Bug fix: the original dereferenced the FILE* without
    // a NULL check and ignored the fread result, so a missing/truncated
    // n.bin crashed or left `n` uninitialized.
    snprintf(path, sizeof(path), "%s/n.bin", data_dir);
    int n;
    {
        FILE* f = fopen(path, "rb");
        if (!f) { fprintf(stderr, "Cannot open %s\n", path); return 1; }
        size_t r = fread(&n, sizeof(int), 1, f);
        fclose(f);
        if (r != 1) { fprintf(stderr, "Short read %s\n", path); return 1; }
    }

    // Read data arrays and upload to GPU (generated loader body).
    CustomProblem prob;
    prob._n = n;
    {{DATA_LOAD_BODY}}

    // Configure solver (generated field assignments).
    SolverConfig cfg;
    {{SOLVER_CONFIG}}

    // Solve, wiring in the custom-operator registry when operators exist.
#ifdef CUGENOPT_HAS_CUSTOM_OPS
    auto result = solve(prob, cfg, nullptr, 0, register_custom_operators);
#else
    auto result = solve(prob, cfg);
#endif

    // Output JSON result (single object, parsed by the Python wrapper).
    auto& best = result.best_solution;
    printf("{\"objective\":%.10g", best.objectives[0]);
    printf(",\"penalty\":%.10g", best.penalty);
    printf(",\"elapsed_ms\":%.4f", result.elapsed_ms);
    printf(",\"generations\":%d", result.generations);
    printf(",\"stop_reason\":\"%s\"",
           result.stop_reason == StopReason::TimeLimit ? "time_limit" :
           result.stop_reason == StopReason::Stagnation ? "stagnation" : "max_gen");

    printf(",\"objectives\":[");
    for (int i = 0; i < {{NUM_OBJ}}; i++) {
        if (i > 0) printf(",");
        printf("%.10g", best.objectives[i]);
    }
    printf("]");

    // Solution is emitted row by row; each row's active length comes from
    // dim2_sizes, so ragged encodings serialize correctly.
    printf(",\"solution\":[");
    for (int r = 0; r < {{DIM1}}; r++) {
        if (r > 0) printf(",");
        printf("[");
        int len = best.dim2_sizes[r];
        for (int c = 0; c < len; c++) {
            if (c > 0) printf(",");
            printf("%d", best.data[r][c]);
        }
        printf("]");
    }
    printf("]}\n");

    prob.destroy();
    return 0;
}
|
||||
19
python/cugenopt/operator_packs/__init__.py
Normal file
19
python/cugenopt/operator_packs/__init__.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
"""
|
||||
Built-in operator packs for standard problems.
|
||||
|
||||
Each pack provides problem-aware custom operators that can significantly
|
||||
improve solution quality for specific problem types. Users load them like:
|
||||
|
||||
from cugenopt.operator_packs import tsp_ops
|
||||
result = cugenopt.solve_custom(..., custom_operators=tsp_ops)
|
||||
|
||||
Or with built-in solvers:
|
||||
|
||||
result = cugenopt.solve_tsp(dist, custom_operators=tsp_ops)
|
||||
"""
|
||||
|
||||
from cugenopt.operator_packs.tsp import tsp_ops
|
||||
from cugenopt.operator_packs.knapsack import knapsack_ops
|
||||
from cugenopt.operator_packs.graph_color import graph_color_ops
|
||||
|
||||
__all__ = ["tsp_ops", "knapsack_ops", "graph_color_ops"]
|
||||
92
python/cugenopt/operator_packs/graph_color.py
Normal file
92
python/cugenopt/operator_packs/graph_color.py
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
"""
|
||||
Graph Coloring specific custom operators.
|
||||
|
||||
These operators exploit graph structure (adjacency awareness) to
|
||||
reduce conflicts more effectively than random integer mutations.
|
||||
"""
|
||||
|
||||
from cugenopt.operators import CustomOperator
|
||||
|
||||
# Conflict-driven recoloring: pick a random node; only if it conflicts with a
# neighbor, sample up to 5 candidate colors and keep the one with the fewest
# conflicts. No-op (returns false) when the chosen node is conflict-free.
# NOTE(review): assumes the problem exposes d_adj (n*n int adjacency) and _n;
# these field names must match the data dict used with solve_custom — verify.
_GC_CONFLICT_RECOLOR = CustomOperator(
    name="gc_conflict_recolor",
    code=r"""
    int row = 0;
    int sz = sol.dim2_sizes[row];
    if (sz < 2) return false;
    int n = prob->_n;
    const int* adj = prob->d_adj;
    int node = rand_int(rng, sz);
    int my_color = sol.data[row][node];
    bool has_conflict = false;
    for (int j = 0; j < sz; j++) {
        if (j != node && adj[node * n + j] && sol.data[row][j] == my_color) {
            has_conflict = true;
            break;
        }
    }
    if (!has_conflict) return false;
    int range = val_ub - val_lb + 1;
    int best_color = my_color;
    int best_conflicts = sz;
    for (int t = 0; t < 5; t++) {
        int c = val_lb + rand_int(rng, range);
        if (c == my_color) continue;
        int conflicts = 0;
        for (int j = 0; j < sz; j++)
            if (j != node && adj[node * n + j] && sol.data[row][j] == c)
                conflicts++;
        if (conflicts < best_conflicts) {
            best_conflicts = conflicts;
            best_color = c;
        }
    }
    if (best_color == my_color) return false;
    sol.data[row][node] = best_color;
    return true;
""",
    encoding="integer",
    initial_weight=1.5,  # favored over generic integer mutations at startup
)
|
||||
|
||||
# Kempe-chain-style move: starting from a random node with color c1, BFS the
# subgraph induced by colors {c1, c2} and swap c1<->c2 along the (bounded)
# chain. Queue and visited buffers are fixed-size device-stack arrays.
# Bug fix: visited[] has 512 slots but `node`/`j` run up to sz-1; the original
# wrote visited[node] without any bound on sz, an out-of-bounds stack write
# for graphs with more than 512 nodes. Large instances now bail out early.
_GC_KEMPE_CHAIN = CustomOperator(
    name="gc_kempe_swap",
    code=r"""
    int row = 0;
    int sz = sol.dim2_sizes[row];
    if (sz < 3) return false;
    // visited[] below holds 512 entries; refuse larger graphs instead of
    // writing past the end of the array.
    if (sz > 512) return false;
    int n = prob->_n;
    const int* adj = prob->d_adj;
    int node = rand_int(rng, sz);
    int c1 = sol.data[row][node];
    int range = val_ub - val_lb + 1;
    int c2 = val_lb + rand_int(rng, range - 1);
    if (c2 >= c1) c2++;
    int queue[64];
    bool visited[512];
    for (int i = 0; i < sz && i < 512; i++) visited[i] = false;
    int head = 0, tail = 0;
    queue[tail++] = node;
    visited[node] = true;
    int chain_len = 0;
    while (head < tail && tail < 64 && chain_len < 16) {
        int v = queue[head++];
        int vc = sol.data[row][v];
        int swap_c = (vc == c1) ? c2 : c1;
        sol.data[row][v] = swap_c;
        chain_len++;
        for (int j = 0; j < sz; j++) {
            if (!visited[j] && j < 512 && adj[v * n + j]
                && (sol.data[row][j] == c1 || sol.data[row][j] == c2)
                && tail < 64) {
                visited[j] = true;
                queue[tail++] = j;
            }
        }
    }
    return chain_len > 0;
""",
    encoding="integer",
    initial_weight=1.0,
)
|
||||
|
||||
# Operators exported by this pack; list order is the registration order.
graph_color_ops = [_GC_CONFLICT_RECOLOR, _GC_KEMPE_CHAIN]
|
||||
71
python/cugenopt/operator_packs/knapsack.py
Normal file
71
python/cugenopt/operator_packs/knapsack.py
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
"""
|
||||
Knapsack-specific custom operators.
|
||||
|
||||
These operators exploit knapsack structure (value/weight ratio awareness)
|
||||
to make more informed bit-flip decisions.
|
||||
"""
|
||||
|
||||
from cugenopt.operators import CustomOperator
|
||||
|
||||
# Ratio-aware bit flip: include an unselected item when its value/weight ratio
# is good, drop a selected item when it is poor; a 0.3-probability random flip
# keeps exploration alive either way.
# NOTE(review): the fixed 0.5 ratio threshold assumes values and weights are
# on comparable scales — confirm for your instances.
_KNAPSACK_GREEDY_FLIP = CustomOperator(
    name="knapsack_greedy_flip",
    code=r"""
    int row = 0;
    int sz = sol.dim2_sizes[row];
    if (sz < 2) return false;
    const float* weights = prob->d_weights;
    const float* values = prob->d_values;
    int pos = rand_int(rng, sz);
    if (sol.data[row][pos] == 0) {
        float ratio = (weights[pos] > 0.001f)
            ? values[pos] / weights[pos] : 0.0f;
        if (ratio > 0.5f || curand_uniform(rng) < 0.3f) {
            sol.data[row][pos] = 1;
            return true;
        }
    } else {
        float ratio = (weights[pos] > 0.001f)
            ? values[pos] / weights[pos] : 1e6f;
        if (ratio < 0.5f || curand_uniform(rng) < 0.3f) {
            sol.data[row][pos] = 0;
            return true;
        }
    }
    return false;
""",
    encoding="binary",
    initial_weight=0.8,
)
|
||||
|
||||
# Ratio-guided 1-for-1 swap: sample up to 8 positions to find one selected and
# one unselected item, then swap them when the incoming item has the better
# value/weight ratio (or with 0.2 probability regardless, for exploration).
_KNAPSACK_SWAP_RATIO = CustomOperator(
    name="knapsack_swap_ratio",
    code=r"""
    int row = 0;
    int sz = sol.dim2_sizes[row];
    if (sz < 2) return false;
    const float* weights = prob->d_weights;
    const float* values = prob->d_values;
    int in_item = -1, out_item = -1;
    for (int t = 0; t < 8; t++) {
        int p = rand_int(rng, sz);
        if (sol.data[row][p] == 1 && in_item < 0) in_item = p;
        if (sol.data[row][p] == 0 && out_item < 0) out_item = p;
        if (in_item >= 0 && out_item >= 0) break;
    }
    if (in_item < 0 || out_item < 0) return false;
    float in_ratio = (weights[in_item] > 0.001f)
        ? values[in_item] / weights[in_item] : 1e6f;
    float out_ratio = (weights[out_item] > 0.001f)
        ? values[out_item] / weights[out_item] : 0.0f;
    if (out_ratio > in_ratio || curand_uniform(rng) < 0.2f) {
        sol.data[row][in_item] = 0;
        sol.data[row][out_item] = 1;
        return true;
    }
    return false;
""",
    encoding="binary",
    initial_weight=0.8,
)
|
||||
|
||||
# Operators exported by this pack; list order is the registration order.
knapsack_ops = [_KNAPSACK_GREEDY_FLIP, _KNAPSACK_SWAP_RATIO]
|
||||
108
python/cugenopt/operator_packs/tsp.py
Normal file
108
python/cugenopt/operator_packs/tsp.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
"""
|
||||
TSP-specific custom operators.
|
||||
|
||||
These operators exploit TSP structure (distance matrix, route continuity)
|
||||
to perform more effective local search than generic permutation operators.
|
||||
"""
|
||||
|
||||
from cugenopt.operators import CustomOperator
|
||||
|
||||
# Delta-evaluated 2-opt: pick two cut points, compare the two removed edges
# against the two reconnecting edges in O(1), and reverse the inner segment
# only on strict improvement (first-improvement acceptance).
_TSP_2OPT_DELTA = CustomOperator(
    name="tsp_2opt_delta",
    code=r"""
    int row = 0;
    int sz = sol.dim2_sizes[row];
    if (sz < 4) return false;
    int n = prob->_n;
    const float* dist = prob->d_dist;
    int i = rand_int(rng, sz);
    int j = rand_int(rng, sz - 2);
    if (j >= i) j += 2; else if (j == i - 1) j = (i + sz - 1) % sz;
    if (i > j) { int t = i; i = j; j = t; }
    if (j - i < 2 || j - i >= sz - 1) return false;
    int a = sol.data[row][i], b = sol.data[row][(i+1) % sz];
    int c = sol.data[row][j], d = sol.data[row][(j+1) % sz];
    float old_cost = dist[a * n + b] + dist[c * n + d];
    float new_cost = dist[a * n + c] + dist[b * n + d];
    if (new_cost >= old_cost) return false;
    ops::perm_reverse(sol.data[row], i + 1, j);
    return true;
""",
    encoding="permutation",
    initial_weight=1.5,  # strongest of the pack; favored at startup
)
|
||||
|
||||
# Delta-evaluated Or-opt: relocate a short segment (1-3 cities) to the best of
# 8 randomly sampled insertion points, applying the move only when it strictly
# lowers tour length (best_delta stays at 0.0 otherwise).
_TSP_OR_OPT_DELTA = CustomOperator(
    name="tsp_or_opt_delta",
    code=r"""
    int row = 0;
    int sz = sol.dim2_sizes[row];
    if (sz < 5) return false;
    int n = prob->_n;
    const float* dist = prob->d_dist;
    int seg_len = 1 + rand_int(rng, 3);
    if (seg_len >= sz - 1) return false;
    int from_pos = rand_int(rng, sz - seg_len);
    int prev = sol.data[row][(from_pos + sz - 1) % sz];
    int seg_start = sol.data[row][from_pos];
    int seg_end = sol.data[row][from_pos + seg_len - 1];
    int next = sol.data[row][(from_pos + seg_len) % sz];
    float remove_cost = dist[prev * n + seg_start]
        + dist[seg_end * n + next]
        - dist[prev * n + next];
    int best_to = -1;
    float best_delta = 0.0f;
    for (int t = 0; t < 8; t++) {
        int to = rand_int(rng, sz - seg_len);
        if (to >= from_pos && to < from_pos + seg_len) continue;
        int tp = sol.data[row][to];
        int tn = sol.data[row][(to + 1) % sz];
        float insert_cost = dist[tp * n + seg_start]
            + dist[seg_end * n + tn]
            - dist[tp * n + tn];
        float delta = insert_cost - remove_cost;
        if (delta < best_delta) { best_delta = delta; best_to = to; }
    }
    if (best_to < 0) return false;
    ops::perm_or_opt(sol.data[row], sz, from_pos, best_to, seg_len);
    return true;
""",
    encoding="permutation",
    initial_weight=1.0,
)
|
||||
|
||||
# Delta-evaluated single-node relocation: remove one city, try 8 random
# reinsertion points, and apply the cheapest strictly improving one.
_TSP_NODE_INSERT_DELTA = CustomOperator(
    name="tsp_node_insert_delta",
    code=r"""
    int row = 0;
    int sz = sol.dim2_sizes[row];
    if (sz < 4) return false;
    int n = prob->_n;
    const float* dist = prob->d_dist;
    int from_pos = rand_int(rng, sz);
    int node = sol.data[row][from_pos];
    int prev = sol.data[row][(from_pos + sz - 1) % sz];
    int next = sol.data[row][(from_pos + 1) % sz];
    float remove_save = dist[prev * n + node] + dist[node * n + next]
        - dist[prev * n + next];
    int best_to = -1;
    float best_delta = 0.0f;
    for (int t = 0; t < 8; t++) {
        int to = rand_int(rng, sz - 1);
        if (to >= from_pos) to++;
        int tp = sol.data[row][to];
        int tn = sol.data[row][(to + 1) % sz];
        float insert_cost = dist[tp * n + node] + dist[node * n + tn]
            - dist[tp * n + tn];
        float delta = insert_cost - remove_save;
        if (delta < best_delta) { best_delta = delta; best_to = to; }
    }
    if (best_to < 0) return false;
    ops::perm_insert(sol.data[row], from_pos, best_to, sz);
    return true;
""",
    encoding="permutation",
    initial_weight=1.0,
)
|
||||
|
||||
# Operators exported by this pack; list order is the registration order.
tsp_ops = [_TSP_2OPT_DELTA, _TSP_OR_OPT_DELTA, _TSP_NODE_INSERT_DELTA]
|
||||
136
python/cugenopt/operators.py
Normal file
136
python/cugenopt/operators.py
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
"""
|
||||
Custom operator registration for cuGenOpt.
|
||||
|
||||
Allows users to inject problem-specific CUDA search operators into the
|
||||
JIT-compiled solver. Custom operators participate in AOS weight competition
|
||||
alongside built-in operators.
|
||||
|
||||
Two entry points (same underlying mechanism):
|
||||
1. Python users: pass custom_operators=[CustomOperator(...)] to solve_custom()
|
||||
2. CUDA developers: write the same code body in .cuh and call register_custom_operators()
|
||||
|
||||
Operator contract:
|
||||
The code body has access to:
|
||||
- sol: Solution reference (sol.data[row][col], sol.dim2_sizes[row])
|
||||
- rng: curandState* for random number generation
|
||||
- dim1, encoding, val_lb, val_ub: problem parameters
|
||||
- prob: const CustomProblem* (access data via prob->d_dist, prob->_n, etc.)
|
||||
Must return bool (true if solution was modified, false for no-op).
|
||||
Available primitives: ops::perm_swap, ops::perm_reverse, ops::perm_insert,
|
||||
ops::bin_flip, ops::rand_int, etc.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
|
||||
from cugenopt.validation import validate_cuda_snippet, CuGenOptValidationError
|
||||
|
||||
|
||||
@dataclass
class CustomOperator:
    """A user-defined search operator injected into the JIT-compiled solver.

    Attributes:
        name: Human-readable name (used in logs and AOS stats).
        code: CUDA code body that modifies ``sol`` in-place. Inside the
            snippet the following names are available:
            ``sol`` (sol.data[row][col], sol.dim2_sizes[row]),
            ``rng`` (curandState*), ``dim1`` (number of active rows) and
            ``prob`` (const CustomProblem* — e.g. prob->d_dist, prob->_n).
            The snippet must return a bool: true iff ``sol`` was modified.
        encoding: Target encoding ("permutation", "binary", "integer" or
            "any"); the operator is only active when the problem encoding
            matches.
        initial_weight: Starting AOS weight (relative; normalized later).
        weight_cap: Maximum AOS weight after normalization (0 = global cap).
    """
    name: str
    code: str
    encoding: str = "any"
    initial_weight: float = 0.5
    weight_cap: float = 0.0

    def __post_init__(self):
        # Validate eagerly so a malformed operator fails at construction
        # time, long before nvcc ever sees the generated source.
        if not (self.name and self.name.strip()):
            raise CuGenOptValidationError("CustomOperator name cannot be empty")
        self.code = validate_cuda_snippet(self.code, f"operator '{self.name}'")
        valid_enc = {"permutation", "binary", "integer", "any"}
        if self.encoding.lower() not in valid_enc:
            raise CuGenOptValidationError(
                f"CustomOperator encoding must be one of {valid_enc}, "
                f"got '{self.encoding}'"
            )
        if self.initial_weight <= 0:
            raise CuGenOptValidationError(
                f"CustomOperator initial_weight must be > 0, got {self.initial_weight}"
            )
|
||||
|
||||
|
||||
# SeqID range for custom operators: 100..123 (MAX_SEQ=24 custom slots).
# NOTE(review): presumably built-in operator ids stay below 100 so the ranges
# never collide — verify against the CUDA-side SeqRegistry.
CUSTOM_SEQ_ID_BASE = 100
|
||||
|
||||
|
||||
def generate_custom_operator_cuda(
    operators: List[CustomOperator],
    problem_encoding: str,
) -> tuple:
    """Generate CUDA code to inject custom operators into execute_custom_op.

    Args:
        operators: User-supplied operators (may be empty or None-ish).
        problem_encoding: The problem's encoding; operators that neither
            match it nor declare "any" are dropped.

    Returns:
        (switch_block, registry_block, filtered_operators):
        - switch_block: CUDA switch cases for {{CUSTOM_OP_SWITCH}}
        - registry_block: add() calls for {{CUSTOM_OP_REGISTRY}}
        - filtered_operators: list of operators that matched the encoding
        Empty strings and an empty list if no operators match.
    """
    # Bug fix: the empty paths previously returned "" as the third element,
    # although callers are documented to receive a list here.
    if not operators:
        return "", "", []

    filtered = _filter_by_encoding(operators, problem_encoding)
    if not filtered:
        return "", "", []

    switch_cases = []
    registry_adds = []
    for i, op in enumerate(filtered):
        # Custom operators are numbered contiguously from CUSTOM_SEQ_ID_BASE;
        # the same ordering drives both the switch and the registry.
        seq_id = CUSTOM_SEQ_ID_BASE + i
        switch_cases.append(_generate_switch_case(seq_id, op))
        registry_adds.append(
            f" add({seq_id}, {op.initial_weight}f, {op.weight_cap}f); "
            f"// custom: {op.name}"
        )

    switch_block = "\n".join(switch_cases)
    registry_block = "\n".join(registry_adds)

    return switch_block, registry_block, filtered
|
||||
|
||||
|
||||
def _filter_by_encoding(
    operators: List[CustomOperator],
    problem_encoding: str,
) -> List[CustomOperator]:
    """Return the operators declaring "any" or exactly the problem encoding."""
    target = problem_encoding.lower()
    kept = []
    for op in operators:
        own = op.encoding.lower()
        if own == "any" or own == target:
            kept.append(op)
    return kept
|
||||
|
||||
|
||||
def _generate_switch_case(seq_id: int, op: CustomOperator) -> str:
    """Generate a single switch case for execute_sequence.

    The case body is the operator's (already validated) CUDA snippet,
    indented 12 spaces; the snippet itself is required to `return` a bool,
    so no `break` is emitted.
    """
    return f"""\
        case {seq_id}: {{ // custom: {op.name}
{_indent(op.code, 12)}
        }}"""
|
||||
|
||||
|
||||
def _indent(text: str, spaces: int) -> str:
|
||||
"""Indent each line of text."""
|
||||
prefix = " " * spaces
|
||||
lines = text.strip().split("\n")
|
||||
return "\n".join(prefix + line for line in lines)
|
||||
261
python/cugenopt/validation.py
Normal file
261
python/cugenopt/validation.py
Normal file
|
|
@ -0,0 +1,261 @@
|
|||
"""
|
||||
Input validation and friendly error translation for cuGenOpt.
|
||||
|
||||
Two responsibilities:
|
||||
1. Validate numpy arrays before JIT compilation (dtype, shape, NaN/Inf, contiguity)
|
||||
2. Translate nvcc compilation errors into actionable Python messages
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, Optional, Sequence
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class CuGenOptValidationError(ValueError):
    """Raised when input data fails validation (dtype, shape, NaN/Inf, ...)."""
    # The redundant `pass` after the docstring was removed; the docstring
    # alone is a valid class body.
|
||||
|
||||
|
||||
class CuGenOptCompileError(RuntimeError):
    """Raised when nvcc compilation fails, carrying a friendly summary.

    Exposes ``raw_stderr`` (untouched nvcc output), ``source_path`` (where
    the generated source was saved) and ``friendly`` (translated summary).
    """

    def __init__(self, raw_stderr: str, source_path: str):
        self.raw_stderr = raw_stderr
        self.source_path = source_path
        self.friendly = _translate_nvcc_error(raw_stderr)
        message = (
            f"{self.friendly}\n\n"
            f"[raw nvcc output]\n{_truncate(raw_stderr, 1200)}\n\n"
            f"Source saved at: {source_path}"
        )
        super().__init__(message)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Array validation
|
||||
# ============================================================
|
||||
|
||||
def validate_array(
    arr: np.ndarray,
    name: str,
    *,
    expected_dtype: Optional[np.dtype] = None,
    expected_ndim: Optional[int] = None,
    expected_shape: Optional[tuple] = None,
    min_size: int = 1,
    allow_nan: bool = False,
    allow_inf: bool = False,
) -> np.ndarray:
    """Validate a single numpy array and return a contiguous copy if needed.

    Args:
        arr: Array to validate.
        name: Human-readable name used in error messages.
        expected_dtype: If given, the array is converted (contiguously) to it.
        expected_ndim: If given, required number of dimensions.
        expected_shape: If given, per-axis required sizes; ``None`` entries
            act as wildcards.
        min_size: Minimum total number of elements.
        allow_nan: Permit NaN values in floating-point arrays.
        allow_inf: Permit Inf values in floating-point arrays.

    Returns:
        A C-contiguous array (possibly dtype-converted / copied).

    Raises:
        CuGenOptValidationError: With a clear message on any failure.
    """
    if not isinstance(arr, np.ndarray):
        raise CuGenOptValidationError(
            f"'{name}' must be a numpy array, got {type(arr).__name__}"
        )

    if expected_ndim is not None and arr.ndim != expected_ndim:
        raise CuGenOptValidationError(
            f"'{name}' must be {expected_ndim}D, got {arr.ndim}D with shape {arr.shape}"
        )

    if expected_shape is not None:
        # Bug fix: zip() silently truncated when the rank differed, so a
        # 3-axis expected_shape passed against a 2D array. Reject explicitly.
        if len(expected_shape) != arr.ndim:
            raise CuGenOptValidationError(
                f"'{name}' has {arr.ndim} dimension(s) but expected_shape "
                f"{expected_shape} specifies {len(expected_shape)}"
            )
        for i, (actual, expect) in enumerate(zip(arr.shape, expected_shape)):
            if expect is not None and actual != expect:
                raise CuGenOptValidationError(
                    f"'{name}' shape mismatch at axis {i}: "
                    f"expected {expected_shape}, got {arr.shape}"
                )

    if arr.size < min_size:
        raise CuGenOptValidationError(
            f"'{name}' is too small: size={arr.size}, minimum={min_size}"
        )

    if expected_dtype is not None:
        arr = np.ascontiguousarray(arr, dtype=expected_dtype)

    if np.issubdtype(arr.dtype, np.floating):
        # Compute each mask once (used for both the test and the count),
        # instead of scanning the array twice per check.
        if not allow_nan:
            nan_mask = np.isnan(arr)
            if nan_mask.any():
                raise CuGenOptValidationError(
                    f"'{name}' contains {int(nan_mask.sum())} NaN value(s). "
                    f"Clean your data or set allow_nan=True."
                )
        if not allow_inf:
            inf_mask = np.isinf(arr)
            if inf_mask.any():
                raise CuGenOptValidationError(
                    f"'{name}' contains {int(inf_mask.sum())} Inf value(s). "
                    f"Clean your data or set allow_inf=True."
                )

    # No-op when arr is already contiguous; otherwise returns a copy.
    return np.ascontiguousarray(arr)
|
||||
|
||||
|
||||
def validate_square_matrix(arr: np.ndarray, name: str, dtype=np.float32) -> np.ndarray:
    """Validate that *arr* is a square 2D matrix; return a contiguous copy."""
    checked = validate_array(arr, name, expected_ndim=2, expected_dtype=dtype)
    rows, cols = checked.shape
    if rows != cols:
        raise CuGenOptValidationError(
            f"'{name}' must be square, got shape {checked.shape}"
        )
    return checked
|
||||
|
||||
|
||||
def validate_1d(arr: np.ndarray, name: str, *, length: Optional[int] = None,
                dtype=np.float32) -> np.ndarray:
    """Validate a 1D array, optionally enforcing an exact length."""
    checked = validate_array(arr, name, expected_ndim=1, expected_dtype=dtype)
    if length is not None and checked.shape[0] != length:
        raise CuGenOptValidationError(
            f"'{name}' length mismatch: expected {length}, got {checked.shape[0]}"
        )
    return checked
|
||||
|
||||
|
||||
def validate_data_dict(data: Dict[str, np.ndarray], dtype_tag: str) -> Dict[str, np.ndarray]:
    """Validate a dict of name -> array for compile_and_solve data/int_data."""
    # "float" maps to float32 arrays; anything else is treated as int32.
    target_dtype = np.float32 if dtype_tag == "float" else np.int32
    validated = {}
    for name, arr in data.items():
        if not isinstance(arr, np.ndarray):
            raise CuGenOptValidationError(
                f"data['{name}'] must be a numpy array, got {type(arr).__name__}"
            )
        validated[name] = validate_array(
            arr, f"data['{name}']", expected_dtype=target_dtype
        )
    return validated
|
||||
|
||||
|
||||
def validate_encoding(encoding: str) -> str:
    """Normalize an encoding string and reject anything unknown."""
    normalized = encoding.lower().strip()
    if normalized in ("permutation", "binary", "integer"):
        return normalized
    raise CuGenOptValidationError(
        f"Unknown encoding '{encoding}'. Must be one of: binary, integer, permutation"
    )
|
||||
|
||||
|
||||
def validate_positive_int(value, name: str, *, allow_zero: bool = False) -> int:
    """Coerce *value* to int and enforce positivity.

    Args:
        value: Anything ``int()`` accepts (int, numeric string, float, ...).
        name: Name used in error messages.
        allow_zero: When True the minimum is 0, otherwise 1.

    Returns:
        The coerced integer.

    Raises:
        CuGenOptValidationError: If coercion fails or the bound is violated.
    """
    try:
        v = int(value)
    except (TypeError, ValueError):
        # `from None`: the int() traceback adds nothing our message lacks.
        raise CuGenOptValidationError(
            f"'{name}' must be an integer, got {type(value).__name__}: {value!r}"
        ) from None
    # Single bound check covers both modes (messages are unchanged).
    lower = 0 if allow_zero else 1
    if v < lower:
        raise CuGenOptValidationError(f"'{name}' must be >= {lower}, got {v}")
    return v
|
||||
|
||||
|
||||
def validate_cuda_snippet(code: str, name: str) -> str:
    """Strip and sanity-check a user-supplied CUDA code snippet."""
    snippet = code.strip()
    if not snippet:
        raise CuGenOptValidationError(f"'{name}' CUDA code snippet is empty")

    # Best-effort blocklist: refuse snippets containing obvious host-side
    # escape hatches (this is a sanity check, not a security boundary).
    for banned in ("system(", "popen(", "exec(", "fork(", "unlink("):
        if banned in snippet:
            raise CuGenOptValidationError(
                f"'{name}' contains potentially dangerous call: '{banned}'"
            )
    return snippet
|
||||
|
||||
|
||||
# ============================================================
|
||||
# nvcc error translation
|
||||
# ============================================================
|
||||
|
||||
# Table of (compiled nvcc-stderr regex, formatter) pairs. The first match of
# each pattern contributes one friendly bullet to the translated message;
# list order is the order the bullets appear in.
_NVCC_PATTERNS = [
    (
        re.compile(r"error:\s*identifier\s+\"(\w+)\"\s+is\s+undefined", re.I),
        lambda m: f"Undefined identifier '{m.group(1)}'. "
                  f"Check that all data field names in compute_obj/compute_penalty "
                  f"match the keys in your data dict."
    ),
    (
        re.compile(r"error:\s*expected\s+a\s+\"([^\"]+)\"", re.I),
        lambda m: f"Syntax error: expected '{m.group(1)}'. "
                  f"Check for missing semicolons, braces, or parentheses."
    ),
    (
        re.compile(r"error:\s*no\s+suitable\s+conversion\s+function\s+from\s+\"([^\"]+)\"\s+to\s+\"([^\"]+)\"", re.I),
        lambda m: f"Type mismatch: cannot convert '{m.group(1)}' to '{m.group(2)}'. "
                  f"Ensure you're using the correct types (float/int)."
    ),
    (
        # Formatter ignores the match object; the message is generic.
        re.compile(r"error:\s*too\s+(?:few|many)\s+arguments", re.I),
        lambda m: f"Wrong number of arguments in a function call. "
                  f"Check the function signature."
    ),
    (
        re.compile(r"error:\s*class\s+\"(\w+)\"\s+has\s+no\s+member\s+\"(\w+)\"", re.I),
        lambda m: f"'{m.group(1)}' has no member '{m.group(2)}'. "
                  f"Available solution members: data[row][col], dim2_sizes[row]."
    ),
    (
        re.compile(r"error:\s*expression\s+must\s+have\s+a\s+constant\s+value", re.I),
        lambda m: f"Non-constant expression where a constant is required. "
                  f"CUDA device code cannot use dynamic allocation; "
                  f"use fixed-size arrays."
    ),
    (
        re.compile(r"ptxas\s+error\s*:\s*Entry\s+function.*uses\s+too\s+much\s+shared\s+data", re.I),
        lambda m: f"Shared memory overflow. Your problem data is too large for GPU "
                  f"shared memory. Try reducing problem size or data arrays."
    ),
    (
        re.compile(r"nvcc\s+fatal\s*:\s*Unsupported\s+gpu\s+architecture\s+'compute_(\d+)'", re.I),
        lambda m: f"GPU architecture sm_{m.group(1)} is not supported by your nvcc. "
                  f"Try specifying cuda_arch='sm_75' or update your CUDA toolkit."
    ),
    (
        re.compile(r"error:\s*return\s+value\s+type\s+does\s+not\s+match", re.I),
        lambda m: f"Return type mismatch. compute_obj must return float. "
                  f"Make sure all code paths return a float value."
    ),
]
|
||||
|
||||
|
||||
def _translate_nvcc_error(stderr: str) -> str:
    """Turn raw nvcc stderr into the friendliest summary available."""
    # Pass 1: curated explanations for known error patterns.
    friendly = []
    for pattern, formatter in _NVCC_PATTERNS:
        hit = pattern.search(stderr)
        if hit is not None:
            friendly.append(formatter(hit))

    if friendly:
        bullets = "\n".join(f" - {m}" for m in friendly)
        return "nvcc compilation failed. Likely cause(s):\n" + bullets

    # Pass 2: no known pattern matched — surface the first raw error line.
    raw_errors = [
        ln.strip() for ln in stderr.split("\n")
        if "error" in ln.lower() and not ln.strip().startswith("#")
    ]
    if raw_errors:
        return (
            f"nvcc compilation failed:\n {raw_errors[0]}\n\n"
            f"Tip: Check your CUDA code snippets for syntax errors. "
            f"Common issues: missing semicolons, undefined variables, "
            f"wrong data field names."
        )

    # Pass 3: nothing recognizable at all.
    return (
        "nvcc compilation failed with an unknown error.\n"
        "Check the raw output below for details."
    )
|
||||
|
||||
|
||||
def _truncate(text: str, max_len: int) -> str:
|
||||
if len(text) <= max_len:
|
||||
return text
|
||||
return text[:max_len] + f"\n... ({len(text) - max_len} chars truncated)"
|
||||
Loading…
Add table
Add a link
Reference in a new issue