Initial commit: cuGenOpt GPU optimization solver

This commit is contained in:
L-yang-yang 2026-03-20 00:33:45 +08:00
commit fc5a0ff4af
117 changed files with 25545 additions and 0 deletions

View file

@ -0,0 +1,54 @@
"""
cuGenOpt GPU-accelerated general-purpose metaheuristic solver
All problems (built-in and custom) use the same JIT compilation pipeline.
First call to each problem type takes ~8s to compile; subsequent calls are cached.
Usage:
import numpy as np
import cugenopt
dist = np.random.rand(20, 20).astype(np.float32)
dist = (dist + dist.T) / 2
np.fill_diagonal(dist, 0)
result = cugenopt.solve_tsp(dist, time_limit=5.0, seed=42)
print(f"Best distance: {result['objective']:.2f}")
print(f"Route: {result['solution'][0]}")
"""
from cugenopt.builtins import (
solve_tsp,
solve_knapsack,
solve_qap,
solve_assignment,
solve_vrp,
solve_vrptw,
solve_graph_color,
solve_bin_packing,
solve_load_balance,
gpu_info,
)
from cugenopt.jit import compile_and_solve as solve_custom, clear_cache
from cugenopt.validation import CuGenOptValidationError, CuGenOptCompileError
from cugenopt.operators import CustomOperator
__version__ = "0.2.0"
__all__ = [
"solve_tsp",
"solve_knapsack",
"solve_qap",
"solve_assignment",
"solve_vrp",
"solve_vrptw",
"solve_graph_color",
"solve_bin_packing",
"solve_load_balance",
"gpu_info",
"solve_custom",
"clear_cache",
"CuGenOptValidationError",
"CuGenOptCompileError",
"CustomOperator",
]

486
python/cugenopt/builtins.py Normal file
View file

@ -0,0 +1,486 @@
"""
Built-in problem solvers thin wrappers around compile_and_solve().
Each solve_xxx() function provides pre-written CUDA code snippets for
standard combinatorial optimization problems. Under the hood they all
call the same JIT compilation pipeline.
"""
from typing import Dict, Any, Optional
import numpy as np
from cugenopt.jit import compile_and_solve
from cugenopt.validation import (
CuGenOptValidationError,
validate_square_matrix,
validate_1d,
validate_positive_int,
)
def _solver_kwargs(kw: dict) -> dict:
"""Extract solver config kwargs from user-provided dict."""
keys = ["pop_size", "max_gen", "time_limit", "seed", "use_aos",
"sa_temp_init", "verbose", "cuda_arch", "framework_root",
"custom_operators"]
return {k: kw[k] for k in keys if k in kw}
# ============================================================
# TSP
# ============================================================
# CUDA objective snippet: thread 0 sums the closed-tour length over the
# single permutation row. `d_dist` is the flattened NxN matrix and `_n`
# its row stride — both names are injected by the JIT pipeline.
_TSP_OBJ = """
if (idx != 0) return 0.0f;
float total = 0.0f;
const int* route = sol.data[0];
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++)
total += d_dist[route[i] * _n + route[(i+1) % size]];
return total;
"""
def solve_tsp(dist_matrix: np.ndarray, **kw) -> Dict[str, Any]:
    """Solve TSP. Pass distance matrix as NxN numpy float32 array.
    Args:
        dist_matrix: NxN distance matrix (float32).
        **kw: Solver params pop_size, max_gen, time_limit, seed, use_aos, verbose, ...
    Returns:
        Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.
    """
    dist = validate_square_matrix(dist_matrix, "dist_matrix")
    n = dist.shape[0]
    if n < 3:
        raise CuGenOptValidationError("TSP requires at least 3 cities")
    if n > 512:
        raise CuGenOptValidationError(
            f"TSP size {n} > 512 not supported yet. "
            f"Use solve_custom() for larger instances."
        )
    # Round the row width up to the smallest supported bucket.
    for bucket in (64, 256, 512):
        if n <= bucket:
            dim2 = bucket
            break
    return compile_and_solve(
        compute_obj=_TSP_OBJ,
        data={"d_dist": dist},
        encoding="permutation",
        dim2=dim2,
        n=n,
        **_solver_kwargs(kw),
    )
# ============================================================
# Knapsack
# ============================================================
# CUDA objective snippet: thread 0 sums the values of selected items in
# the single binary row. The capacity constraint is enforced separately
# by the penalty snippet generated inside solve_knapsack().
_KNAPSACK_OBJ = """
if (idx != 0) return 0.0f;
float tv = 0.0f;
const int* sel = sol.data[0];
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++)
if (sel[i]) tv += d_values[i];
return tv;
"""
def solve_knapsack(weights: np.ndarray, values: np.ndarray,
                   capacity: float, **kw) -> Dict[str, Any]:
    """Solve 0-1 Knapsack.

    Args:
        weights: 1D array of item weights (float32).
        values: 1D array of item values (float32, same length as weights).
        capacity: Knapsack capacity (must be > 0).
        **kw: Solver params forwarded to compile_and_solve().

    Returns:
        Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.

    Raises:
        CuGenOptValidationError: On malformed arrays or non-positive capacity.
    """
    w = validate_1d(weights, "weights")
    v = validate_1d(values, "values", length=len(w))
    n = len(w)
    if capacity <= 0:
        raise CuGenOptValidationError(f"capacity must be > 0, got {capacity}")
    # Coerce to float so the embedded CUDA literal is always valid:
    # an int capacity (e.g. 50) would otherwise render as "50f", which
    # the CUDA compiler rejects; float(50) renders as "50.0f".
    cap = float(capacity)
    penalty_code = f"""
float tw = 0.0f;
const int* sel = sol.data[0];
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++)
if (sel[i]) tw += d_weights[i];
float over = tw - {cap}f;
return (over > 0.0f) ? over : 0.0f;
"""
    return compile_and_solve(
        compute_obj=_KNAPSACK_OBJ, compute_penalty=penalty_code,
        data={"d_weights": w, "d_values": v},
        encoding="binary", dim2=max(32, n), n=n,
        objectives=[("maximize", 1.0)],
        **_solver_kwargs(kw),
    )
# ============================================================
# QAP
# ============================================================
# CUDA objective snippet: thread 0 computes the quadratic assignment cost
# sum_{i,j} flow[i][j] * dist[perm[i]][perm[j]] over the single
# permutation row; `_n` is the matrix row stride injected by the pipeline.
_QAP_OBJ = """
if (idx != 0) return 0.0f;
float cost = 0.0f;
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++)
for (int j = 0; j < size; j++)
cost += d_flow[i * _n + j] * d_dist[sol.data[0][i] * _n + sol.data[0][j]];
return cost;
"""
def solve_qap(flow_matrix: np.ndarray, dist_matrix: np.ndarray,
              **kw) -> Dict[str, Any]:
    """Solve Quadratic Assignment Problem.

    Args:
        flow_matrix: NxN flow matrix (float32).
        dist_matrix: NxN distance matrix (float32), same size as flow_matrix.
    """
    flow = validate_square_matrix(flow_matrix, "flow_matrix")
    dist = validate_square_matrix(dist_matrix, "dist_matrix")
    n = flow.shape[0]
    if n != dist.shape[0]:
        raise CuGenOptValidationError(
            f"flow_matrix ({n}x{n}) and dist_matrix ({dist.shape[0]}x{dist.shape[0]}) "
            f"must have the same dimensions"
        )
    return compile_and_solve(
        compute_obj=_QAP_OBJ,
        data={"d_flow": flow, "d_dist": dist},
        encoding="permutation",
        dim2=32,
        n=n,
        **_solver_kwargs(kw),
    )
# ============================================================
# Assignment
# ============================================================
# CUDA objective snippet: thread 0 sums cost[i][assign[i]] over the single
# permutation row, i.e. the total cost of the agent->task assignment.
_ASSIGN_OBJ = """
if (idx != 0) return 0.0f;
float total = 0.0f;
const int* assign = sol.data[0];
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++)
total += d_cost[i * _n + assign[i]];
return total;
"""
def solve_assignment(cost_matrix: np.ndarray, **kw) -> Dict[str, Any]:
    """Solve the (linear) Assignment Problem.

    Args:
        cost_matrix: NxN cost matrix (float32).
    """
    cost = validate_square_matrix(cost_matrix, "cost_matrix")
    size = cost.shape[0]
    solver_cfg = _solver_kwargs(kw)
    return compile_and_solve(
        compute_obj=_ASSIGN_OBJ,
        data={"d_cost": cost},
        encoding="permutation",
        dim2=16,
        n=size,
        **solver_cfg,
    )
# ============================================================
# VRP (CVRP)
# ============================================================
def solve_vrp(dist_matrix: np.ndarray, demand: np.ndarray,
              capacity: float, num_vehicles: int, **kw) -> Dict[str, Any]:
    """Solve Capacitated VRP.

    Args:
        dist_matrix: (N+1)x(N+1) distance matrix including depot at index 0.
        demand: 1D array of customer demands (length N, excluding depot).
        capacity: Vehicle capacity (must be > 0).
        num_vehicles: Number of vehicles (rows of the partition encoding).
        **kw: Solver params; also accepts ``max_vehicles`` (default
            num_vehicles), a soft cap on non-empty routes enforced via penalty.

    Returns:
        Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.

    Raises:
        CuGenOptValidationError: On malformed inputs or non-positive capacity.
    """
    dist = validate_square_matrix(dist_matrix, "dist_matrix")
    n_nodes = dist.shape[0]
    n = n_nodes - 1  # customers only; node 0 is the depot
    dem = validate_1d(demand, "demand", length=n)
    num_vehicles = validate_positive_int(num_vehicles, "num_vehicles")
    if capacity <= 0:
        raise CuGenOptValidationError(f"capacity must be > 0, got {capacity}")
    # Coerce to float so the embedded CUDA literal is always valid: an int
    # capacity would render as e.g. "50f" (rejected by the CUDA compiler)
    # instead of "50.0f".
    cap = float(capacity)
    stride = n_nodes
    # Pop so it is not forwarded as an unknown solver kwarg.
    max_vehicles = kw.pop("max_vehicles", num_vehicles)
    # Objective: total travel distance of all non-empty routes; genes are
    # 0-based customer ids, hence the "+ 1" to map to matrix node indices.
    obj_code = f"""
if (idx != 0) return 0.0f;
float total = 0.0f;
for (int r = 0; r < {num_vehicles}; r++) {{
int size = sol.dim2_sizes[r];
if (size == 0) continue;
float dist = 0.0f;
int prev = 0;
for (int j = 0; j < size; j++) {{
int node = sol.data[r][j] + 1;
dist += d_dist[prev * {stride} + node];
prev = node;
}}
dist += d_dist[prev * {stride} + 0];
total += dist;
}}
return total;
"""
    # Penalty: capacity overload per route (x100) and excess active routes
    # beyond max_vehicles (x1000).
    penalty_code = f"""
float penalty = 0.0f;
int active = 0;
for (int r = 0; r < {num_vehicles}; r++) {{
int size = sol.dim2_sizes[r];
if (size == 0) continue;
active++;
float load = 0.0f;
for (int j = 0; j < size; j++)
load += d_demand[sol.data[r][j]];
if (load > {cap}f)
penalty += (load - {cap}f) * 100.0f;
}}
if (active > {max_vehicles})
penalty += (float)(active - {max_vehicles}) * 1000.0f;
return penalty;
"""
    return compile_and_solve(
        compute_obj=obj_code, compute_penalty=penalty_code,
        data={"d_dist": dist, "d_demand": dem},
        encoding="permutation", dim1=num_vehicles, dim2=64, n=n,
        row_mode="partition", total_elements=n, cross_row_prob=0.3,
        **_solver_kwargs(kw),
    )
# ============================================================
# VRPTW
# ============================================================
def solve_vrptw(dist_matrix: np.ndarray, demand: np.ndarray,
                earliest: np.ndarray, latest: np.ndarray,
                service: np.ndarray, capacity: float,
                num_vehicles: int, **kw) -> Dict[str, Any]:
    """Solve VRP with Time Windows.

    Args:
        dist_matrix: (N+1)x(N+1) distance matrix including depot at index 0.
        demand: Customer demands (length N, excluding depot).
        earliest, latest, service: Time window arrays (length N+1, including depot).
        capacity: Vehicle capacity (must be > 0).
        num_vehicles: Number of vehicles (rows of the partition encoding).
        **kw: Solver params; also accepts ``max_vehicles`` (default
            num_vehicles), a soft cap on non-empty routes enforced via penalty.

    Returns:
        Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.

    Raises:
        CuGenOptValidationError: On malformed inputs or non-positive capacity.
    """
    dist = validate_square_matrix(dist_matrix, "dist_matrix")
    n_nodes = dist.shape[0]
    n = n_nodes - 1  # customers only; node 0 is the depot
    dem = validate_1d(demand, "demand", length=n)
    ear = validate_1d(earliest, "earliest", length=n_nodes)
    lat = validate_1d(latest, "latest", length=n_nodes)
    svc = validate_1d(service, "service", length=n_nodes)
    num_vehicles = validate_positive_int(num_vehicles, "num_vehicles")
    if capacity <= 0:
        raise CuGenOptValidationError(f"capacity must be > 0, got {capacity}")
    # Coerce to float so the embedded CUDA literal is always valid: an int
    # capacity would render as e.g. "50f" (rejected by the CUDA compiler)
    # instead of "50.0f".
    cap = float(capacity)
    stride = n_nodes
    # Pop so it is not forwarded as an unknown solver kwarg.
    max_vehicles = kw.pop("max_vehicles", num_vehicles)
    # Objective: total travel distance (same as CVRP); time windows are
    # handled entirely in the penalty term below.
    obj_code = f"""
if (idx != 0) return 0.0f;
float total = 0.0f;
for (int r = 0; r < {num_vehicles}; r++) {{
int size = sol.dim2_sizes[r];
if (size == 0) continue;
float dist = 0.0f;
int prev = 0;
for (int j = 0; j < size; j++) {{
int node = sol.data[r][j] + 1;
dist += d_dist[prev * {stride} + node];
prev = node;
}}
dist += d_dist[prev * {stride} + 0];
total += dist;
}}
return total;
"""
    # Penalty: capacity overload (x100), late arrivals at customers and at
    # the depot return (x50), excess active routes (x1000). Arrival before
    # `earliest` waits; travel time is taken equal to distance.
    penalty_code = f"""
float penalty = 0.0f;
int active = 0;
for (int r = 0; r < {num_vehicles}; r++) {{
int size = sol.dim2_sizes[r];
if (size == 0) continue;
active++;
float load = 0.0f;
for (int j = 0; j < size; j++)
load += d_demand[sol.data[r][j]];
if (load > {cap}f)
penalty += (load - {cap}f) * 100.0f;
float time = 0.0f;
int prev = 0;
for (int j = 0; j < size; j++) {{
int node = sol.data[r][j] + 1;
time += d_dist[prev * {stride} + node];
if (time < d_earliest[node]) time = d_earliest[node];
if (time > d_latest[node])
penalty += (time - d_latest[node]) * 50.0f;
time += d_service[node];
prev = node;
}}
float ret = time + d_dist[prev * {stride} + 0];
if (ret > d_latest[0])
penalty += (ret - d_latest[0]) * 50.0f;
}}
if (active > {max_vehicles})
penalty += (float)(active - {max_vehicles}) * 1000.0f;
return penalty;
"""
    return compile_and_solve(
        compute_obj=obj_code, compute_penalty=penalty_code,
        data={"d_dist": dist, "d_demand": dem,
              "d_earliest": ear, "d_latest": lat, "d_service": svc},
        encoding="permutation", dim1=num_vehicles, dim2=64, n=n,
        row_mode="partition", total_elements=n, cross_row_prob=0.3,
        **_solver_kwargs(kw),
    )
# ============================================================
# Graph Coloring
# ============================================================
# CUDA objective snippet: thread 0 counts edges whose endpoints share the
# same color in the single integer row; 0 conflicts = proper coloring.
_GRAPHCOLOR_OBJ = """
if (idx != 0) return 0.0f;
int conflicts = 0;
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++)
for (int j = i + 1; j < size; j++)
if (d_adj[i * _n + j] && sol.data[0][i] == sol.data[0][j])
conflicts++;
return (float)conflicts;
"""
def solve_graph_color(adj_matrix: np.ndarray, num_colors: int,
                      **kw) -> Dict[str, Any]:
    """Solve Graph Coloring by minimizing conflicting edges.

    Args:
        adj_matrix: NxN adjacency matrix (int32, 1=edge, 0=no edge).
        num_colors: Number of colors available.
    """
    adj = validate_square_matrix(adj_matrix, "adj_matrix", dtype=np.int32)
    n = adj.shape[0]
    num_colors = validate_positive_int(num_colors, "num_colors")
    config = dict(
        compute_obj=_GRAPHCOLOR_OBJ,
        int_data={"d_adj": adj},
        encoding="integer",
        dim2=64,
        n=n,
        value_lower=0,
        value_upper=num_colors - 1,
    )
    config.update(_solver_kwargs(kw))
    return compile_and_solve(**config)
# ============================================================
# Bin Packing
# ============================================================
def solve_bin_packing(item_weights: np.ndarray, max_bins: int,
                      bin_capacity: float, **kw) -> Dict[str, Any]:
    """Solve Bin Packing (minimize the number of bins used).

    Args:
        item_weights: 1D array of item weights (float32).
        max_bins: Maximum number of bins.
        bin_capacity: Capacity of each bin (must be > 0).
        **kw: Solver params forwarded to compile_and_solve().

    Returns:
        Dict with objective, penalty, solution, elapsed_ms, generations, stop_reason.

    Raises:
        CuGenOptValidationError: On malformed inputs or non-positive capacity.
    """
    w = validate_1d(item_weights, "item_weights")
    n = len(w)
    max_bins = validate_positive_int(max_bins, "max_bins")
    if bin_capacity <= 0:
        raise CuGenOptValidationError(f"bin_capacity must be > 0, got {bin_capacity}")
    # Coerce to float so the embedded CUDA literal is always valid: an int
    # bin_capacity would render as e.g. "10f" (rejected by the CUDA
    # compiler) instead of "10.0f".
    cap = float(bin_capacity)
    # Objective: number of non-empty bins (gene i = bin id of item i).
    obj_code = f"""
if (idx != 0) return 0.0f;
int used = 0;
int size = sol.dim2_sizes[0];
for (int b = 0; b < {max_bins}; b++) {{
bool has = false;
for (int i = 0; i < size; i++)
if (sol.data[0][i] == b) {{ has = true; break; }}
if (has) used++;
}}
return (float)used;
"""
    # Penalty: total overload summed over bins exceeding their capacity.
    penalty_code = f"""
float penalty = 0.0f;
int size = sol.dim2_sizes[0];
for (int b = 0; b < {max_bins}; b++) {{
float load = 0.0f;
for (int i = 0; i < size; i++)
if (sol.data[0][i] == b) load += d_weights[i];
if (load > {cap}f)
penalty += (load - {cap}f);
}}
return penalty;
"""
    return compile_and_solve(
        compute_obj=obj_code, compute_penalty=penalty_code,
        data={"d_weights": w},
        encoding="integer", dim2=64, n=n,
        value_lower=0, value_upper=max_bins - 1,
        **_solver_kwargs(kw),
    )
# ============================================================
# Load Balancing
# ============================================================
def solve_load_balance(proc_times: np.ndarray, num_machines: int,
                       **kw) -> Dict[str, Any]:
    """Solve Load Balancing (minimize makespan).

    Args:
        proc_times: 1D array of task processing times (float32).
        num_machines: Number of machines.
    """
    times = validate_1d(proc_times, "proc_times")
    n = len(times)
    num_machines = validate_positive_int(num_machines, "num_machines")
    # Generated objective: accumulate per-machine load (gene i = machine id
    # of task i), then return the maximum load.
    obj_code = f"""
if (idx != 0) return 0.0f;
float loads[{num_machines}];
for (int m = 0; m < {num_machines}; m++) loads[m] = 0.0f;
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++)
loads[sol.data[0][i]] += d_proc[i];
float makespan = 0.0f;
for (int m = 0; m < {num_machines}; m++)
if (loads[m] > makespan) makespan = loads[m];
return makespan;
"""
    return compile_and_solve(
        compute_obj=obj_code,
        data={"d_proc": times},
        encoding="integer",
        dim2=64,
        n=n,
        value_lower=0,
        value_upper=num_machines - 1,
        **_solver_kwargs(kw),
    )
# ============================================================
# GPU info (pure Python, no JIT needed)
# ============================================================
def gpu_info() -> Dict[str, Any]:
    """Get GPU device information via nvidia-smi.

    Returns:
        Dict with ``device_count`` (0 when nvidia-smi is unavailable or
        fails) and, when at least one GPU is present, ``name``,
        ``compute_capability``, ``memory`` and ``driver_version`` of the
        first device.
    """
    import subprocess
    info: Dict[str, Any] = {"device_count": 0}
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=name,compute_cap,memory.total,driver_version",
             "--format=csv,noheader"],
            stderr=subprocess.DEVNULL, text=True
        ).strip()
        # One CSV line per GPU; previously device_count was hard-coded to 1
        # even on multi-GPU machines.
        lines = [ln for ln in out.split("\n") if ln.strip()]
        if not lines:
            return info
        parts = [p.strip() for p in lines[0].split(",")]
        info["device_count"] = len(lines)
        info["name"] = parts[0]
        info["compute_capability"] = parts[1]
        info["memory"] = parts[2]
        info["driver_version"] = parts[3]
    except Exception:
        # Best-effort probe: missing binary / driver errors simply report
        # zero devices rather than raising.
        pass
    return info

View file

@ -0,0 +1,90 @@
/**
 * cuda_utils.cuh - CUDA utility helpers
 *
 * Scope: error checking, device info, device-side RNG helpers.
 * Rule: every CUDA API call must be wrapped in CUDA_CHECK.
 */
#pragma once
#include <cstdio>
#include <cstdlib>
#include <curand_kernel.h>
// ============================================================
// Error checking
// ============================================================
// Abort the process on any failed CUDA API call, reporting file/line.
#define CUDA_CHECK(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)
// Call after a kernel launch (catches asynchronous launch errors).
#define CUDA_CHECK_LAST() do { \
    cudaError_t err = cudaGetLastError(); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA kernel error at %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)
// ============================================================
// Device info
// ============================================================
// Print basic properties of the currently selected CUDA device.
inline void print_device_info() {
    int device;
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDevice(&device));
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
    printf("GPU: %s\n", prop.name);
    printf(" SM count: %d\n", prop.multiProcessorCount);
    printf(" Max threads/SM: %d\n", prop.maxThreadsPerMultiProcessor);
    printf(" Shared mem/blk: %zu KB\n", prop.sharedMemPerBlock / 1024);
    printf(" Global mem: %.1f GB\n", prop.totalGlobalMem / 1e9);
    printf(" Compute cap: %d.%d\n", prop.major, prop.minor);
}
// ============================================================
// Random-number helpers (device side)
// ============================================================
// Initialize one curand state per thread: common seed, thread id as the
// subsequence index so streams are statistically independent.
__global__ void init_curand_kernel(curandState* states, unsigned long long seed, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        curand_init(seed, tid, 0, &states[tid]);
    }
}
// Device side: random integer in [0, bound).
// NOTE(review): plain modulo carries a slight bias when `bound` does not
// divide 2^32 — acceptable here for metaheuristic random choices.
__device__ inline int rand_int(curandState* state, int bound) {
    return curand(state) % bound;
}
// Device side: Fisher-Yates shuffle — random permutation of arr[0..n-1].
__device__ inline void shuffle(int* arr, int n, curandState* state) {
    for (int i = n - 1; i > 0; i--) {
        int j = rand_int(state, i + 1);
        int tmp = arr[i];
        arr[i] = arr[j];
        arr[j] = tmp;
    }
}
// ============================================================
// Kernel launch configuration helpers
// ============================================================
// Integer ceiling division.
inline int div_ceil(int a, int b) { return (a + b - 1) / b; }
// Number of blocks needed to cover n items at the given block size.
inline int calc_grid_size(int n, int block_size = 256) {
    return div_ceil(n, block_size);
}

View file

@ -0,0 +1,141 @@
/**
* gpu_cache.cuh - GPU 全局内存哈希表(通用缓存组件)
*
* 设计:
* - 开放寻址,固定容量(power of 2),线性探测
* - key = uint64_t由 Problem 自行计算 hash
* - value = float单个指标值
* - 无锁:允许 race condition缓存语义偶尔脏读可接受
* - 自带命中/未命中原子计数器
*
* 用法:
* GpuCache cache = GpuCache::allocate(65536); // host
* // ... pass cache as Problem member to kernels ...
* cache.print_stats(); // host
* cache.destroy(); // host
*
* 参考:scute 项目 LRUCache(key = metric_type + content_hash)
*/
#pragma once
#include "cuda_utils.cuh"
#include <cstdint>
// ============================================================
// Constants
// ============================================================
// Sentinel marking an empty slot (all bits set — matches cudaMemset 0xFF).
static constexpr uint64_t CACHE_EMPTY_KEY = 0xFFFFFFFFFFFFFFFFULL;
static constexpr int CACHE_MAX_PROBE = 8; // max linear-probe steps per lookup/insert
// ============================================================
// GpuCache struct (POD — safe to copy by value into kernels)
// ============================================================
struct GpuCache {
    uint64_t* keys;         // GPU global memory
    float* values;          // GPU global memory
    unsigned int* d_hits;   // atomic hit counter (GPU)
    unsigned int* d_misses; // atomic miss counter (GPU)
    int capacity;           // must be a power of two
    int mask;               // = capacity - 1
    // ---- Host-side operations ----
    // Allocate device storage and reset all slots and counters.
    static GpuCache allocate(int cap = 65536) {
        GpuCache c;
        c.capacity = cap;
        c.mask = cap - 1;
        CUDA_CHECK(cudaMalloc(&c.keys, sizeof(uint64_t) * cap));
        CUDA_CHECK(cudaMalloc(&c.values, sizeof(float) * cap));
        CUDA_CHECK(cudaMalloc(&c.d_hits, sizeof(unsigned int)));
        CUDA_CHECK(cudaMalloc(&c.d_misses, sizeof(unsigned int)));
        c.clear();
        return c;
    }
    // Null cache: is_enabled() == false, holds no device memory.
    static GpuCache disabled() {
        GpuCache c;
        c.keys = nullptr; c.values = nullptr;
        c.d_hits = nullptr; c.d_misses = nullptr;
        c.capacity = 0; c.mask = 0;
        return c;
    }
    bool is_enabled() const { return keys != nullptr; }
    // Reset: the 0xFF fill makes every key equal CACHE_EMPTY_KEY.
    void clear() {
        CUDA_CHECK(cudaMemset(keys, 0xFF, sizeof(uint64_t) * capacity));
        CUDA_CHECK(cudaMemset(d_hits, 0, sizeof(unsigned int)));
        CUDA_CHECK(cudaMemset(d_misses, 0, sizeof(unsigned int)));
    }
    // Free device memory; safe to call on a disabled/already-destroyed cache.
    void destroy() {
        if (keys) cudaFree(keys);
        if (values) cudaFree(values);
        if (d_hits) cudaFree(d_hits);
        if (d_misses) cudaFree(d_misses);
        keys = nullptr; values = nullptr;
        d_hits = nullptr; d_misses = nullptr;
    }
    // Copy the counters back to host and print hit-rate statistics.
    void print_stats() const {
        if (!keys) { printf(" Cache: disabled\n"); return; }
        unsigned int h = 0, m = 0;
        CUDA_CHECK(cudaMemcpy(&h, d_hits, sizeof(unsigned int), cudaMemcpyDeviceToHost));
        CUDA_CHECK(cudaMemcpy(&m, d_misses, sizeof(unsigned int), cudaMemcpyDeviceToHost));
        unsigned int total = h + m;
        float rate = total > 0 ? (float)h / total * 100.0f : 0.0f;
        printf(" Cache: %u lookups | %u hits + %u misses | hit rate = %.1f%%\n",
               total, h, m, rate);
        printf(" Cache: capacity = %d entries (%.1f KB)\n",
               capacity, capacity * (sizeof(uint64_t) + sizeof(float)) / 1024.0f);
    }
};
// ============================================================
// Device functions: hash / lookup / insert
// ============================================================
/// FNV-1a hash over an ordered int sequence (e.g. customer ids of a route).
__device__ inline uint64_t route_hash(const int* data, int len) {
    uint64_t h = 14695981039346656037ULL; // FNV offset basis
    for (int i = 0; i < len; i++) {
        h ^= (uint64_t)(unsigned int)data[i];
        h *= 1099511628211ULL; // FNV prime
    }
    return (h == CACHE_EMPTY_KEY) ? h - 1 : h; // avoid colliding with the empty-slot sentinel
}
/// Lookup: returns true on a hit and writes the cached value to `out`.
__device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out) {
    int slot = (int)(key & (uint64_t)c.mask);
    for (int p = 0; p < CACHE_MAX_PROBE; p++) {
        int idx = (slot + p) & c.mask;
        uint64_t k = c.keys[idx];
        if (k == key) {
            out = c.values[idx];
            return true;
        }
        if (k == CACHE_EMPTY_KEY) return false; // empty slot -> key definitely absent
    }
    return false; // probe budget exhausted
}
/// Insert: store key/value, overwriting an existing entry with the same key.
/// If every probe slot holds a different key, evict the first slot. Lock-free
/// by design: concurrent writers may race (acceptable for cache semantics).
__device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value) {
    int slot = (int)(key & (uint64_t)c.mask);
    for (int p = 0; p < CACHE_MAX_PROBE; p++) {
        int idx = (slot + p) & c.mask;
        uint64_t k = c.keys[idx];
        if (k == CACHE_EMPTY_KEY || k == key) {
            c.keys[idx] = key;
            c.values[idx] = value;
            return;
        }
    }
    // Probe window full: evict the home slot.
    int idx = slot & c.mask;
    c.keys[idx] = key;
    c.values[idx] = value;
}

View file

@ -0,0 +1,121 @@
#pragma once
#include "types.cuh"
#include <vector>
#include <algorithm>
#include <numeric>
namespace heuristic_init {
// Fill every row of the solution with the same ordering (single-permutation
// encodings simply replicate it across the dim1 rows).
template<typename Sol>
static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
                                     int dim1, int dim2) {
    for (int r = 0; r < dim1; r++) {
        sol.dim2_sizes[r] = dim2;
        for (int c = 0; c < dim2; c++)
            sol.data[r][c] = order[c];
    }
    sol.penalty = 0.0f;
    for (int i = 0; i < MAX_OBJ; i++) sol.objectives[i] = 0.0f;
}
// Partition mode: split the ordering evenly across dim1 rows, each element
// appearing exactly once (remainder elements go to the first rows).
template<typename Sol>
static void build_partition_from_order(Sol& sol, const std::vector<int>& order,
                                       int dim1, int total_elements) {
    int idx = 0;
    for (int r = 0; r < dim1; r++) {
        int count = total_elements / dim1;
        if (r < total_elements % dim1) count++;
        sol.dim2_sizes[r] = count;
        for (int c = 0; c < count; c++)
            sol.data[r][c] = order[idx++];
    }
    sol.penalty = 0.0f;
    for (int i = 0; i < MAX_OBJ; i++) sol.objectives[i] = 0.0f;
}
// Build greedy seed solutions from heuristic matrices: for each usable
// matrix, emit four orderings (row-sum asc/desc, col-sum asc/desc).
// Returns an empty vector for non-permutation encodings or unusable input.
template<typename Sol>
std::vector<Sol> build_from_matrices(const HeuristicMatrix* matrices, int num_matrices,
                                     int dim1, int dim2, EncodingType encoding,
                                     bool partition_mode = false, int total_elements = 0) {
    std::vector<Sol> results;
    if (encoding != EncodingType::Permutation) return results;
    int elem_count = partition_mode ? total_elements : dim2;
    if (num_matrices <= 0 || elem_count <= 0) return results;
    auto make_sol = [&](const std::vector<int>& order) {
        Sol sol{};
        if (partition_mode)
            build_partition_from_order(sol, order, dim1, total_elements);
        else
            build_sorted_permutation(sol, order, dim1, dim2);
        return sol;
    };
    for (int m = 0; m < num_matrices; m++) {
        const float* mat = matrices[m].data;
        int N = matrices[m].N;
        if (!mat || N < elem_count) continue; // matrix too small to rank all elements
        std::vector<float> row_sum(N, 0.0f);
        std::vector<float> col_sum(N, 0.0f);
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                row_sum[i] += mat[i * N + j];
                col_sum[j] += mat[i * N + j];
            }
        // For partition problems (e.g. VRPTW) the matrix includes the depot
        // at index 0: rank only customers 1..elem_count and convert to
        // 0-based customer ids afterwards.
        std::vector<int> idx;
        if (partition_mode && N > elem_count) {
            for (int i = 1; i <= elem_count; i++) idx.push_back(i);
        } else {
            idx.resize(elem_count);
            std::iota(idx.begin(), idx.end(), 0);
        }
        auto to_customer = [&](std::vector<int>& order) {
            if (partition_mode && N > elem_count) {
                for (auto& v : order) v -= 1; // matrix node index -> customer id
            }
        };
        // row_sum ascending
        {
            auto order = idx;
            std::sort(order.begin(), order.end(),
                      [&](int a, int b) { return row_sum[a] < row_sum[b]; });
            to_customer(order);
            results.push_back(make_sol(order));
        }
        // row_sum descending
        {
            auto order = idx;
            std::sort(order.begin(), order.end(),
                      [&](int a, int b) { return row_sum[a] > row_sum[b]; });
            to_customer(order);
            results.push_back(make_sol(order));
        }
        // col_sum ascending
        {
            auto order = idx;
            std::sort(order.begin(), order.end(),
                      [&](int a, int b) { return col_sum[a] < col_sum[b]; });
            to_customer(order);
            results.push_back(make_sol(order));
        }
        // col_sum descending
        {
            auto order = idx;
            std::sort(order.begin(), order.end(),
                      [&](int a, int b) { return col_sum[a] > col_sum[b]; });
            to_customer(order);
            results.push_back(make_sol(order));
        }
    }
    return results;
}
} // namespace heuristic_init

View file

@ -0,0 +1,258 @@
/**
* init_selection.cuh - 初始解采样择优 + NSGA-II 选择
*
* Host 端逻辑,在 solver 初始化阶段调用一次。
* 从 K × pop_size 个候选解中选出 pop_size 个作为初始种群。
*
* 选择策略:
* 1. 核心目标预留名额(按 importance 分配)
* 2. NSGA-II 选择(非支配排序 + 加权拥挤度)
* 3. 纯随机保底(多样性)
*
* 单目标时自动退化为 top-N 排序,无需分支。
*/
#pragma once
#include "types.cuh"
#include <algorithm>
#include <vector>
#include <cmath>
#include <cstring>
namespace init_sel {
// ============================================================
// Candidate objective info (downloaded from GPU, used host-side)
// ============================================================
struct CandidateInfo {
    int idx;             // original index in the candidate array
    float objs[MAX_OBJ]; // normalized objective values (smaller is better)
    float penalty;       // constraint violation; <= 0 treated as feasible
    int rank;            // non-domination level (0 = Pareto front)
    float crowding;      // crowding distance
    bool selected;       // already picked into the initial population
};
// ============================================================
// Fast non-dominated sort (NSGA-II)
// ============================================================
// Complexity: O(M x N^2), M = objectives, N = candidates — fine for the
// initialization scenario (N up to a few thousand, M <= 4).
inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
                                   int num_obj,
                                   std::vector<std::vector<int>>& fronts) {
    int n = (int)cands.size();
    std::vector<int> dom_count(n, 0);         // how many solutions dominate i
    std::vector<std::vector<int>> dom_set(n); // which solutions i dominates
    // a dominates b: a <= b on every objective and < on at least one.
    // Penalty is handled first: feasible dominates infeasible, and among
    // infeasible solutions the smaller penalty dominates.
    auto dominates = [&](int a, int b) -> bool {
        const auto& ca = cands[a];
        const auto& cb = cands[b];
        if (ca.penalty <= 0.0f && cb.penalty > 0.0f) return true;
        if (ca.penalty > 0.0f && cb.penalty <= 0.0f) return false;
        if (ca.penalty > 0.0f && cb.penalty > 0.0f) return ca.penalty < cb.penalty;
        bool all_leq = true;
        bool any_lt = false;
        for (int m = 0; m < num_obj; m++) {
            if (ca.objs[m] > cb.objs[m]) { all_leq = false; break; }
            if (ca.objs[m] < cb.objs[m]) any_lt = true;
        }
        return all_leq && any_lt;
    };
    // Pairwise domination counts and sets.
    for (int i = 0; i < n; i++) {
        for (int j = i + 1; j < n; j++) {
            if (dominates(i, j)) {
                dom_set[i].push_back(j);
                dom_count[j]++;
            } else if (dominates(j, i)) {
                dom_set[j].push_back(i);
                dom_count[i]++;
            }
        }
    }
    // Peel off the fronts layer by layer.
    fronts.clear();
    std::vector<int> current_front;
    for (int i = 0; i < n; i++) {
        if (dom_count[i] == 0) {
            cands[i].rank = 0;
            current_front.push_back(i);
        }
    }
    int front_idx = 0;
    while (!current_front.empty()) {
        fronts.push_back(current_front);
        std::vector<int> next_front;
        for (int i : current_front) {
            for (int j : dom_set[i]) {
                dom_count[j]--;
                if (dom_count[j] == 0) {
                    cands[j].rank = front_idx + 1;
                    next_front.push_back(j);
                }
            }
        }
        current_front = next_front;
        front_idx++;
    }
}
// ============================================================
// Weighted crowding distance
// ============================================================
// Standard crowding distance, with each objective's gap contribution
// scaled by that objective's importance weight.
inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
                                       const std::vector<int>& front,
                                       int num_obj,
                                       const float* importance) {
    int n = (int)front.size();
    if (n <= 2) {
        for (int i : front) cands[i].crowding = 1e18f; // boundary solutions: infinite
        return;
    }
    for (int i : front) cands[i].crowding = 0.0f;
    std::vector<int> sorted_idx(front.begin(), front.end());
    for (int m = 0; m < num_obj; m++) {
        // Sort the front by objective m.
        std::sort(sorted_idx.begin(), sorted_idx.end(),
                  [&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });
        float range = cands[sorted_idx[n-1]].objs[m] - cands[sorted_idx[0]].objs[m];
        if (range < 1e-12f) continue; // objective has no discriminating power
        // Extreme solutions get effectively infinite distance.
        cands[sorted_idx[0]].crowding += 1e18f;
        cands[sorted_idx[n-1]].crowding += 1e18f;
        // Interior: neighbor gap scaled by the importance weight.
        float w = importance[m];
        for (int i = 1; i < n - 1; i++) {
            float gap = cands[sorted_idx[i+1]].objs[m] - cands[sorted_idx[i-1]].objs[m];
            cands[sorted_idx[i]].crowding += w * (gap / range);
        }
    }
}
// ============================================================
// Main selection: choose `target` candidates out of N
// ============================================================
// Returns the indices of the selected candidates. The last
// `num_reserved_random` slots are left for the caller to fill randomly.
inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
                                     int num_obj,
                                     const float* importance,
                                     int target,
                                     int num_reserved_random) {
    // --- 1. Reserved quota per core objective ---
    int num_reserve_total = target - num_reserved_random;
    // Quota = importance[m] x 30% of the non-random slots; the remaining
    // ~70% are filled by NSGA-II selection below.
    float reserve_ratio = 0.3f;
    std::vector<int> selected;
    selected.reserve(target);
    // For each objective, take its top candidates (smaller is better).
    for (int m = 0; m < num_obj; m++) {
        int quota = (int)(num_reserve_total * importance[m] * reserve_ratio);
        if (quota < 1 && num_obj > 1) quota = 1; // at least one per objective
        std::vector<int> by_obj(cands.size());
        for (int i = 0; i < (int)cands.size(); i++) by_obj[i] = i;
        std::sort(by_obj.begin(), by_obj.end(),
                  [&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });
        int added = 0;
        for (int i = 0; i < (int)by_obj.size() && added < quota; i++) {
            int idx = by_obj[i];
            if (!cands[idx].selected) {
                cands[idx].selected = true;
                selected.push_back(idx);
                added++;
            }
        }
    }
    // --- 2. NSGA-II fills the remaining slots ---
    int remaining = target - num_reserved_random - (int)selected.size();
    if (remaining > 0) {
        std::vector<std::vector<int>> fronts;
        fast_nondominated_sort(cands, num_obj, fronts);
        for (auto& front : fronts) {
            if (remaining <= 0) break;
            // Skip candidates already taken by the quota phase.
            std::vector<int> available;
            for (int i : front) {
                if (!cands[i].selected) available.push_back(i);
            }
            if ((int)available.size() <= remaining) {
                // The whole front fits.
                for (int i : available) {
                    cands[i].selected = true;
                    selected.push_back(i);
                    remaining--;
                }
            } else {
                // Front must be truncated: pick by weighted crowding distance.
                weighted_crowding_distance(cands, available, num_obj, importance);
                std::sort(available.begin(), available.end(),
                          [&](int a, int b) { return cands[a].crowding > cands[b].crowding; });
                for (int i = 0; i < remaining; i++) {
                    cands[available[i]].selected = true;
                    selected.push_back(available[i]);
                }
                remaining = 0;
            }
        }
    }
    return selected;
}
// ============================================================
// Single-objective fast path: plain top-N by scalar sort
// ============================================================
inline std::vector<int> top_n_select(std::vector<CandidateInfo>& cands,
                                     int target,
                                     int num_reserved_random) {
    int to_select = target - num_reserved_random;
    // Order: feasible first, then smaller penalty, then objs[0]
    // (already normalized so that smaller is better).
    std::vector<int> indices(cands.size());
    for (int i = 0; i < (int)cands.size(); i++) indices[i] = i;
    std::sort(indices.begin(), indices.end(), [&](int a, int b) {
        if (cands[a].penalty <= 0.0f && cands[b].penalty > 0.0f) return true;
        if (cands[a].penalty > 0.0f && cands[b].penalty <= 0.0f) return false;
        if (cands[a].penalty > 0.0f && cands[b].penalty > 0.0f)
            return cands[a].penalty < cands[b].penalty;
        return cands[a].objs[0] < cands[b].objs[0];
    });
    std::vector<int> selected;
    selected.reserve(to_select);
    for (int i = 0; i < to_select && i < (int)indices.size(); i++) {
        selected.push_back(indices[i]);
        cands[indices[i]].selected = true;
    }
    return selected;
}
} // namespace init_sel

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,212 @@
/**
* population.cuh - 种群管理
*
* v2.0: Block 级架构
* - RNG 数组大小 = pop_size * block_size,每个 block 内每个线程独立 RNG
* - 初始化 kernel 保持 1-thread-per-solution初始化只做一次不需要并行
* - find_best_kernel 保持单线程(种群规模不大)
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
// ============================================================
// Device kernels (templated on the solution type)
// ============================================================
// One thread per solution: fill each row with the identity permutation
// [0, dim2_default) and shuffle it in place.
template<typename Sol>
__global__ void init_permutation_kernel(Sol* pop, int pop_size,
                                        int dim1, int dim2_default,
                                        curandState* rng_states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;
    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    for (int r = 0; r < dim1; r++) {
        sol.dim2_sizes[r] = dim2_default;
        for (int c = 0; c < dim2_default; c++) sol.data[r][c] = c;
        shuffle(sol.data[r], dim2_default, rng);
    }
    sol.penalty = 0.0f;
}
// Fill every row with independent uniform random bits. One thread per solution.
template<typename Sol>
__global__ void init_binary_kernel(Sol* pop, int pop_size,
                                   int dim1, int dim2_default,
                                   curandState* rng_states) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= pop_size) return;
    curandState* rng = &rng_states[gid];
    Sol& sol = pop[gid];
    for (int row = 0; row < dim1; row++) {
        sol.dim2_sizes[row] = dim2_default;
        for (int col = 0; col < dim2_default; col++)
            sol.data[row][col] = curand(rng) & 1; // uniform 0/1
    }
    sol.penalty = 0.0f;
}
// Fill every row with uniform random integers in [lb, ub]. One thread per solution.
template<typename Sol>
__global__ void init_integer_kernel(Sol* pop, int pop_size,
                                    int dim1, int dim2_default,
                                    int lb, int ub,
                                    curandState* rng_states) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= pop_size) return;
    curandState* rng = &rng_states[gid];
    Sol& sol = pop[gid];
    const int span = ub - lb + 1; // inclusive range width
    for (int row = 0; row < dim1; row++) {
        sol.dim2_sizes[row] = dim2_default;
        for (int col = 0; col < dim2_default; col++)
            sol.data[row][col] = lb + (curand(rng) % span);
    }
    sol.penalty = 0.0f;
}
// ============================================================
// Multiset-permutation init — every value in [0, N) appears R times,
// total length N*R.
// ============================================================
// Used by the JSP operation-sequence encoding: N=num_jobs, R=num_ops;
// value j appearing R times stands for job j's R operations.
template<typename Sol>
__global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
                                          int dim1, int num_values, int repeat_count,
                                          curandState* rng_states) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= pop_size) return;
    curandState* rng = &rng_states[gid];
    Sol& sol = pop[gid];
    const int total = num_values * repeat_count;
    for (int row = 0; row < dim1; row++) {
        sol.dim2_sizes[row] = total;
        // Lay out the multiset [0,0,..,1,1,..,N-1,..] then shuffle it.
        int write = 0;
        for (int v = 0; v < num_values; v++)
            for (int rep = 0; rep < repeat_count; rep++)
                sol.data[row][write++] = v;
        shuffle(sol.data[row], total, rng);
    }
    sol.penalty = 0.0f;
}
// ============================================================
// Partition init — elements {0..total_elements-1} distributed without
// repetition across dim1 rows.
// ============================================================
// Row 0's storage doubles as scratch: the complete shuffled element list
// is written there first, rows r > 0 then copy their slice out of it,
// and row 0 keeps its own slice in place (its first `count` entries).
// NOTE(review): this assumes Sol::DIM2 >= total_elements so row 0 can
// hold the full list during initialization — confirm at the call site.
// Rows get near-equal sizes; the first (total_elements % dim1) rows get
// one extra element.
template<typename Sol>
__global__ void init_partition_kernel(Sol* pop, int pop_size,
                                      int dim1, int total_elements,
                                      curandState* rng_states) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= pop_size) return;
    Sol& sol = pop[tid];
    curandState* rng = &rng_states[tid];
    for (int i = 0; i < total_elements; i++) sol.data[0][i] = i;
    shuffle(sol.data[0], total_elements, rng);
    int idx = 0;  // read cursor into the shuffled list in row 0
    for (int r = 0; r < dim1; r++) {
        int count = total_elements / dim1;
        if (r < total_elements % dim1) count++;
        sol.dim2_sizes[r] = count;
        if (r > 0) {
            for (int c = 0; c < count; c++)
                sol.data[r][c] = sol.data[0][idx + c];
        }
        idx += count;
    }
    sol.penalty = 0.0f;
}
// Single-threaded linear scan for the best solution index (the population
// is small, so one thread is sufficient).
template<typename Sol>
__global__ void find_best_kernel(const Sol* pop, int pop_size,
                                 ObjConfig oc, int* best_idx) {
    if (blockIdx.x != 0 || threadIdx.x != 0) return;
    int winner = 0;
    for (int i = 1; i < pop_size; i++) {
        if (is_better(pop[i], pop[winner], oc)) winner = i;
    }
    *best_idx = winner;
}
// ============================================================
// Host-side RAII wrapper over the device population (templated)
// ============================================================
// Owns two device allocations:
//   d_solutions  — pop_size solutions
//   d_rng_states — pop_size * block_size curand states (in the
//                  block-level architecture every thread of every
//                  block gets its own independent RNG)
// Move-only: copying would double-free the device buffers.
template<typename Sol>
class Population {
public:
    Sol* d_solutions = nullptr;
    curandState* d_rng_states = nullptr; // size = pop_size * block_size
    int size = 0;                        // number of solutions
    int rng_count = 0;                   // total number of RNG states
    Population() = default;
    // block_size: threads per block in the block-level architecture.
    // RNG array size = pop_size * block_size, one state per thread.
    void allocate(int pop_size, int block_size = 128) {
        size = pop_size;
        rng_count = pop_size * block_size;
        CUDA_CHECK(cudaMalloc(&d_solutions, sizeof(Sol) * size));
        CUDA_CHECK(cudaMalloc(&d_rng_states, sizeof(curandState) * rng_count));
    }
    // Seed all curand states (block_size here is only the launch shape).
    void init_rng(unsigned seed, int block_size = 256) {
        int grid = calc_grid_size(rng_count, block_size);
        init_curand_kernel<<<grid, block_size>>>(d_rng_states, seed, rng_count);
        CUDA_CHECK_LAST();
    }
    // Launch the init kernel that matches the problem's row mode / encoding.
    void init_population(const ProblemConfig& cfg, int block_size = 256) {
        int grid = calc_grid_size(size, block_size);
        if (cfg.row_mode == RowMode::Partition) {
            init_partition_kernel<<<grid, block_size>>>(
                d_solutions, size, cfg.dim1, cfg.total_elements, d_rng_states);
        } else if (cfg.encoding == EncodingType::Permutation && cfg.perm_repeat_count > 1) {
            // Multiset permutation (e.g. JSP operation encoding)
            int num_values = cfg.dim2_default / cfg.perm_repeat_count;
            init_multiset_perm_kernel<<<grid, block_size>>>(
                d_solutions, size, cfg.dim1, num_values, cfg.perm_repeat_count, d_rng_states);
        } else {
            switch (cfg.encoding) {
                case EncodingType::Permutation:
                    init_permutation_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default, d_rng_states);
                    break;
                case EncodingType::Binary:
                    init_binary_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default, d_rng_states);
                    break;
                case EncodingType::Integer:
                    init_integer_kernel<<<grid, block_size>>>(
                        d_solutions, size, cfg.dim1, cfg.dim2_default,
                        cfg.value_lower_bound, cfg.value_upper_bound,
                        d_rng_states);
                    break;
            }
        }
        CUDA_CHECK_LAST();
    }
    // Synchronous copy of one solution from device to host.
    Sol download_solution(int idx) const {
        Sol h_sol;
        CUDA_CHECK(cudaMemcpy(&h_sol, d_solutions + idx, sizeof(Sol), cudaMemcpyDeviceToHost));
        return h_sol;
    }
    ~Population() {
        // No CUDA_CHECK here: freeing during teardown must not abort.
        if (d_solutions) cudaFree(d_solutions);
        if (d_rng_states) cudaFree(d_rng_states);
    }
    Population(const Population&) = delete;
    Population& operator=(const Population&) = delete;
    Population(Population&& o) noexcept
        : d_solutions(o.d_solutions), d_rng_states(o.d_rng_states),
          size(o.size), rng_count(o.rng_count) {
        o.d_solutions = nullptr; o.d_rng_states = nullptr;
        o.size = 0; o.rng_count = 0;
    }
    // Fix: move assignment was missing (declaring the move constructor
    // suppresses the implicit one), so an existing Population could never
    // be reseated. Frees current buffers before stealing.
    Population& operator=(Population&& o) noexcept {
        if (this != &o) {
            if (d_solutions) cudaFree(d_solutions);
            if (d_rng_states) cudaFree(d_rng_states);
            d_solutions = o.d_solutions;
            d_rng_states = o.d_rng_states;
            size = o.size;
            rng_count = o.rng_count;
            o.d_solutions = nullptr; o.d_rng_states = nullptr;
            o.size = 0; o.rng_count = 0;
        }
        return *this;
    }
};

View file

@ -0,0 +1,125 @@
/**
* relation_matrix.cuh - G/O 关系矩阵管理
*
* G[i][j]: 分组倾向(元素 i 和 j 应在同一行的倾向,对称)
* O[i][j]: 排序倾向(元素 i 应排在 j 前面的倾向,不对称)
*
* 更新来源:历史最优解统计
* 每当 host 端获取到当前 best 解,扫描所有元素对关系:
* - 同行 → G[i][j] 增强
* - i 在 j 前 → O[i][j] 增强
* 使用 EMA 衰减M[i][j] = α * M[i][j] + (1-α) * signal
*
* 生命周期:
* 1. relation_matrix_create(N) — 分配 host/device 内存,初始化为 0
* 2. relation_matrix_update(rm, sol, dim1) — 从一个解更新 G/Ohost 端)
* 3. relation_matrix_upload(rm) — 上传 h_G/h_O 到 d_G/d_O
* 4. relation_matrix_destroy(rm) — 释放内存
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include <cstring>
// ============================================================
// Create / destroy
// ============================================================
// Allocates host and device N*N G/O matrices, all zero-initialized.
// decay: EMA decay coefficient alpha used by relation_matrix_update.
inline RelationMatrix relation_matrix_create(int N, float decay = 0.95f) {
    RelationMatrix rm;
    rm.N = N;
    rm.decay = decay;
    rm.update_count = 0;
    // Fix: the element count must be computed in size_t — `new float[N * N]`
    // with int arithmetic overflows for N > ~46340 even though `bytes`
    // was already size_t.
    size_t count = (size_t)N * N;
    size_t bytes = count * sizeof(float);
    rm.h_G = new float[count];
    rm.h_O = new float[count];
    memset(rm.h_G, 0, bytes);
    memset(rm.h_O, 0, bytes);
    CUDA_CHECK(cudaMalloc(&rm.d_G, bytes));
    CUDA_CHECK(cudaMalloc(&rm.d_O, bytes));
    CUDA_CHECK(cudaMemset(rm.d_G, 0, bytes));
    CUDA_CHECK(cudaMemset(rm.d_O, 0, bytes));
    return rm;
}
// Release host and device buffers and reset the struct to an empty state.
inline void relation_matrix_destroy(RelationMatrix& rm) {
    delete[] rm.h_G;
    delete[] rm.h_O;
    CUDA_CHECK(cudaFree(rm.d_G));
    CUDA_CHECK(cudaFree(rm.d_O));
    rm.h_G = nullptr;
    rm.h_O = nullptr;
    rm.d_G = nullptr;
    rm.d_O = nullptr;
    rm.N = 0;
}
// ============================================================
// Update G/O from one solution (host side)
// ============================================================
// sol:  the current best solution (already downloaded to the host)
// dim1: number of rows actually in use
//
// Logic — for every element pair (val_a, val_b) within a row of sol:
//   same row           -> reinforce G[val_a][val_b] (symmetrically)
//   val_a before val_b -> reinforce O[val_a][val_b]
// EMA: decay every entry by alpha, add (1 - alpha) * signal, clamp to [0, 1].
//
// Note: element values are only meaningful in [0, N):
//   partition encoding (VRP): values are customer ids
//   single-row permutation (TSP): values are city ids
template<typename Sol>
void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
    int N = rm.N;
    float alpha = rm.decay;
    float signal_strength = 1.0f;
    // Decay all existing entries first
    for (int i = 0; i < N * N; i++) {
        rm.h_G[i] *= alpha;
        rm.h_O[i] *= alpha;
    }
    // Scan pairwise element relations within the solution
    for (int r = 0; r < dim1; r++) {
        int sz = sol.dim2_sizes[r];
        for (int c1 = 0; c1 < sz; c1++) {
            int val_a = sol.data[r][c1];
            if (val_a < 0 || val_a >= N) continue;  // ignore out-of-range values
            for (int c2 = c1 + 1; c2 < sz; c2++) {
                int val_b = sol.data[r][c2];
                if (val_b < 0 || val_b >= N) continue;
                // Same row -> reinforce G (symmetric)
                rm.h_G[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
                rm.h_G[val_b * N + val_a] += (1.0f - alpha) * signal_strength;
                // val_a precedes val_b -> reinforce O[val_a][val_b]
                rm.h_O[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
            }
        }
    }
    // Clamp to [0, 1]
    for (int i = 0; i < N * N; i++) {
        if (rm.h_G[i] > 1.0f) rm.h_G[i] = 1.0f;
        if (rm.h_O[i] > 1.0f) rm.h_O[i] = 1.0f;
    }
    rm.update_count++;
}
// ============================================================
// Upload to the GPU
// ============================================================
// Push the host-side G/O matrices to their device mirrors.
inline void relation_matrix_upload(const RelationMatrix& rm) {
    const size_t bytes = sizeof(float) * (size_t)rm.N * rm.N;
    CUDA_CHECK(cudaMemcpy(rm.d_G, rm.h_G, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(rm.d_O, rm.h_O, bytes, cudaMemcpyHostToDevice));
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,721 @@
/**
* types.cuh - 核心类型定义
*
* 包含编码类型、Solution 模板、ProblemConfig/SolverConfig、
* SeqRegistryAOS 序列级权重、KStepConfig多步执行
* RelationMatrixG/O 关系矩阵、ProblemBaseCRTP 基类)
*/
#pragma once
// ============================================================
// Compile-time constants
// ============================================================
constexpr int MAX_OBJ = 4;  // at most 4 objectives (16 bytes; not worth templating)
constexpr int MAX_SEQ = 32; // max sequences (~16 built-in + <=8 custom operators, with headroom)
constexpr int MAX_K = 3;    // max steps for multi-step execution (K=1,2,3)
// AOS weight bounds (after normalization)
constexpr float AOS_WEIGHT_FLOOR = 0.05f; // minimum weight floor (guarantees exploration)
constexpr float AOS_WEIGHT_CAP = 0.35f;   // maximum weight cap (prevents winner-takes-all)
// ============================================================
// Enum types
// ============================================================
enum class EncodingType {
    Permutation, // permutation: elements do not repeat
    Binary,      // 0-1: flip is the main operator
    Integer      // bounded integers
};
enum class RowMode {
    Single,    // dim1=1, single row (TSP/QAP/Knapsack — most problems)
    Fixed,     // dim1>1, equal-length immutable rows (JSP-Int/Schedule; SPLIT/MERGE forbidden)
    Partition  // dim1>1, elements partitioned across rows, variable row length (CVRP/VRPTW)
};
enum class ObjDir {
    Minimize,
    Maximize
};
// Multi-objective comparison mode
enum class CompareMode {
    Weighted,      // weighted sum: sum(weight[i] * obj[i]), smaller is better
    Lexicographic  // compare objective by objective in priority order
};
enum class MigrateStrategy {
    Ring,   // ring: each island's best -> neighbor's worst (slow spread, high diversity)
    TopN,   // global Top-N round-robin distribution (fast spread, strong convergence)
    Hybrid  // both: Top-N replaces the worst + Ring replaces the second worst
};
// ============================================================
// SeqID — unified OperationSequence numbering
// ============================================================
// Every SeqID names one concrete search operation (atomic or multi-step).
// AOS weight-tracking granularity = SeqID: each sequence has its own weight.
//
// Naming rule: SEQ_{encoding}_{operation}
// Row-level operations shared across encodings use unified ids.
namespace seq {
// --- Permutation, in-row (element level) ---
constexpr int SEQ_PERM_SWAP = 0;    // swap two positions
constexpr int SEQ_PERM_REVERSE = 1; // 2-opt (reverse an interval)
constexpr int SEQ_PERM_INSERT = 2;  // insert (move an element to a new position)
constexpr int SEQ_PERM_3OPT = 3;    // 3-opt (cut three edges, reconnect)
// --- Permutation, in-row (segment level) ---
constexpr int SEQ_PERM_OR_OPT = 4;  // or-opt (move k consecutive elements)
// --- Permutation, in-row (composite) ---
constexpr int SEQ_PERM_DOUBLE_SWAP = 30; // two consecutive swaps (same row)
constexpr int SEQ_PERM_TRIPLE_SWAP = 31; // three consecutive swaps (same row)
// --- Permutation, cross-row (element level) ---
constexpr int SEQ_PERM_CROSS_RELOCATE = 5; // move one element to another row
constexpr int SEQ_PERM_CROSS_SWAP = 6;     // swap one element between rows
// --- Permutation, cross-row (segment level) ---
constexpr int SEQ_PERM_SEG_RELOCATE = 7;   // move a segment to another row
constexpr int SEQ_PERM_SEG_SWAP = 8;       // swap segments between rows (2-opt*)
constexpr int SEQ_PERM_CROSS_EXCHANGE = 9; // exchange segments (order preserved)
// --- Binary, in-row (element level) ---
constexpr int SEQ_BIN_FLIP = 0;     // flip one bit
constexpr int SEQ_BIN_SWAP = 1;     // swap two bits
// --- Binary, in-row (segment level) ---
constexpr int SEQ_BIN_SEG_FLIP = 2; // flip k consecutive bits
constexpr int SEQ_BIN_K_FLIP = 3;   // flip k random bits at once
// --- Binary, cross-row ---
constexpr int SEQ_BIN_CROSS_SWAP = 4;     // swap one bit between two rows
constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5; // swap one segment between two rows
// --- Shared: row level (encoding-agnostic) ---
constexpr int SEQ_ROW_SWAP = 10;    // swap two rows
constexpr int SEQ_ROW_REVERSE = 11; // reverse a row's permutation
constexpr int SEQ_ROW_SPLIT = 12;   // split one row into two
constexpr int SEQ_ROW_MERGE = 13;   // merge two rows
// --- Special ---
constexpr int SEQ_PERTURBATION = 14; // perturbation (multi-step, irreversible)
// --- Integer, in-row (element level) ---
constexpr int SEQ_INT_RANDOM_RESET = 0; // reset one random position to a random value in [lb, ub]
constexpr int SEQ_INT_DELTA = 1;        // one random position +/-k, clamped to [lb, ub]
constexpr int SEQ_INT_SWAP = 2;         // swap the values of two positions
// --- Integer, in-row (segment level) ---
constexpr int SEQ_INT_SEG_RESET = 3; // reset k consecutive positions
constexpr int SEQ_INT_K_DELTA = 4;   // k random positions each +/-1
// --- Integer, cross-row ---
constexpr int SEQ_INT_CROSS_SWAP = 5; // swap one position between two rows
// --- LNS (large neighborhood search) ---
constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20; // shuffle a contiguous segment
constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21; // shuffle scattered random positions
constexpr int SEQ_LNS_GUIDED_REBUILD = 22;  // relation-matrix guided rebuild
} // namespace seq
// ============================================================
// RelationMatrix — G/O relation matrices (GPU global memory)
// ============================================================
// G[i][j]: grouping affinity of elements i and j (symmetric; larger = more likely same group)
// O[i][j]: tendency of element i to precede element j (asymmetric)
// Stored as a flat row-major array of N * N floats.
// Small scale (N < 200) is kept dense; P2 will add sparsification.
//
// Updated:  on the host, between batches (from historical best solutions).
// Consumed: in-kernel by SEQ_LNS_GUIDED_REBUILD.
struct RelationMatrix {
    float* d_G;       // G matrix on the GPU [N * N]
    float* d_O;       // O matrix on the GPU [N * N]
    float* h_G;       // G matrix on the host [N * N] (staging buffer for uploads)
    float* h_O;       // O matrix on the host [N * N]
    int N;            // total number of elements
    float decay;      // decay coefficient alpha (default 0.95)
    int update_count; // number of updates so far (cold-start detection)
};
// ============================================================
// SeqRegistry — registry of sequences available at runtime
// ============================================================
// Derived automatically from EncodingType and dim1, then shipped to the
// GPU for sample_sequence().
enum class SeqCategory : int {
    InRow = 0,    // in-row operators: swap, reverse, insert, ...
    CrossRow = 1, // cross-row operators: cross_relocate, cross_swap, seg_relocate, ...
    RowLevel = 2, // row-level operators: row_swap, row_reverse, split, merge
    LNS = 3,      // large neighborhood search
};
struct SeqRegistry {
    int ids[MAX_SEQ];                // SeqIDs of the available sequences
    int count;                       // number of available sequences
    float weights[MAX_SEQ];          // current weight per sequence (normalized for sampling)
    float max_w[MAX_SEQ];            // per-sequence weight cap (0 = no per-seq cap, global cap applies)
    SeqCategory categories[MAX_SEQ]; // category per sequence (for constraint-directed search)
};
// ============================================================
// KStepConfig — step-count selection for multi-step execution
// ============================================================
// K=1: single step (current behavior); K=2/3: run several sequences in a
// row before evaluating. This is the first layer of the two-layer weight
// system.
//
// Adaptive policy:
//   - start conservative: large K=1 weight, small K>1 weights
//   - a K>1 step yields improvement -> grow that K's weight
//   - long stagnation -> reset/grow K>1 weights (escape local optima)
struct KStepConfig {
    float weights[MAX_K]; // sampling weights for K=1,2,3 (normalized)
    int stagnation_count; // consecutive improvement-free batches (reset trigger)
    int stagnation_limit; // threshold that triggers the reset (default 5 batches)
};
// Build the default K-step configuration: K=1 dominates initially,
// K=2/3 get only a small exploration share.
inline KStepConfig build_kstep_config() {
    KStepConfig kc;
    kc.weights[0] = 0.80f; // K=1: dominant at start
    kc.weights[1] = 0.15f; // K=2: light exploration
    kc.weights[2] = 0.05f; // K=3: minimal exploration
    kc.stagnation_count = 0;
    kc.stagnation_limit = 5;
    return kc;
}
// ============================================================
// ProblemProfile — problem profile inferred from structural features
// ============================================================
// Layer 1: purely structural inference (semantics-agnostic); drives
// operator registration and initial weights.
// Future layer 2: finer-grained profiles (multi-attribute, heavily
// constrained, ...).
enum class ScaleClass { Small, Medium, Large };
enum class StructClass { SingleSeq, MultiFixed, MultiPartition };
struct ProblemProfile {
    EncodingType encoding;
    ScaleClass scale;       // derived from dim2_default (see classify_problem)
    StructClass structure;  // derived from dim1 and row_mode
    float cross_row_prob;   // copied from ProblemConfig
};
// classify_problem() is defined after ProblemConfig
// ============================================================
// Weight presets — driven by ScaleClass
// ============================================================
// Used by build_seq_registry: expensive operators get smaller initial
// weights as the instance grows.
struct WeightPreset {
    float w_cubic;     // initial weight for 3-opt (cubic-cost moves)
    float w_quadratic; // initial weight for or-opt (quadratic-cost moves)
    float w_lns;       // initial weight for the LNS sequences
    float lns_cap;     // per-sequence weight cap for the LNS sequences
};
// Scale-driven weight presets: larger instances get smaller weights for
// the expensive operators and for LNS. Falls back to the Small preset.
inline WeightPreset get_weight_preset(ScaleClass scale) {
    if (scale == ScaleClass::Medium) return { 0.30f, 0.70f, 0.004f, 0.01f };
    if (scale == ScaleClass::Large)  return { 0.05f, 0.30f, 0.001f, 0.01f };
    return { 0.50f, 0.80f, 0.006f, 0.01f }; // Small (and defensive fallback)
}
// classify_problem() and build_seq_registry() are defined after ProblemConfig
// ============================================================
// Solution<D1, D2> — templated solution representation
// ============================================================
// D1: max row count (TSP=1, VRP<=16, Schedule<=8)
// D2: max columns per row (TSP<=64, knapsack<=32)
// Each Problem picks the smallest sufficient D1/D2 so the compiler
// generates a compact struct.
template<int D1, int D2>
struct Solution {
    static constexpr int DIM1 = D1; // compile-time row cap
    static constexpr int DIM2 = D2; // compile-time column cap
    int data[D1][D2];               // D1 x D2 x 4 bytes
    int dim2_sizes[D1];             // actual length of each row, D1 x 4 bytes
    float objectives[MAX_OBJ];      // 16 bytes (fixed)
    float penalty;                  // 4 bytes; > 0 means infeasible
};
// ============================================================
// ProblemConfig — runtime problem metadata
// ============================================================
struct ProblemConfig {
    EncodingType encoding;
    int dim1;         // rows actually used (<= D1)
    int dim2_default; // columns actually used (<= D2)
    int num_objectives;
    ObjDir obj_dirs[MAX_OBJ];
    float obj_weights[MAX_OBJ]; // weights in Weighted mode
    // Multi-objective comparison
    CompareMode compare_mode = CompareMode::Weighted;
    int obj_priority[MAX_OBJ] = {0, 1, 2, 3}; // comparison order (indices) in Lexicographic mode
    float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f}; // lexicographic tolerance: |diff| <= tol counts as a tie
    int value_lower_bound;
    int value_upper_bound;
    // v3.4: unified row mode
    RowMode row_mode = RowMode::Single; // row mode: Single/Fixed/Partition
    float cross_row_prob = 0.0f;        // probability of cross-row moves (0 = in-row operators only)
    int total_elements = 0;             // total element count in Partition mode
    int perm_repeat_count = 1;          // repetitions per value in a permutation (1 = plain, >1 = multiset)
};
// ============================================================
// SolverConfig — solver parameters
// ============================================================
struct SolverConfig {
    int pop_size = 0;           // population size (0 = auto-match maximum GPU parallelism)
    int max_gen = 1000;
    float mutation_rate = 0.1f;
    unsigned seed = 42;
    bool verbose = true;
    int print_every = 100;
    // Island-model parameters
    int num_islands = 1;        // 0 = adaptive, 1 = pure hill climbing (no islands), >1 = island model
    int migrate_interval = 100; // run a migration every this many generations
    MigrateStrategy migrate_strategy = MigrateStrategy::Hybrid;
    // Simulated-annealing parameters
    float sa_temp_init = 0.0f;  // initial temperature (0 = SA disabled, pure hill climbing)
    float sa_alpha = 0.998f;    // cooling rate (temperature multiplied by alpha each generation)
    // v1.0: crossover parameters
    float crossover_rate = 0.1f; // per-generation probability of crossover (vs mutation)
    // v2.0: adaptive operator selection
    bool use_aos = false;        // enable AOS (operator weights updated between batches)
    float aos_weight_floor = AOS_WEIGHT_FLOOR; // runtime-overridable floor
    float aos_weight_cap = AOS_WEIGHT_CAP;     // runtime-overridable cap
    // v2.1: initial-solution strategy
    int init_oversample = 4;        // oversampling factor (1 = plain random, no pick-best)
    float init_random_ratio = 0.3f; // share of purely random solutions (diversity floor)
    // v3.0: operational features
    float time_limit_sec = 0.0f; // wall-clock limit (0 = none, run max_gen to completion)
    int stagnation_limit = 0;    // reheat after this many improvement-free batches (0 = disabled)
    float reheat_ratio = 0.5f;   // fraction of the initial temperature restored on reheat
    // v3.5: CUDA Graph
    bool use_cuda_graph = false; // enable CUDA Graph (reduces kernel-launch overhead)
    // v3.6: AOS update-frequency control
    int aos_update_interval = 10; // update AOS weights every N batches (fewer cudaMemcpy syncs)
    // v4.0: constraint-directed + phased search
    bool use_constraint_directed = false; // scale cross-row operator weights by the penalty share
    bool use_phased_search = false;       // adjust the global floor/cap by search progress
    // Phased-search thresholds (three phases)
    float phase_explore_end = 0.30f;  // end of the exploration phase (progress fraction)
    float phase_refine_start = 0.70f; // start of the refinement phase (progress fraction)
    // Constraint-directed parameter
    float constraint_boost_max = 2.5f; // max multiplier on cross-row operator caps under heavy violation
};
// ============================================================
// classify_problem — derive a ProblemProfile from a ProblemConfig
// ============================================================
// Scale is bucketed by row length (<=100 Small, <=250 Medium, else Large);
// structure follows dim1 and the row mode.
inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
    ProblemProfile prof;
    prof.encoding = pcfg.encoding;
    const int n = pcfg.dim2_default;
    prof.scale = (n <= 100) ? ScaleClass::Small
               : (n <= 250) ? ScaleClass::Medium
                            : ScaleClass::Large;
    if (pcfg.dim1 <= 1) {
        prof.structure = StructClass::SingleSeq;
    } else {
        prof.structure = (pcfg.row_mode == RowMode::Partition)
                             ? StructClass::MultiPartition
                             : StructClass::MultiFixed;
    }
    prof.cross_row_prob = pcfg.cross_row_prob;
    return prof;
}
// ============================================================
// build_seq_registry — operator registration driven by ProblemProfile
// ============================================================
// Registration order defines the registry slot order. Initial weights
// come from the scale preset and cross_row_prob; they are normalized to
// sum to 1 at the end so they form a sampling distribution.
inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
    SeqRegistry reg;
    reg.count = 0;
    for (int i = 0; i < MAX_SEQ; i++) {
        reg.ids[i] = -1; reg.weights[i] = 0.0f;
        reg.max_w[i] = 0.0f; reg.categories[i] = SeqCategory::InRow;
    }
    // Append one entry: (SeqID, initial weight, category, optional per-seq cap).
    auto add = [&](int id, float w, SeqCategory cat, float cap = 0.0f) {
        if (reg.count >= MAX_SEQ) return;
        reg.ids[reg.count] = id;
        reg.weights[reg.count] = w;
        reg.max_w[reg.count] = cap;
        reg.categories[reg.count] = cat;
        reg.count++;
    };
    WeightPreset wp = get_weight_preset(prof.scale);
    bool multi_row = (prof.structure != StructClass::SingleSeq);
    float cr = prof.cross_row_prob;
    if (prof.encoding == EncodingType::Permutation) {
        add(seq::SEQ_PERM_SWAP, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_REVERSE, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_INSERT, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_PERM_DOUBLE_SWAP, 0.5f, SeqCategory::InRow);
        add(seq::SEQ_PERM_TRIPLE_SWAP, 0.3f, SeqCategory::InRow);
        add(seq::SEQ_PERM_3OPT, wp.w_cubic, SeqCategory::InRow);
        add(seq::SEQ_PERM_OR_OPT, wp.w_quadratic, SeqCategory::InRow);
        // Cross-row operators only make sense with multiple rows and a
        // nonzero cross-row probability; weights scale with cr.
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_PERM_CROSS_RELOCATE, 0.6f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_CROSS_SWAP, 0.6f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_SEG_RELOCATE, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_SEG_SWAP, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_PERM_CROSS_EXCHANGE, 0.4f * cr, SeqCategory::CrossRow);
        }
        if (multi_row) {
            add(seq::SEQ_ROW_SWAP, 0.3f, SeqCategory::RowLevel);
            add(seq::SEQ_ROW_REVERSE, 0.2f, SeqCategory::RowLevel);
            // SPLIT/MERGE are only valid when row lengths may change.
            if (prof.structure == StructClass::MultiPartition) {
                add(seq::SEQ_ROW_SPLIT, 0.2f, SeqCategory::RowLevel);
                add(seq::SEQ_ROW_MERGE, 0.2f, SeqCategory::RowLevel);
            }
        }
        add(seq::SEQ_LNS_SEGMENT_SHUFFLE, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
        add(seq::SEQ_LNS_SCATTER_SHUFFLE, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
        add(seq::SEQ_LNS_GUIDED_REBUILD, wp.w_lns, SeqCategory::LNS, wp.lns_cap);
    }
    else if (prof.encoding == EncodingType::Binary) {
        add(seq::SEQ_BIN_FLIP, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_BIN_SWAP, 0.8f, SeqCategory::InRow);
        add(seq::SEQ_BIN_SEG_FLIP, 0.6f, SeqCategory::InRow);
        add(seq::SEQ_BIN_K_FLIP, 0.6f, SeqCategory::InRow);
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_BIN_CROSS_SWAP, 0.5f * cr, SeqCategory::CrossRow);
            add(seq::SEQ_BIN_SEG_CROSS_SWAP, 0.4f * cr, SeqCategory::CrossRow);
        }
        if (multi_row) {
            add(seq::SEQ_ROW_SWAP, 0.3f, SeqCategory::RowLevel);
            add(seq::SEQ_ROW_REVERSE, 0.2f, SeqCategory::RowLevel);
            if (prof.structure == StructClass::MultiPartition) {
                add(seq::SEQ_ROW_SPLIT, 0.2f, SeqCategory::RowLevel);
                add(seq::SEQ_ROW_MERGE, 0.2f, SeqCategory::RowLevel);
            }
        }
    }
    else if (prof.encoding == EncodingType::Integer) {
        add(seq::SEQ_INT_RANDOM_RESET, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_INT_DELTA, 1.0f, SeqCategory::InRow);
        add(seq::SEQ_INT_SWAP, 0.8f, SeqCategory::InRow);
        add(seq::SEQ_INT_SEG_RESET, 0.6f, SeqCategory::InRow);
        add(seq::SEQ_INT_K_DELTA, 0.6f, SeqCategory::InRow);
        if (multi_row && cr > 0.0f) {
            add(seq::SEQ_INT_CROSS_SWAP, 0.5f * cr, SeqCategory::CrossRow);
        }
        if (multi_row) {
            add(seq::SEQ_ROW_SWAP, 0.3f, SeqCategory::RowLevel);
            add(seq::SEQ_ROW_REVERSE, 0.2f, SeqCategory::RowLevel);
            if (prof.structure == StructClass::MultiPartition) {
                add(seq::SEQ_ROW_SPLIT, 0.2f, SeqCategory::RowLevel);
                add(seq::SEQ_ROW_MERGE, 0.2f, SeqCategory::RowLevel);
            }
        }
    }
    // Normalize the weights into a sampling distribution.
    float sum = 0.0f;
    for (int i = 0; i < reg.count; i++) sum += reg.weights[i];
    if (sum > 0.0f) {
        for (int i = 0; i < reg.count; i++) reg.weights[i] /= sum;
    }
    return reg;
}
// ============================================================
// ObjConfig — objective-comparison config shipped to the GPU (compact)
// ============================================================
struct ObjConfig {
    int num_obj;
    CompareMode mode;
    ObjDir dirs[MAX_OBJ];     // direction of each objective
    float weights[MAX_OBJ];   // weights in Weighted mode
    int priority[MAX_OBJ];    // comparison order in Lexicographic mode
    float tolerance[MAX_OBJ]; // tolerances in Lexicographic mode
};
// Build the GPU-side ObjConfig from a ProblemConfig (host side).
inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
    ObjConfig oc;
    oc.num_obj = pcfg.num_objectives;
    oc.mode = pcfg.compare_mode;
    for (int k = 0; k < MAX_OBJ; k++) {
        oc.dirs[k]      = pcfg.obj_dirs[k];
        oc.weights[k]   = pcfg.obj_weights[k];
        oc.priority[k]  = pcfg.obj_priority[k];
        oc.tolerance[k] = pcfg.obj_tolerance[k];
    }
    return oc;
}
// ============================================================
// SolveResult — return value of solve()
// ============================================================
enum class StopReason { MaxGen, TimeLimit, Stagnation };
template<typename Sol>
struct SolveResult {
    Sol best_solution;                           // best solution found
    float elapsed_ms = 0.0f;                     // wall-clock time spent
    int generations = 0;                         // generations actually run
    StopReason stop_reason = StopReason::MaxGen; // why the solver stopped
};
// ============================================================
// Objective-importance mapping — unifies Weighted / Lexicographic
// ============================================================
// Used by initial-population selection (NSGA-II weighted crowding plus
// reserved slots for the core objective).
// Weighted:      importance[i] = weight[i] / sum(weights)
// Lexicographic: importance[i] = 0.5^rank[i] / sum(0.5^rank)
//                -> 1st priority ~57%, 2nd ~29%, 3rd ~14%
inline void compute_importance(const ObjConfig& oc, float* importance) {
    float total = 0.0f;
    for (int i = 0; i < oc.num_obj; i++) {
        float v;
        if (oc.mode == CompareMode::Weighted) {
            v = oc.weights[i];
        } else {
            v = 1.0f;
            for (int r = 0; r < oc.priority[i]; r++) v *= 0.5f; // 0.5^rank
        }
        importance[i] = v;
        total += v;
    }
    if (total > 0.0f) {
        for (int i = 0; i < oc.num_obj; i++) importance[i] /= total;
    }
}
// ============================================================
// Comparison helpers — Weighted / Lexicographic
// ============================================================
// Map every objective to "smaller is better": Maximize objectives are negated.
__device__ __host__ inline float normalize_obj(float val, ObjDir dir) {
    if (dir == ObjDir::Maximize) return -val;
    return val;
}
// Core comparison: is `a` better than `b`?
template<typename Sol>
__device__ inline bool is_better(const Sol& a, const Sol& b,
                                 const ObjConfig& oc) {
    // Penalty first: a feasible solution always beats an infeasible one;
    // two infeasible solutions compare by smaller penalty.
    if (a.penalty <= 0.0f && b.penalty > 0.0f) return true;
    if (a.penalty > 0.0f && b.penalty <= 0.0f) return false;
    if (a.penalty > 0.0f && b.penalty > 0.0f) return a.penalty < b.penalty;
    if (oc.mode == CompareMode::Weighted) {
        // Weighted sum; direction is handled by normalize_obj (Maximize is negated)
        float sum_a = 0.0f, sum_b = 0.0f;
        for (int i = 0; i < oc.num_obj; i++) {
            float na = normalize_obj(a.objectives[i], oc.dirs[i]);
            float nb = normalize_obj(b.objectives[i], oc.dirs[i]);
            sum_a += oc.weights[i] * na;
            sum_b += oc.weights[i] * nb;
        }
        return sum_a < sum_b;
    } else {
        // Lexicographic: compare objective by objective in priority order
        for (int p = 0; p < oc.num_obj; p++) {
            int idx = oc.priority[p];
            float va = normalize_obj(a.objectives[idx], oc.dirs[idx]);
            float vb = normalize_obj(b.objectives[idx], oc.dirs[idx]);
            float diff = va - vb;
            if (diff < -oc.tolerance[idx]) return true;  // a clearly better
            if (diff > oc.tolerance[idx]) return false;  // b clearly better
            // Within tolerance -> treated as equal, compare the next objective
        }
        return false; // equal on every objective within tolerance
    }
}
// Scalarization (for the SA acceptance probability): smaller is better.
template<typename Sol>
__device__ __host__ inline float scalar_objective(const Sol& sol,
                                                  const ObjConfig& oc) {
    if (oc.mode != CompareMode::Weighted) {
        // Lexicographic: SA uses the top-priority objective as the scalar.
        const int top = oc.priority[0];
        return normalize_obj(sol.objectives[top], oc.dirs[top]);
    }
    float acc = 0.0f;
    for (int i = 0; i < oc.num_obj; i++)
        acc += oc.weights[i] * normalize_obj(sol.objectives[i], oc.dirs[i]);
    return acc;
}
// Lightweight comparison directly on float[] objective arrays (avoids
// copying a whole Sol).
__device__ inline bool obj_is_better(const float* new_objs, const float* old_objs,
                                     const ObjConfig& oc) {
    if (oc.mode == CompareMode::Weighted) {
        float acc_new = 0.0f, acc_old = 0.0f;
        for (int i = 0; i < oc.num_obj; i++) {
            acc_new += oc.weights[i] * normalize_obj(new_objs[i], oc.dirs[i]);
            acc_old += oc.weights[i] * normalize_obj(old_objs[i], oc.dirs[i]);
        }
        return acc_new < acc_old;
    }
    // Lexicographic: walk objectives in priority order; a difference within
    // tolerance counts as a tie and defers to the next objective.
    for (int p = 0; p < oc.num_obj; p++) {
        const int idx = oc.priority[p];
        const float delta = normalize_obj(new_objs[idx], oc.dirs[idx])
                          - normalize_obj(old_objs[idx], oc.dirs[idx]);
        if (delta < -oc.tolerance[idx]) return true;
        if (delta > oc.tolerance[idx]) return false;
    }
    return false; // tied on every objective
}
// Lightweight scalarization directly on a float[] objective array.
__device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig& oc) {
    if (oc.mode != CompareMode::Weighted) {
        // Lexicographic: use the top-priority objective as the scalar.
        const int top = oc.priority[0];
        return normalize_obj(objs[top], oc.dirs[top]);
    }
    float acc = 0.0f;
    for (int i = 0; i < oc.num_obj; i++)
        acc += oc.weights[i] * normalize_obj(objs[i], oc.dirs[i]);
    return acc;
}
// ============================================================
// AOSStats — adaptive operator selection statistics (one per block)
// ============================================================
// v3.0: granularity went from 3 tiers to MAX_SEQ individual sequences.
// Tracks usage and improvement counts per sequence; the host aggregates
// them after each batch and updates the SeqRegistry weights.
struct AOSStats {
    // Operator layer (second layer)
    int usage[MAX_SEQ];       // times each sequence was used
    int improvement[MAX_SEQ]; // times each sequence improved (delta < 0 and accepted)
    // K-step layer (first layer)
    int k_usage[MAX_K];       // usage count for K=1,2,3
    int k_improvement[MAX_K]; // improvement count for K=1,2,3
};
// ============================================================
// ObjDef — definition of a single objective (compile-time constant)
// ============================================================
struct ObjDef {
    ObjDir dir;      // optimization direction
    float weight;    // weight in Weighted mode
    float tolerance; // tolerance in Lexicographic mode
};
// ============================================================
// HeuristicMatrix — data-matrix descriptor for heuristic initial solutions
// ============================================================
struct HeuristicMatrix {
    const float* data; // host-side N*N matrix
    int N;             // dimension
};
// ============================================================
// ProblemBase<Derived, D1, D2> — CRTP base class
//
// A user problem derives from this and provides:
//   static constexpr ObjDef OBJ_DEFS[] = {...};        — objective metadata
//   __device__ float compute_obj(int idx, ...) const;  — objective dispatch
//   __device__ float compute_penalty(...) const;
//
// Convention: OBJ_DEFS and compute_obj are written side by side; `case N`
// corresponds to OBJ_DEFS[N]. NUM_OBJ is derived from sizeof(OBJ_DEFS),
// so it never needs manual maintenance.
//
// The base class automatically supplies:
//   evaluate(sol)        — loops over the objective list calling compute_obj
//   fill_obj_config(cfg) — fills ProblemConfig objectives from OBJ_DEFS
//   obj_config()         — builds an ObjConfig directly
// ============================================================
template<typename Derived, int D1_, int D2_>
struct ProblemBase {
    static constexpr int D1 = D1_;
    static constexpr int D2 = D2_;
    using Sol = Solution<D1, D2>;
    // NUM_OBJ is derived automatically from the OBJ_DEFS array
    static constexpr int NUM_OBJ = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
    // Automatic evaluation: iterate the objective list, then the penalty
    __device__ void evaluate(Sol& sol) const {
        const auto& self = static_cast<const Derived&>(*this);
        constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
        for (int i = 0; i < n; i++)
            sol.objectives[i] = self.compute_obj(i, sol);
        sol.penalty = self.compute_penalty(sol);
    }
    // Fill the objective part of a ProblemConfig from OBJ_DEFS
    void fill_obj_config(ProblemConfig& cfg) const {
        constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
        cfg.num_objectives = n;
        for (int i = 0; i < n; i++) {
            cfg.obj_dirs[i] = Derived::OBJ_DEFS[i].dir;
            cfg.obj_weights[i] = Derived::OBJ_DEFS[i].weight;
            cfg.obj_tolerance[i] = Derived::OBJ_DEFS[i].tolerance;
            cfg.obj_priority[i] = i; // list order is priority order
        }
    }
    // Build an ObjConfig directly (used by the solver)
    ObjConfig obj_config() const {
        ProblemConfig pcfg;
        fill_obj_config(pcfg);
        return make_obj_config(pcfg);
    }
    // Hot working-set size (bytes) per block in global memory; used by the
    // auto pop_size heuristic to estimate L2 cache pressure.
    // Default = shared_mem_bytes() (when the data lives in smem, the gmem
    // working set does not matter).
    // Override in the subclass when shared_mem_bytes() returns 0 (data does
    // not fit in smem): return the real data size, e.g. n*n*sizeof(float).
    size_t working_set_bytes() const {
        return static_cast<const Derived&>(*this).shared_mem_bytes();
    }
    // Optional: seed the G/O relation matrices (prior knowledge for GUIDED_REBUILD)
    // G[i*N+j]: grouping affinity of elements i and j (symmetric, [0,1])
    // O[i*N+j]: tendency of element i to precede j (asymmetric, [0,1])
    // Default: leave everything zero; the EMA accumulates from good
    // historical solutions during the search.
    // Example override: nearby elements -> high G and O.
    void init_relation_matrix(float* h_G, float* h_O, int N) const {
        (void)h_G; (void)h_O; (void)N; // default: do nothing (stay all-zero)
    }
    // Optional: expose host-side data matrices for heuristic initial-solution
    // construction. Default returns 0 (none); subclasses fill `out` and
    // return the actual count.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        (void)out; (void)max_count;
        return 0;
    }
};

View file

@ -0,0 +1,114 @@
/**
* assignment.cuh - 指派问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include "operators.cuh"
struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
    const float* d_cost; // device cost matrix [n*n] (rebound to smem by load_shared)
    const float* h_cost; // host-side cost matrix (used by init_relation_matrix)
    int n;
    // ---- objective computation ----
    // Total cost of the permutation: sum over rows i of cost[i][assign[i]].
    __device__ float calc_total_cost(const Sol& sol) const {
        float total = 0.0f;
        const int* assign = sol.data[0];
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            total += d_cost[i * n + assign[i]];
        return total;
    }
    // ---- objective registry (OBJ_DEFS and compute_obj must stay 1:1) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_cost(sol); // OBJ_DEFS[0]
            default: return 0.0f;
        }
    }
    // Unconstrained: every permutation is a feasible assignment.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }
    // Permutation encoding over a single row of length n.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }
    // ---- shared memory interface ----
    // Stage the cost matrix in shared memory only when it fits under 48KB;
    // otherwise return 0 and keep reading from global memory.
    static constexpr size_t SMEM_LIMIT = 48 * 1024;
    size_t shared_mem_bytes() const {
        size_t need = (size_t)n * n * sizeof(float);
        return need <= SMEM_LIMIT ? need : 0;
    }
    // Actual hot-data footprint regardless of where the matrix lives.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sc = reinterpret_cast<float*>(smem);
        int total = n * n;
        for (int i = tid; i < total; i += bsz) sc[i] = d_cost[i];
        d_cost = sc; // subsequent evaluations read the shared copy
    }
    // Cost prior: tasks preferred by similar agents get a high G score.
    // O matrix: similar tasks get a slight ordering signal as well.
    // NOTE: O(N^3) host-side loop (cosine similarity of all column pairs);
    // runs once at setup time.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_cost || N != n) return;
        // For each task pair, cosine similarity of their cost column vectors -> G.
        float max_c = 0.0f;
        for (int i = 0; i < N * N; i++)
            if (h_cost[i] > max_c) max_c = h_cost[i];
        if (max_c <= 0.0f) return; // degenerate all-zero cost matrix: no prior
        for (int j = 0; j < N; j++)
            for (int k = 0; k < N; k++) {
                if (j == k) continue;
                // G: the more similar two tasks' cost vectors, the more
                // likely swapping them is useful.
                float dot = 0.0f, nj = 0.0f, nk = 0.0f;
                for (int i = 0; i < N; i++) {
                    float cj = h_cost[i * N + j] / max_c;
                    float ck = h_cost[i * N + k] / max_c;
                    dot += cj * ck;
                    nj += cj * cj;
                    nk += ck * ck;
                }
                float denom = sqrtf(nj) * sqrtf(nk);
                float sim = (denom > 1e-6f) ? dot / denom : 0.0f;
                G[j * N + k] = sim * 0.2f;
                O[j * N + k] = sim * 0.05f;
            }
    }
    // Uploads the cost matrix to the device; pair with destroy() to release.
    static AssignmentProblem create(const float* hc, int n) {
        AssignmentProblem prob;
        prob.n = n;
        prob.h_cost = hc;
        float* dc;
        CUDA_CHECK(cudaMalloc(&dc, sizeof(float)*n*n));
        CUDA_CHECK(cudaMemcpy(dc, hc, sizeof(float)*n*n, cudaMemcpyHostToDevice));
        prob.d_cost = dc;
        return prob;
    }
    void destroy() {
        if (d_cost) { cudaFree(const_cast<float*>(d_cost)); d_cost = nullptr; }
        h_cost = nullptr;
    }
};

View file

@ -0,0 +1,97 @@
/**
* bin_packing.cuh - 一维装箱问题Integer 编码 + 约束)
*
* N 个物品,每个重量 w[i],装入最多 B 个箱子,每个箱子容量 C。
* 决策变量data[0][i] ∈ [0, B-1],表示物品 i 放入的箱子编号。
* 目标:最小化使用的箱子数。
* 约束:每个箱子总重不超过 C超出部分作为 penalty。
*
* 验证实例8 物品 weights=[7,5,3,4,6,2,8,1], C=10, 最优=4 箱
* 箱0={7,3}=10, 箱1={5,4,1}=10, 箱2={6,2}=8, 箱3={8}=8
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
struct BinPackingProblem : ProblemBase<BinPackingProblem, 1, 64> {
    // Capacity of the per-thread bin bookkeeping arrays used during
    // evaluation. Bin indices are clamped against it so that an instance
    // with max_bins > kMaxBins can no longer write past the end of
    // used[]/load[] (previously undefined behavior); results are unchanged
    // for all instances with max_bins <= 32.
    static constexpr int kMaxBins = 32;
    const float* d_weights; // item weights [n] (rebound to smem by load_shared)
    int n;                  // number of items
    int max_bins;           // maximum number of bins B
    float capacity;         // per-bin capacity C
    // Objective 0: number of distinct bins referenced by the assignment.
    __device__ float calc_bins_used(const Sol& sol) const {
        bool used[kMaxBins] = {};
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int b = sol.data[0][i];
            if (b >= 0 && b < max_bins && b < kMaxBins) used[b] = true;
        }
        int count = 0;
        for (int b = 0; b < max_bins && b < kMaxBins; b++)
            if (used[b]) count++;
        return (float)count;
    }
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_bins_used
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_bins_used(sol);
            default: return 0.0f;
        }
    }
    // Constraint: each bin's total weight must not exceed capacity; the
    // overweight amount is penalized linearly with a factor of 10.
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        float load[kMaxBins] = {};
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int b = sol.data[0][i];
            if (b >= 0 && b < max_bins && b < kMaxBins)
                load[b] += d_weights[i];
        }
        for (int b = 0; b < max_bins && b < kMaxBins; b++) {
            float over = load[b] - capacity;
            if (over > 0.0f) penalty += over * 10.0f;
        }
        return penalty;
    }
    // Integer encoding: data[0][i] in [0, max_bins-1] is the bin of item i.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = 1; cfg.dim2_default = n;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = max_bins - 1;
        fill_obj_config(cfg);
        return cfg;
    }
    // Weight vector is small; always stage it in shared memory.
    size_t shared_mem_bytes() const {
        return (size_t)n * sizeof(float);
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sw = reinterpret_cast<float*>(smem);
        for (int i = tid; i < n; i += bsz) sw[i] = d_weights[i];
        d_weights = sw; // subsequent evaluations read the shared copy
    }
    // Uploads weights to the device; pair with destroy() to release.
    static BinPackingProblem create(const float* h_weights, int n,
                                    int max_bins, float capacity) {
        BinPackingProblem prob;
        prob.n = n; prob.max_bins = max_bins; prob.capacity = capacity;
        float* dw;
        CUDA_CHECK(cudaMalloc(&dw, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(dw, h_weights, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_weights = dw;
        return prob;
    }
    void destroy() {
        if (d_weights) cudaFree(const_cast<float*>(d_weights));
        d_weights = nullptr;
    }
};

View file

@ -0,0 +1,79 @@
/**
* graph_color.cuh - 图着色问题Integer 编码)
*
* N 个节点的图,用 k 种颜色着色。
* 决策变量data[0][i] ∈ [0, k-1],表示节点 i 的颜色。
* 目标:最小化冲突边数(相邻节点同色的边数)。
*
* 验证实例Petersen 图10 节点 15 边,色数=3最优冲突=0
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
struct GraphColorProblem : ProblemBase<GraphColorProblem, 1, 64> {
    const int* d_adj; // adjacency matrix [N*N] (1 = adjacent, 0 = not)
    int n;            // number of nodes
    int k;            // number of colors
    // Objective 0: number of conflicting edges (adjacent nodes sharing a color).
    __device__ float calc_conflicts(const Sol& sol) const {
        int conflicts = 0;
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            for (int j = i + 1; j < size; j++)
                if (d_adj[i * n + j] && sol.data[0][i] == sol.data[0][j])
                    conflicts++;
        return (float)conflicts;
    }
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_conflicts
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_conflicts(sol);
            default: return 0.0f;
        }
    }
    // Unconstrained: conflicts are minimized as the objective itself.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }
    // Integer encoding: data[0][i] in [0, k-1] is the color of node i.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = 1; cfg.dim2_default = n;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = k - 1;
        fill_obj_config(cfg);
        return cfg;
    }
    // Stage the adjacency matrix in shared memory only when it fits under the
    // 48KB static limit; otherwise return 0 and keep it in global memory
    // (same policy as AssignmentProblem/TSPProblem — previously the request
    // was uncapped and exceeded the limit for n > 110).
    static constexpr size_t SMEM_LIMIT = 48 * 1024;
    size_t shared_mem_bytes() const {
        size_t need = (size_t)n * n * sizeof(int);
        return need <= SMEM_LIMIT ? need : 0;
    }
    // Actual hot-data footprint regardless of where the matrix lives.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(int);
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int* sa = reinterpret_cast<int*>(smem);
        int total = n * n;
        for (int i = tid; i < total; i += bsz) sa[i] = d_adj[i];
        d_adj = sa; // subsequent evaluations read the shared copy
    }
    // Uploads the adjacency matrix to the device; pair with destroy().
    static GraphColorProblem create(const int* h_adj, int n, int k) {
        GraphColorProblem prob;
        prob.n = n; prob.k = k;
        int* da;
        CUDA_CHECK(cudaMalloc(&da, sizeof(int) * n * n));
        CUDA_CHECK(cudaMemcpy(da, h_adj, sizeof(int) * n * n, cudaMemcpyHostToDevice));
        prob.d_adj = da;
        return prob;
    }
    void destroy() {
        if (d_adj) cudaFree(const_cast<int*>(d_adj));
        d_adj = nullptr;
    }
};

View file

@ -0,0 +1,271 @@
/**
* jsp.cuh - 车间调度问题 (Job Shop Scheduling Problem)
*
* J 个工件,每个工件有 O 道工序,每道工序指定机器和耗时。
*
* === 编码方案 AInteger 多行(时间表编码)===
* JSPProblem: data[j][i] = 工件 j 的第 i 道工序的开始时间
* dim1 = num_jobs, dim2_default = num_ops
* row_mode = Fixed禁止 ROW_SPLIT/ROW_MERGE
* 每行代表一个工件的固定工序序列,行长度不可变
*
* === 编码方案 BPermutation 多重集(工序排列编码)===
* JSPPermProblem: data[0][k] = 工件编号0..J-1长度 J*O
* 值 j 出现 O 次。从左到右扫描,第 t 次遇到值 j 表示工件 j 的第 t 道工序。
* dim1 = 1, dim2_default = J*O, perm_repeat_count = O
* 标准 Permutation 算子swap/reverse/insert天然保持多重集结构
*
* 目标Minimize makespan所有工件完成时间的最大值
* 约束:
* (a) 工序顺序:同一工件的工序必须按序执行
* (b) 机器冲突:同一机器同一时刻只能处理一个工序
*
* 验证实例:自定义 3 工件 3 机器 (3x3),最优 makespan = 12
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
// ============================================================
// 编码方案 AInteger 多行(时间表编码)
// ============================================================
struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
    const int* d_machine;    // machine required by each operation [J*O]
    const float* d_duration; // duration of each operation [J*O]
    int num_jobs;     // number of jobs J
    int num_ops;      // operations per job O
    int num_machines; // number of machines M
    int time_horizon; // upper bound on start times
    // Makespan = latest completion time of any job's final operation.
    // (Relies on the precedence penalty to make the last op finish last.)
    __device__ float calc_makespan(const Sol& sol) const {
        float makespan = 0.0f;
        for (int j = 0; j < num_jobs; j++) {
            int last = num_ops - 1;
            float end = (float)sol.data[j][last] + d_duration[j * num_ops + last];
            if (end > makespan) makespan = end;
        }
        return makespan;
    }
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_makespan
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_makespan(sol);
            default: return 0.0f;
        }
    }
    // Penalizes both constraint families by the violated overlap, x10:
    // (a) precedence within a job, (b) machine conflicts across jobs.
    // (b) is an O((J*O)^2) pairwise scan over all operations.
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        // (a) operation-order constraint: op i must start after op i-1 ends
        for (int j = 0; j < num_jobs; j++) {
            for (int i = 1; i < num_ops; i++) {
                float prev_end = (float)sol.data[j][i-1] + d_duration[j * num_ops + (i-1)];
                float curr_start = (float)sol.data[j][i];
                if (curr_start < prev_end)
                    penalty += (prev_end - curr_start) * 10.0f;
            }
        }
        // (b) machine-conflict constraint: ops on the same machine must not overlap
        int total = num_jobs * num_ops;
        for (int a = 0; a < total; a++) {
            int ja = a / num_ops, ia = a % num_ops;
            int m_a = d_machine[a];
            float s_a = (float)sol.data[ja][ia];
            float e_a = s_a + d_duration[a];
            for (int b = a + 1; b < total; b++) {
                if (d_machine[b] != m_a) continue;
                int jb = b / num_ops, ib = b % num_ops;
                float s_b = (float)sol.data[jb][ib];
                float e_b = s_b + d_duration[b];
                float overlap = fminf(e_a, e_b) - fmaxf(s_a, s_b);
                if (overlap > 0.0f)
                    penalty += overlap * 10.0f;
            }
        }
        return penalty;
    }
    // Integer timetable encoding: data[j][i] = start time of job j's op i.
    // Rows are fixed-length (one per job); ROW_SPLIT/ROW_MERGE are disabled.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = num_jobs;
        cfg.dim2_default = num_ops;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = time_horizon - 1;
        cfg.row_mode = RowMode::Fixed;
        fill_obj_config(cfg);
        return cfg;
    }
    // Machine and duration tables are small; always stage both in smem.
    size_t shared_mem_bytes() const {
        int total = num_jobs * num_ops;
        return (size_t)total * (sizeof(int) + sizeof(float));
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int total = num_jobs * num_ops;
        int* sm = reinterpret_cast<int*>(smem);
        for (int i = tid; i < total; i += bsz) sm[i] = d_machine[i];
        d_machine = sm;
        // durations are laid out immediately after the machine table
        float* sd = reinterpret_cast<float*>(sm + total);
        for (int i = tid; i < total; i += bsz) sd[i] = d_duration[i];
        d_duration = sd;
    }
    // Uploads machine/duration tables to the device; pair with destroy().
    static JSPProblem create(const int* h_machine, const float* h_duration,
                             int num_jobs, int num_ops, int num_machines,
                             int time_horizon) {
        JSPProblem prob;
        prob.num_jobs = num_jobs;
        prob.num_ops = num_ops;
        prob.num_machines = num_machines;
        prob.time_horizon = time_horizon;
        int total = num_jobs * num_ops;
        int* dm;
        CUDA_CHECK(cudaMalloc(&dm, sizeof(int) * total));
        CUDA_CHECK(cudaMemcpy(dm, h_machine, sizeof(int) * total, cudaMemcpyHostToDevice));
        prob.d_machine = dm;
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * total));
        CUDA_CHECK(cudaMemcpy(dd, h_duration, sizeof(float) * total, cudaMemcpyHostToDevice));
        prob.d_duration = dd;
        return prob;
    }
    void destroy() {
        if (d_machine) { cudaFree(const_cast<int*>(d_machine)); d_machine = nullptr; }
        if (d_duration) { cudaFree(const_cast<float*>(d_duration)); d_duration = nullptr; }
    }
};
// ============================================================
// 编码方案 BPermutation 多重集(工序排列编码)
// ============================================================
// data[0] 是长度 J*O 的排列,值域 [0, J),每个值出现 O 次
// 从左到右扫描:第 t 次遇到值 j → 安排工件 j 的第 t 道工序
// 贪心解码:每道工序安排在"最早可行时间"(满足工序顺序 + 机器空闲)
struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
    // Capacity of the per-thread decoder state arrays below. D2 = 64 caps
    // the permutation length (J*O) at 64, so with at least one op per job
    // num_jobs <= 64; machines are capped the same way. The previous fixed
    // size of 8 silently overflowed for J > 8 or M > 8.
    static constexpr int kMaxJobs = 64;
    static constexpr int kMaxMachines = 64;
    const int* d_machine;    // machine required by each operation [J*O]
    const float* d_duration; // duration of each operation [J*O]
    int num_jobs;
    int num_ops;
    int num_machines;
    // Greedy decoder: scan the permutation left to right; the t-th
    // occurrence of job id j schedules job j's t-th operation at the
    // earliest feasible time (job predecessor finished AND machine idle).
    // Returns the resulting makespan, or 1e9f for malformed solutions.
    __device__ float decode_and_makespan(const Sol& sol) const {
        int total = num_jobs * num_ops;
        int size = sol.dim2_sizes[0];
        if (size < total) return 1e9f;
        // Guard the fixed-size state arrays (previously unchecked overflow).
        if (num_jobs > kMaxJobs || num_machines > kMaxMachines) return 1e9f;
        float job_avail[kMaxJobs];      // earliest start of each job's next op
        float mach_avail[kMaxMachines]; // earliest idle time per machine
        int job_next_op[kMaxJobs];      // next unscheduled op index per job
        for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; }
        for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f;
        float makespan = 0.0f;
        for (int k = 0; k < total; k++) {
            int j = sol.data[0][k];
            if (j < 0 || j >= num_jobs) return 1e9f;
            int op = job_next_op[j];
            if (op >= num_ops) continue; // job fully scheduled
            int flat = j * num_ops + op;
            int m = d_machine[flat];
            if (m < 0 || m >= num_machines) return 1e9f; // corrupt machine id
            float dur = d_duration[flat];
            // earliest start = max(job predecessor end, machine idle time)
            float start = fmaxf(job_avail[j], mach_avail[m]);
            float end = start + dur;
            job_avail[j] = end;
            mach_avail[m] = end;
            job_next_op[j] = op + 1;
            if (end > makespan) makespan = end;
        }
        return makespan;
    }
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: decode_and_makespan
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return decode_and_makespan(sol);
            default: return 0.0f;
        }
    }
    // The greedy decoder only produces feasible schedules: no penalty needed.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }
    // Multiset permutation encoding: data[0] has length J*O over values
    // [0, J), each job id repeated num_ops times.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = num_jobs * num_ops;
        cfg.perm_repeat_count = num_ops;
        fill_obj_config(cfg);
        return cfg;
    }
    // Machine and duration tables are small; always stage both in smem.
    size_t shared_mem_bytes() const {
        int total = num_jobs * num_ops;
        return (size_t)total * (sizeof(int) + sizeof(float));
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        int total = num_jobs * num_ops;
        int* sm = reinterpret_cast<int*>(smem);
        for (int i = tid; i < total; i += bsz) sm[i] = d_machine[i];
        d_machine = sm;
        // durations are laid out immediately after the machine table
        float* sd = reinterpret_cast<float*>(sm + total);
        for (int i = tid; i < total; i += bsz) sd[i] = d_duration[i];
        d_duration = sd;
    }
    // Uploads machine/duration tables to the device; pair with destroy().
    static JSPPermProblem create(const int* h_machine, const float* h_duration,
                                 int num_jobs, int num_ops, int num_machines) {
        JSPPermProblem prob;
        prob.num_jobs = num_jobs;
        prob.num_ops = num_ops;
        prob.num_machines = num_machines;
        int total = num_jobs * num_ops;
        int* dm;
        CUDA_CHECK(cudaMalloc(&dm, sizeof(int) * total));
        CUDA_CHECK(cudaMemcpy(dm, h_machine, sizeof(int) * total, cudaMemcpyHostToDevice));
        prob.d_machine = dm;
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * total));
        CUDA_CHECK(cudaMemcpy(dd, h_duration, sizeof(float) * total, cudaMemcpyHostToDevice));
        prob.d_duration = dd;
        return prob;
    }
    void destroy() {
        if (d_machine) { cudaFree(const_cast<int*>(d_machine)); d_machine = nullptr; }
        if (d_duration) { cudaFree(const_cast<float*>(d_duration)); d_duration = nullptr; }
    }
};

View file

@ -0,0 +1,88 @@
/**
* knapsack.cuh - 0-1 背包问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include "operators.cuh"
struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
    // Problem data (d_weights holds item weights, not objective weights).
    const float* d_weights;
    const float* d_values;
    float capacity;
    int n;
    // ---- objective: total value of the selected items ----
    __device__ float calc_total_value(const Sol& sol) const {
        const int* picked = sol.data[0];
        const int count = sol.dim2_sizes[0];
        float value_sum = 0.0f;
        for (int item = 0; item < count; item++) {
            if (picked[item])
                value_sum += d_values[item];
        }
        return value_sum;
    }
    // ---- objective registry (entries map 1:1 onto compute_obj cases) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Maximize, 1.0f, 0.0f}, // case 0: calc_total_value
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        if (idx == 0) return calc_total_value(sol); // OBJ_DEFS[0]
        return 0.0f;
    }
    // Penalty: weight in excess of capacity (0 when the selection fits).
    __device__ float compute_penalty(const Sol& sol) const {
        const int* picked = sol.data[0];
        const int count = sol.dim2_sizes[0];
        float weight_sum = 0.0f;
        for (int item = 0; item < count; item++) {
            if (picked[item])
                weight_sum += d_weights[item];
        }
        float excess = weight_sum - capacity;
        return (excess > 0.0f) ? excess : 0.0f;
    }
    // Binary encoding over a single row of length n.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Binary;
        cfg.dim1 = 1;
        cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }
    // ---- shared-memory staging: weights followed by values ----
    size_t shared_mem_bytes() const {
        return 2 * (size_t)n * sizeof(float);
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* staged_w = reinterpret_cast<float*>(smem);
        float* staged_v = staged_w + n;
        for (int i = tid; i < n; i += bsz) {
            staged_w[i] = d_weights[i];
            staged_v[i] = d_values[i];
        }
        d_weights = staged_w;
        d_values = staged_v;
    }
    // Uploads weights and values to the device; pair with destroy().
    static KnapsackProblem create(const float* hw, const float* hv, int n, float cap) {
        KnapsackProblem prob;
        prob.n = n;
        prob.capacity = cap;
        float* dw;
        float* dv;
        CUDA_CHECK(cudaMalloc(&dw, sizeof(float)*n));
        CUDA_CHECK(cudaMalloc(&dv, sizeof(float)*n));
        CUDA_CHECK(cudaMemcpy(dw, hw, sizeof(float)*n, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dv, hv, sizeof(float)*n, cudaMemcpyHostToDevice));
        prob.d_weights = dw;
        prob.d_values = dv;
        return prob;
    }
    void destroy() {
        if (d_weights) cudaFree(const_cast<float*>(d_weights));
        if (d_values) cudaFree(const_cast<float*>(d_values));
        d_weights = nullptr;
        d_values = nullptr;
    }
};

View file

@ -0,0 +1,83 @@
/**
* load_balance.cuh - 离散负载均衡问题Integer 编码验证)
*
* N 个任务分配到 M 台机器,每个任务有一个处理时间 p[i]。
* 决策变量data[0][i] ∈ [0, M-1],表示任务 i 分配到哪台机器。
* 目标:最小化 makespan最大机器负载
*
* 已知 NP-hard等价于 multiprocessor scheduling / load balancing
* LPT最长处理时间优先贪心可得 4/3 近似。
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
    // Capacity of the per-thread load[] accumulator. The "at most 32
    // machines" limit previously existed only as a comment; machine indices
    // are now clamped against it, so m > kMaxMachines can no longer index
    // past the array (previously out-of-bounds). Results are unchanged for
    // all instances with m <= 32.
    static constexpr int kMaxMachines = 32;
    const float* d_proc_time; // task processing times [N]
    int n; // number of tasks
    int m; // number of machines
    // Objective 0: makespan = maximum total load over all machines.
    __device__ float calc_makespan(const Sol& sol) const {
        float load[kMaxMachines] = {};
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int machine = sol.data[0][i];
            if (machine >= 0 && machine < m && machine < kMaxMachines)
                load[machine] += d_proc_time[i];
        }
        float max_load = 0.0f;
        for (int j = 0; j < m && j < kMaxMachines; j++)
            if (load[j] > max_load) max_load = load[j];
        return max_load;
    }
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: makespan
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_makespan(sol);
            default: return 0.0f;
        }
    }
    // Unconstrained: every assignment of tasks to machines is feasible.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }
    // Integer encoding: data[0][i] in [0, m-1] is the machine of task i.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Integer;
        cfg.dim1 = 1; cfg.dim2_default = n;
        cfg.value_lower_bound = 0;
        cfg.value_upper_bound = m - 1;
        fill_obj_config(cfg);
        return cfg;
    }
    // Processing-time vector is small; always stage it in shared memory.
    size_t shared_mem_bytes() const {
        return (size_t)n * sizeof(float);
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sp = reinterpret_cast<float*>(smem);
        for (int i = tid; i < n; i += bsz) sp[i] = d_proc_time[i];
        d_proc_time = sp; // subsequent evaluations read the shared copy
    }
    // Uploads processing times to the device; pair with destroy().
    static LoadBalanceProblem create(const float* h_proc_time, int n, int m) {
        LoadBalanceProblem prob;
        prob.n = n; prob.m = m;
        float* dp;
        CUDA_CHECK(cudaMalloc(&dp, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(dp, h_proc_time, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_proc_time = dp;
        return prob;
    }
    void destroy() {
        if (d_proc_time) cudaFree(const_cast<float*>(d_proc_time));
        d_proc_time = nullptr;
    }
};

View file

@ -0,0 +1,84 @@
/**
* qap.cuh - 二次分配问题 (Quadratic Assignment Problem)
*
* N 个设施分配到 N 个位置(排列编码)。
* 决策变量data[0][i] = 设施 i 分配到的位置。
* 目标Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
*
* 验证实例:自定义 5x5
* flow: 设施间的物流量
* dist: 位置间的距离
* 已知最优 = 58
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
    const float* d_flow; // flow matrix [N*N]
    const float* d_dist; // distance matrix [N*N]
    int n;
    // Objective 0: sum over all pairs of flow[i][j] * dist[perm[i]][perm[j]].
    __device__ float calc_cost(const Sol& sol) const {
        float cost = 0.0f;
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            for (int j = 0; j < size; j++)
                cost += d_flow[i * n + j] * d_dist[sol.data[0][i] * n + sol.data[0][j]];
        return cost;
    }
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_cost
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_cost(sol);
            default: return 0.0f;
        }
    }
    // Unconstrained: every permutation is a feasible assignment.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }
    // Permutation encoding: data[0][i] = location assigned to facility i.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }
    // Stage both matrices in shared memory only when they fit under the
    // 48KB static limit; otherwise return 0 and keep global memory
    // (same policy as AssignmentProblem/TSPProblem — previously the request
    // was uncapped and exceeded the limit for n > 78).
    static constexpr size_t SMEM_LIMIT = 48 * 1024;
    size_t shared_mem_bytes() const {
        size_t need = 2 * (size_t)n * n * sizeof(float);
        return need <= SMEM_LIMIT ? need : 0;
    }
    // Actual hot-data footprint regardless of where the matrices live.
    size_t working_set_bytes() const {
        return 2 * (size_t)n * n * sizeof(float);
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sf = reinterpret_cast<float*>(smem);
        float* sd = sf + n * n; // distances follow flows in the staging area
        int total = n * n;
        for (int i = tid; i < total; i += bsz) { sf[i] = d_flow[i]; sd[i] = d_dist[i]; }
        d_flow = sf;
        d_dist = sd;
    }
    // Uploads flow and distance matrices to the device; pair with destroy().
    static QAPProblem create(const float* h_flow, const float* h_dist, int n) {
        QAPProblem prob;
        prob.n = n;
        float *df, *dd;
        CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_flow = df; prob.d_dist = dd;
        return prob;
    }
    void destroy() {
        if (d_flow) cudaFree(const_cast<float*>(d_flow));
        if (d_dist) cudaFree(const_cast<float*>(d_dist));
        d_flow = nullptr; d_dist = nullptr;
    }
};

View file

@ -0,0 +1,101 @@
/**
* schedule.cuh - 排班问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* 2 个目标总成本min+ 不公平度min权重更高
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include "operators.cuh"
struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
    const float* d_cost; // per (day, employee) shift cost [days*emps]
    int days, emps, required; // required = staffing level wanted per day
    // ---- objective computation ----
    // Objective 0: total cost of all assigned shifts.
    __device__ float calc_total_cost(const Sol& sol) const {
        float total = 0.0f;
        for (int d = 0; d < days; d++)
            for (int e = 0; e < emps; e++)
                if (sol.data[d][e]) total += d_cost[d * emps + e];
        return total;
    }
    // Objective 1: unfairness = spread between the busiest and least busy
    // employee's workday counts.
    // NOTE(review): workdays[] is sized by the base-class column cap D2
    // (=16 here) — assumes emps <= D2; confirm the framework enforces it.
    __device__ float calc_unfairness(const Sol& sol) const {
        int workdays[D2];
        for (int e = 0; e < emps; e++) workdays[e] = 0;
        for (int d = 0; d < days; d++)
            for (int e = 0; e < emps; e++)
                if (sol.data[d][e]) workdays[e]++;
        int max_w = 0, min_w = days;
        for (int e = 0; e < emps; e++) {
            if (workdays[e] > max_w) max_w = workdays[e];
            if (workdays[e] < min_w) min_w = workdays[e];
        }
        return (float)(max_w - min_w);
    }
    // ---- objective registry (OBJ_DEFS and compute_obj must stay 1:1) ----
    // Two objectives: total cost (min) + unfairness (min, weighted 5x).
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
        {ObjDir::Minimize, 5.0f, 0.0f}, // case 1: calc_unfairness
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_cost(sol); // OBJ_DEFS[0]
            case 1: return calc_unfairness(sol); // OBJ_DEFS[1]
            default: return 0.0f;
        }
    }
    // Constraint: each day's headcount should equal `required`; the
    // absolute deviation is penalized linearly.
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        for (int d = 0; d < days; d++) {
            int count = 0;
            for (int e = 0; e < emps; e++)
                if (sol.data[d][e]) count++;
            int diff = count - required;
            penalty += (diff > 0) ? (float)diff : (float)(-diff);
        }
        return penalty;
    }
    // Binary encoding: data[d][e] = 1 iff employee e works on day d.
    // Rows (days) have a fixed length of `emps`.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Binary;
        cfg.dim1 = days; cfg.dim2_default = emps;
        cfg.row_mode = RowMode::Fixed;
        fill_obj_config(cfg);
        return cfg;
    }
    // Default full re-evaluation (base-class behavior) — no evaluate_move override needed.
    // ---- shared memory interface ----
    size_t shared_mem_bytes() const {
        return (size_t)days * emps * sizeof(float);
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sc = reinterpret_cast<float*>(smem);
        int total = days * emps;
        for (int i = tid; i < total; i += bsz) sc[i] = d_cost[i];
        d_cost = sc; // subsequent evaluations read the shared copy
    }
    // Uploads the cost matrix to the device; pair with destroy().
    static ScheduleProblem create(const float* hc, int days, int emps, int req) {
        ScheduleProblem prob;
        prob.days = days; prob.emps = emps; prob.required = req;
        float* dc;
        CUDA_CHECK(cudaMalloc(&dc, sizeof(float)*days*emps));
        CUDA_CHECK(cudaMemcpy(dc, hc, sizeof(float)*days*emps, cudaMemcpyHostToDevice));
        prob.d_cost = dc;
        return prob;
    }
    void destroy() {
        if (d_cost) { cudaFree(const_cast<float*>(d_cost)); d_cost = nullptr; }
    }
};

View file

@ -0,0 +1,110 @@
/**
* tsp.cuh - TSP 问题定义
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include "operators.cuh"
struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
    // Problem data
    const float* d_dist; // device distance matrix [n*n] (rebound to smem by load_shared)
    const float* h_dist; // host-side distance matrix (used by init_relation_matrix)
    int n;
    // ---- objective computation ----
    // Total length of the closed tour in row 0 (wraps from last city to first).
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        const int* route = sol.data[0];
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            total += d_dist[route[i] * n + route[(i + 1) % size]];
        return total;
    }
    // ---- objective registry (OBJ_DEFS and compute_obj must stay 1:1) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol); // OBJ_DEFS[0]
            default: return 0.0f;
        }
    }
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f; // TSP is unconstrained
    }
    // ---- config (encoding/dimensions; objectives filled by the base class) ----
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }
    // ---- shared memory interface ----
    // Stage the distance matrix in smem only when it fits under 48KB;
    // otherwise return 0 and keep reading from global memory.
    static constexpr size_t SMEM_LIMIT = 48 * 1024;
    size_t shared_mem_bytes() const {
        size_t need = (size_t)n * n * sizeof(float);
        return need <= SMEM_LIMIT ? need : 0;
    }
    // Actual hot-data footprint regardless of where the matrix lives.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sd = reinterpret_cast<float*>(smem);
        int total = n * n;
        for (int i = tid; i < total; i += bsz)
            sd[i] = d_dist[i];
        d_dist = sd; // subsequent evaluations read the shared copy
    }
    // Distance prior: closer city pairs get higher G/O scores.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        float max_d = 0.0f;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                if (h_dist[i * N + j] > max_d) max_d = h_dist[i * N + j];
        if (max_d <= 0.0f) return; // degenerate all-zero matrix: no prior
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                float proximity = 1.0f - h_dist[i * N + j] / max_d;
                G[i * N + j] = proximity * 0.3f;
                O[i * N + j] = proximity * 0.1f;
            }
    }
    // Exposes the host distance matrix for heuristic initial-solution construction.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (max_count < 1 || !h_dist) return 0;
        out[0] = {h_dist, n};
        return 1;
    }
    // Uploads the distance matrix to the device; pair with destroy().
    static TSPProblem create(const float* h_dist_ptr, int n) {
        TSPProblem prob;
        prob.n = n;
        prob.h_dist = h_dist_ptr;
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_dist = dd;
        return prob;
    }
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        h_dist = nullptr;
    }
};

View file

@ -0,0 +1,107 @@
/**
* tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市)
*
* 继承 ProblemBase逻辑与 tsp.cuh 一致,仅 D2 上限不同
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include "operators.cuh"
struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
    const float* d_dist; // device (or shared) distance matrix [n*n]
    const float* h_dist; // host-side copy kept for relation/heuristic priors
    int n;
    // ---- objective: closed-tour length under the permutation in row 0 ----
    __device__ float calc_total_distance(const Sol& sol) const {
        const int* tour = sol.data[0];
        const int len = sol.dim2_sizes[0];
        float sum = 0.0f;
        for (int k = 0; k < len; k++) {
            int from = tour[k];
            int to = tour[(k + 1) % len]; // wraps back to the starting city
            sum += d_dist[from * n + to];
        }
        return sum;
    }
    // ---- objective registry (entries map 1:1 onto compute_obj cases) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        if (idx == 0) return calc_total_distance(sol); // OBJ_DEFS[0]
        return 0.0f;
    }
    // TSP is unconstrained: every permutation is feasible.
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1;
        cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }
    static constexpr size_t SMEM_LIMIT = 48 * 1024;
    // Stage the matrix in shared memory only when it fits under the 48KB cap.
    size_t shared_mem_bytes() const {
        size_t need = (size_t)n * n * sizeof(float);
        if (need > SMEM_LIMIT) return 0;
        return need;
    }
    // Actual size of the distance matrix, whether or not it fits in smem.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* staged = reinterpret_cast<float*>(smem);
        const int cells = n * n;
        for (int i = tid; i < cells; i += bsz)
            staged[i] = d_dist[i];
        d_dist = staged; // evaluations now read from shared memory
    }
    // Distance prior for GUIDED_REBUILD: closer city pairs score higher.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        float max_d = 0.0f;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                float d = h_dist[i * N + j];
                if (d > max_d) max_d = d;
            }
        if (max_d <= 0.0f) return;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                float proximity = 1.0f - h_dist[i * N + j] / max_d;
                G[i * N + j] = proximity * 0.3f;
                O[i * N + j] = proximity * 0.1f;
            }
    }
    // Exposes the host distance matrix for heuristic initial solutions.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (!h_dist || max_count < 1) return 0;
        out[0] = {h_dist, n};
        return 1;
    }
    static TSPLargeProblem create(const float* h_dist_ptr, int n) {
        TSPLargeProblem prob;
        prob.n = n;
        prob.h_dist = h_dist_ptr;
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_dist = dd;
        return prob;
    }
    void destroy() {
        if (d_dist) {
            cudaFree(const_cast<float*>(d_dist));
            d_dist = nullptr;
        }
        h_dist = nullptr;
    }
};

View file

@ -0,0 +1,99 @@
/**
* tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市)
*
* 继承 ProblemBase逻辑与 tsp_large.cuh 一致D2=512
* 注意:距离矩阵 512×512×4B = 1MB远超 48KB shared memory
* 因此 shared_mem_bytes() 返回 0距离矩阵留在 global memory
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include "operators.cuh"
struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
    const float* d_dist;
    const float* h_dist; // host-side distance matrix (used by init_relation_matrix)
    int n;
    // Total length of the closed tour in row 0 (wraps from last city to first).
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        const int* route = sol.data[0];
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++)
            total += d_dist[route[i] * n + route[(i + 1) % size]];
        return total;
    }
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol);
            default: return 0.0f;
        }
    }
    // TSP is unconstrained.
    __device__ float compute_penalty(const Sol& sol) const { return 0.0f; }
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = 1; cfg.dim2_default = n;
        fill_obj_config(cfg);
        return cfg;
    }
    // The distance matrix is too large for shared memory
    // (512x512x4B = 1MB >> 48KB), so it always stays in global memory.
    size_t shared_mem_bytes() const { return 0; }
    __device__ void load_shared(char*, int, int) {}
    // Report the real hot-data size so auto pop_size can model L2 pressure.
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }
    // Seed the G/O prior from the distance matrix: closer pairs score higher.
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        // Find the maximum distance for normalization.
        float max_d = 0.0f;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                if (h_dist[i * N + j] > max_d) max_d = h_dist[i * N + j];
        if (max_d <= 0.0f) return;
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
                // Close pair -> high G (strong grouping tendency).
                float proximity = 1.0f - h_dist[i * N + j] / max_d;
                G[i * N + j] = proximity * 0.3f; // keep the initial signal weak; leave room for the EMA
                // Close pair -> small O signal too (symmetric, direction-neutral).
                O[i * N + j] = proximity * 0.1f;
            }
        }
    }
    // Exposes the host distance matrix for heuristic initial-solution construction.
    int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
        if (max_count < 1 || !h_dist) return 0;
        out[0] = {h_dist, n};
        return 1;
    }
    // Uploads the distance matrix to the device; pair with destroy().
    static TSPXLargeProblem create(const float* h_dist_ptr, int n) {
        TSPXLargeProblem prob;
        prob.n = n;
        prob.h_dist = h_dist_ptr; // keep the host pointer for the priors above
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_dist = dd;
        return prob;
    }
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        h_dist = nullptr;
    }
};

View file

@ -0,0 +1,184 @@
/**
* vrp.cuh - 容量约束车辆路径问题 (CVRP)
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* 多行编码D1=K 条路线,分区初始化 + 跨行算子)
*/
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
#include "operators.cuh"
#include "gpu_cache.cuh"
struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
// GPU 数据
const float* d_dist;
const float* d_demand;
const float* h_dist; // host 端距离矩阵(含 depot用于 init_relation_matrix
int n;
int stride;
float capacity;
int num_vehicles;
int max_vehicles;
GpuCache cache;
// ---- 目标计算 ----
    // Length of one vehicle route: depot -> customers -> depot.
    // Route entries are 0-based customer ids; node 0 in d_dist is the depot,
    // hence the +1 offset when indexing the (stride x stride) matrix.
    __device__ float compute_route_dist(const int* route, int size) const {
        if (size == 0) return 0.0f; // empty route costs nothing
        float dist = 0.0f;
        int prev = 0; // start at the depot
        for (int j = 0; j < size; j++) {
            int node = route[j] + 1; // customer id -> distance-matrix node
            dist += d_dist[prev * stride + node];
            prev = node;
        }
        dist += d_dist[prev * stride + 0]; // return leg to the depot
        return dist;
    }
    // Route distance with a GPU-side memo cache keyed on a hash of the route.
    // Falls back to a direct computation when the cache is not allocated.
    // Hit/miss counters are updated atomically for print_cache_stats().
    __device__ float eval_route(const int* route, int size) const {
        if (size == 0) return 0.0f;
        if (!cache.keys) return compute_route_dist(route, size); // cache disabled
        uint64_t key = route_hash(route, size);
        float dist;
        if (cache_lookup(cache, key, dist)) {
            atomicAdd(cache.d_hits, 1);
            return dist;
        }
        dist = compute_route_dist(route, size);
        cache_insert(cache, key, dist);
        atomicAdd(cache.d_misses, 1);
        return dist;
    }
__device__ float calc_total_distance(const Sol& sol) const {
float total = 0.0f;
for (int r = 0; r < num_vehicles; r++)
total += eval_route(sol.data[r], sol.dim2_sizes[r]);
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
__device__ float compute_obj(int idx, const Sol& sol) const {
switch (idx) {
case 0: return calc_total_distance(sol); // OBJ_DEFS[0]
default: return 0.0f;
}
}
__device__ float compute_penalty(const Sol& sol) const {
float penalty = 0.0f;
int active = 0;
for (int r = 0; r < num_vehicles; r++) {
int size = sol.dim2_sizes[r];
if (size == 0) continue;
active++;
float load = 0.0f;
for (int j = 0; j < size; j++)
load += d_demand[sol.data[r][j]];
if (load > capacity)
penalty += (load - capacity) * 100.0f;
}
if (active > max_vehicles)
penalty += (float)(active - max_vehicles) * 1000.0f;
return penalty;
}
ProblemConfig config() const {
ProblemConfig cfg;
cfg.encoding = EncodingType::Permutation;
cfg.dim1 = num_vehicles;
cfg.dim2_default = 0;
fill_obj_config(cfg);
cfg.cross_row_prob = 0.3f;
cfg.row_mode = RowMode::Partition;
cfg.total_elements = n;
return cfg;
}
// ---- shared memory 接口 ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
size_t dist_bytes = (size_t)stride * stride * sizeof(float);
size_t demand_bytes = (size_t)n * sizeof(float);
size_t total = dist_bytes + demand_bytes;
return total <= SMEM_LIMIT ? total : 0;
}
size_t working_set_bytes() const {
return (size_t)stride * stride * sizeof(float) + (size_t)n * sizeof(float);
}
__device__ void load_shared(char* smem, int tid, int bsz) {
float* sd = reinterpret_cast<float*>(smem);
int dist_size = stride * stride;
for (int i = tid; i < dist_size; i += bsz) sd[i] = d_dist[i];
d_dist = sd;
float* sdem = sd + dist_size;
for (int i = tid; i < n; i += bsz) sdem[i] = d_demand[i];
d_demand = sdem;
}
void enable_cache(int cap = 65536) { cache = GpuCache::allocate(cap); }
void print_cache_stats() const { cache.print_stats(); }
// 距离先验:客户间距离近 → G/O 分数高
// 注意h_dist 含 depotstride×stride元素编号 0..n-1 对应 node 1..n
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
float max_d = 0.0f;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++) {
float d = h_dist[(i + 1) * stride + (j + 1)]; // 跳过 depot
if (d > max_d) max_d = d;
}
if (max_d <= 0.0f) return;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++) {
if (i == j) continue;
float d = h_dist[(i + 1) * stride + (j + 1)];
float proximity = 1.0f - d / max_d;
G[i * N + j] = proximity * 0.3f;
O[i * N + j] = proximity * 0.1f;
}
}
static VRPProblem create(const float* h_dist_ptr, const float* h_demand,
int n, float capacity,
int num_vehicles, int max_vehicles) {
VRPProblem prob;
prob.n = n;
prob.stride = n + 1;
prob.capacity = capacity;
prob.num_vehicles = num_vehicles;
prob.max_vehicles = max_vehicles;
prob.cache = GpuCache::disabled();
prob.h_dist = h_dist_ptr;
int n_nodes = n + 1;
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes));
CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n_nodes * n_nodes, cudaMemcpyHostToDevice));
prob.d_dist = dd;
float* ddem;
CUDA_CHECK(cudaMalloc(&ddem, sizeof(float) * n));
CUDA_CHECK(cudaMemcpy(ddem, h_demand, sizeof(float) * n, cudaMemcpyHostToDevice));
prob.d_demand = ddem;
return prob;
}
void destroy() {
if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
if (d_demand) { cudaFree(const_cast<float*>(d_demand)); d_demand = nullptr; }
h_dist = nullptr;
cache.destroy();
}
};

View file

@ -0,0 +1,192 @@
/**
 * vrptw.cuh - Vehicle Routing Problem with Time Windows (VRPTW)
 *
 * Adds time-window constraints on top of CVRP.
 * Encoding: multi-row partitioned permutation (same as CVRP); data[r][j] = j-th customer of route r.
 * Objective: minimize total distance.
 * Constraints: (a) capacity, (b) time windows (arrival must be <= latest; early arrivals wait).
 *
 * Validation instance: 8 customers, 3 vehicles, hand-crafted coordinates and
 * time windows with a known feasible solution.
 */
#pragma once
#include "types.cuh"
#include "cuda_utils.cuh"
/**
 * VRPTW problem definition: CVRP plus time-window penalties.
 * Same encoding as VRPProblem; customer id c maps to matrix node c+1.
 */
struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
    const float* d_dist;     // distance matrix [(n+1)*(n+1)] (incl. depot)
    const float* d_demand;   // demand [n]
    const float* d_earliest; // earliest service time [n+1] (incl. depot)
    const float* d_latest;   // latest service time [n+1] (incl. depot)
    const float* d_service;  // service duration [n+1] (incl. depot)
    int n;                   // number of customers (depot excluded)
    int stride;              // n+1
    float capacity;
    int num_vehicles;
    int max_vehicles;
    // Distance of one route: depot -> customers -> depot.
    __device__ float compute_route_dist(const int* route, int size) const {
        if (size == 0) return 0.0f;
        float dist = 0.0f;
        int prev = 0;
        for (int j = 0; j < size; j++) {
            int node = route[j] + 1; // customer id -> matrix node (skip depot)
            dist += d_dist[prev * stride + node];
            prev = node;
        }
        dist += d_dist[prev * stride + 0]; // return leg to depot
        return dist;
    }
    // Sum of all route distances (objective 0).
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        for (int r = 0; r < num_vehicles; r++)
            total += compute_route_dist(sol.data[r], sol.dim2_sizes[r]);
        return total;
    }
    // Objective table; must correspond 1:1 with compute_obj's switch cases.
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},
    };
    __device__ float compute_obj(int idx, const Sol& sol) const {
        switch (idx) {
            case 0: return calc_total_distance(sol);
            default: return 0.0f;
        }
    }
    // Soft-constraint penalty: capacity overage (x100), time-window lateness
    // (x50 per time unit, customers and depot return), fleet overage (x1000).
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        int active = 0;
        for (int r = 0; r < num_vehicles; r++) {
            int size = sol.dim2_sizes[r];
            if (size == 0) continue;
            active++;
            // capacity constraint
            float load = 0.0f;
            for (int j = 0; j < size; j++)
                load += d_demand[sol.data[r][j]];
            if (load > capacity)
                penalty += (load - capacity) * 100.0f;
            // time-window constraint: simulate driving the route
            float time = 0.0f;
            int prev = 0;
            for (int j = 0; j < size; j++) {
                int node = sol.data[r][j] + 1;
                float travel = d_dist[prev * stride + node];
                time += travel;
                // arriving early means waiting
                if (time < d_earliest[node])
                    time = d_earliest[node];
                // arriving late incurs a penalty
                if (time > d_latest[node])
                    penalty += (time - d_latest[node]) * 50.0f;
                time += d_service[node];
                prev = node;
            }
            // time window for returning to the depot
            float return_time = time + d_dist[prev * stride + 0];
            if (return_time > d_latest[0])
                penalty += (return_time - d_latest[0]) * 50.0f;
        }
        if (active > max_vehicles)
            penalty += (float)(active - max_vehicles) * 1000.0f;
        return penalty;
    }
    // Solver-facing configuration (same partitioned layout as VRPProblem).
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
        cfg.dim1 = num_vehicles;
        cfg.dim2_default = 0;
        fill_obj_config(cfg);
        cfg.cross_row_prob = 0.3f;
        cfg.row_mode = RowMode::Partition;
        cfg.total_elements = n;
        return cfg;
    }
    static constexpr size_t SMEM_LIMIT = 48 * 1024;
    // Bytes needed to stage all five arrays in shared memory; 0 when over
    // the 48 KB budget. The aux estimate (n+1)*4 slightly over-counts
    // (demand only needs n entries), which is safe.
    size_t shared_mem_bytes() const {
        size_t dist_bytes = (size_t)stride * stride * sizeof(float);
        size_t aux_bytes = (size_t)(n + 1) * 4 * sizeof(float); // demand(n) + earliest/latest/service(n+1 each)
        size_t total = dist_bytes + aux_bytes;
        return total <= SMEM_LIMIT ? total : 0;
    }
    size_t working_set_bytes() const {
        return (size_t)stride * stride * sizeof(float) + (size_t)(n + 1) * 4 * sizeof(float);
    }
    // Cooperatively copy all arrays into shared memory, repointing the
    // device pointers. NOTE(review): assumes a __syncthreads() by the
    // framework after load_shared — confirm before relying on it.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        float* sd = reinterpret_cast<float*>(smem);
        int dist_size = stride * stride;
        for (int i = tid; i < dist_size; i += bsz) sd[i] = d_dist[i];
        d_dist = sd;
        float* sdem = sd + dist_size;
        for (int i = tid; i < n; i += bsz) sdem[i] = d_demand[i];
        d_demand = sdem;
        float* se = sdem + n;
        int nn = n + 1;
        for (int i = tid; i < nn; i += bsz) se[i] = d_earliest[i];
        d_earliest = se;
        float* sl = se + nn;
        for (int i = tid; i < nn; i += bsz) sl[i] = d_latest[i];
        d_latest = sl;
        float* ss = sl + nn;
        for (int i = tid; i < nn; i += bsz) ss[i] = d_service[i];
        d_service = ss;
    }
    // Upload all host arrays to the GPU and populate an instance.
    static VRPTWProblem create(const float* h_dist, const float* h_demand,
                               const float* h_earliest, const float* h_latest,
                               const float* h_service,
                               int n, float capacity,
                               int num_vehicles, int max_vehicles) {
        VRPTWProblem prob;
        prob.n = n;
        prob.stride = n + 1;
        prob.capacity = capacity;
        prob.num_vehicles = num_vehicles;
        prob.max_vehicles = max_vehicles;
        int nn = n + 1;
        float *dd, *ddem, *de, *dl, *ds;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * nn * nn));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * nn * nn, cudaMemcpyHostToDevice));
        prob.d_dist = dd;
        CUDA_CHECK(cudaMalloc(&ddem, sizeof(float) * n));
        CUDA_CHECK(cudaMemcpy(ddem, h_demand, sizeof(float) * n, cudaMemcpyHostToDevice));
        prob.d_demand = ddem;
        CUDA_CHECK(cudaMalloc(&de, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(de, h_earliest, sizeof(float) * nn, cudaMemcpyHostToDevice));
        prob.d_earliest = de;
        CUDA_CHECK(cudaMalloc(&dl, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(dl, h_latest, sizeof(float) * nn, cudaMemcpyHostToDevice));
        prob.d_latest = dl;
        CUDA_CHECK(cudaMalloc(&ds, sizeof(float) * nn));
        CUDA_CHECK(cudaMemcpy(ds, h_service, sizeof(float) * nn, cudaMemcpyHostToDevice));
        prob.d_service = ds;
        return prob;
    }
    // Free every device buffer.
    void destroy() {
        if (d_dist) { cudaFree(const_cast<float*>(d_dist)); d_dist = nullptr; }
        if (d_demand) { cudaFree(const_cast<float*>(d_demand)); d_demand = nullptr; }
        if (d_earliest) { cudaFree(const_cast<float*>(d_earliest)); d_earliest = nullptr; }
        if (d_latest) { cudaFree(const_cast<float*>(d_latest)); d_latest = nullptr; }
        if (d_service) { cudaFree(const_cast<float*>(d_service)); d_service = nullptr; }
    }
};

599
python/cugenopt/jit.py Normal file
View file

@ -0,0 +1,599 @@
"""
JIT compiler for custom cuGenOpt problems.
Workflow:
1. User provides CUDA code snippets (compute_obj, compute_penalty) + data arrays
2. Python fills the .cu template with user code
3. nvcc compiles to executable (cached by content hash)
4. subprocess runs executable, parses JSON output
"""
import hashlib
import json
import os
import shutil
import struct
import subprocess
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
from cugenopt.validation import (
CuGenOptCompileError,
CuGenOptValidationError,
validate_cuda_snippet,
validate_data_dict,
validate_encoding,
validate_positive_int,
)
_TEMPLATE_PATH = Path(__file__).parent / "jit_template.cu"
_CACHE_DIR = Path.home() / ".cache" / "cugenopt" / "jit"
def _detect_framework_root() -> Path:
    """Locate the cuGenOpt framework root (directory containing core/types.cuh).

    Candidates are probed in order:
      1. Headers bundled in the installed wheel (cugenopt/include/...)
      2. The development source tree (python/../prototype)
      3. The CUGENOPT_ROOT environment variable

    Raises:
        RuntimeError: if no candidate contains core/types.cuh.
    """
    pkg_dir = Path(__file__).parent  # cugenopt/
    # Assemble candidates in priority order: bundled, dev tree, env var.
    candidates = [pkg_dir / "include"]
    dev_root = pkg_dir.parent  # python/
    candidates += [(dev_root / rel).resolve() for rel in ("../prototype", "../../prototype")]
    env_root = os.environ.get("CUGENOPT_ROOT")
    if env_root:
        candidates.append(Path(env_root) / "prototype")
    for root in candidates:
        if (root / "core" / "types.cuh").exists():
            return root
    raise RuntimeError(
        "Cannot find cuGenOpt framework headers. "
        "Set CUGENOPT_ROOT env var to the generic_solver directory, "
        "or reinstall: pip install cugenopt"
    )
def _detect_cuda_arch() -> str:
    """Return the GPU arch flag (e.g. "sm_75") by querying nvidia-smi.

    Falls back to "sm_75" when nvidia-smi is missing or its output is
    unparsable.
    """
    query = ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"]
    try:
        raw = subprocess.check_output(query, stderr=subprocess.DEVNULL, text=True)
        first_line = raw.strip().split("\n")[0].strip()
        major, minor = first_line.split(".")
        return f"sm_{major}{minor}"
    except Exception:
        return "sm_75"
def _is_windows() -> bool:
    """True when running on native Windows (os.name == "nt")."""
    return os.name == "nt"
def _nvcc_name() -> str:
return "nvcc.exe" if _is_windows() else "nvcc"
def _find_nvcc() -> str:
"""Find nvcc binary. Search order: PATH → pip-installed → common system paths."""
# 1. System PATH
nvcc = shutil.which(_nvcc_name())
if nvcc:
return nvcc
# 2. pip-installed nvidia-cuda-nvcc / nvidia-cuda-nvcc-cu12
import sys
import site
pip_search_dirs = [sys.prefix, *site.getsitepackages(), site.getusersitepackages()]
nvcc_bin = _nvcc_name()
for base in pip_search_dirs:
pip_subpaths = [
os.path.join("nvidia", "cuda_nvcc", "bin", nvcc_bin),
os.path.join("nvidia", "cu13", "bin", nvcc_bin),
os.path.join("nvidia", "cu12", "bin", nvcc_bin),
os.path.join("bin", nvcc_bin),
]
if _is_windows():
pip_subpaths += [
os.path.join("Scripts", nvcc_bin),
os.path.join("Library", "bin", nvcc_bin),
]
for subpath in pip_subpaths:
candidate = os.path.join(base, subpath)
if os.path.isfile(candidate):
return candidate
# 3. Common system paths
if _is_windows():
cuda_base = os.path.join(os.environ.get("CUDA_PATH", ""),
"bin", "nvcc.exe")
if os.path.isfile(cuda_base):
return cuda_base
for ver in ["12.4", "12.6", "12.0", "11.8"]:
candidate = os.path.join(
"C:\\", "Program Files", "NVIDIA GPU Computing Toolkit",
"CUDA", f"v{ver}", "bin", "nvcc.exe")
if os.path.isfile(candidate):
return candidate
else:
for candidate in [
"/usr/local/cuda/bin/nvcc",
"/usr/local/cuda-12.4/bin/nvcc",
"/usr/local/cuda-12.6/bin/nvcc",
"/usr/local/cuda-12.0/bin/nvcc",
"/usr/local/cuda-11.8/bin/nvcc",
]:
if os.path.isfile(candidate):
return candidate
raise RuntimeError(
"nvcc not found. Install the CUDA compiler:\n"
" pip install nvidia-cuda-nvcc-cu12\n"
"Or install CUDA Toolkit: https://developer.nvidia.com/cuda-downloads"
)
def _content_hash(source: str) -> str:
    """Return the 16-hex-char SHA-256 prefix used as the JIT cache key."""
    digest = hashlib.sha256(source.encode()).hexdigest()
    return digest[:16]
def _fallback_compile_without_bad_ops(
    custom_operators: list,
    encoding: str,
    template: str,
    replacements: dict,
    base_cmd: list,
    fw_root: Path,
    original_stderr: str,
) -> tuple:
    """Retry compilation with every custom operator stripped out.

    When compilation fails and custom operators were supplied, assume the
    operators are the cause: rebuild the source with all custom-operator
    placeholders emptied and compile that baseline instead. If the baseline
    compiles, warn which operators were disabled and solve with built-in
    operators only. (The docstring previously claimed operators are re-added
    one by one; the implementation disables them all at once.)

    `encoding` and `fw_root` are currently unused but kept for interface
    stability with the call site.

    Returns:
        (source, exe_path, cu_path) of the successful baseline compilation.

    Raises:
        CuGenOptCompileError: if even the baseline fails — with the ORIGINAL
            stderr, since the failure was then not caused by the operators.
    """
    import warnings

    # Rebuild the template with all custom-operator hooks emptied.
    baseline_replacements = dict(replacements)
    baseline_replacements["{{CUSTOM_OP_DEFINES}}"] = ""
    baseline_replacements["{{CUSTOM_OP_SWITCH}}"] = ""
    baseline_replacements["{{CUSTOM_OP_REGISTRY}}"] = ""
    baseline_source = template
    for key, val in baseline_replacements.items():
        baseline_source = baseline_source.replace(key, val)

    h = _content_hash(baseline_source)
    cu_path = _CACHE_DIR / f"custom_{h}.cu"
    exe_suffix = ".exe" if _is_windows() else ""
    exe_path = _CACHE_DIR / f"custom_{h}{exe_suffix}"
    if not exe_path.exists():
        cu_path.write_text(baseline_source)
        cmd = base_cmd + [str(cu_path), "-o", str(exe_path)]
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode != 0:
            # Baseline failed too: report the original error, not the baseline's.
            raise CuGenOptCompileError(original_stderr, str(cu_path))

    disabled_names = [op.name for op in custom_operators]
    warnings.warn(
        f"Custom operator(s) caused compilation failure. "
        f"Disabled: {', '.join(disabled_names)}. "
        f"Solving with built-in operators only.\n"
        f"Fix your operator code and retry. "
        f"Original error:\n{original_stderr[:500]}",
        RuntimeWarning,
        stacklevel=4,
    )
    return baseline_source, exe_path, cu_path
def _build_custom_op_defines(switch_block: str, registry_block: str) -> str:
    """Return the #define that enables custom-operator hooks in the headers.

    The two arguments are accepted for symmetry with the other template
    builders; the define itself is unconditional.
    """
    return "#define CUGENOPT_HAS_CUSTOM_OPS"
def _write_binary_float(path: str, arr: np.ndarray):
arr = np.ascontiguousarray(arr, dtype=np.float32)
with open(path, "wb") as f:
f.write(arr.tobytes())
def _write_binary_int(path: str, arr: np.ndarray):
arr = np.ascontiguousarray(arr, dtype=np.int32)
with open(path, "wb") as f:
f.write(arr.tobytes())
class DataArray:
    """A named data array destined for upload into the JIT-compiled problem.

    Args:
        name: C identifier used for the generated struct field.
        data: The numpy payload.
        dtype: Element type tag — "float" or "int".
    """

    def __init__(self, name: str, data: np.ndarray, dtype: str = "float"):
        self.name = name
        self.data = data
        self.dtype = dtype  # "float" or "int"
        self.size = data.size  # total element count
def _build_encoding_str(encoding: str) -> str:
    """Map a user-facing encoding name to the C++ EncodingType enum literal."""
    key = encoding.lower()
    if key == "binary":
        return "EncodingType::Binary"
    if key == "integer":
        return "EncodingType::Integer"
    # "permutation" and anything unrecognized both fall back to Permutation.
    return "EncodingType::Permutation"
def _build_row_mode_str(row_mode: str) -> str:
    """Map a user-facing row-mode name to the C++ RowMode enum literal."""
    key = row_mode.lower()
    if key == "fixed":
        return "RowMode::Fixed"
    if key == "partition":
        return "RowMode::Partition"
    # "single" and anything unrecognized both fall back to Single.
    return "RowMode::Single"
def _build_obj_dir_str(direction: str) -> str:
    """Map "max*" (case-insensitive) to Maximize; everything else to Minimize."""
    if direction.lower().startswith("max"):
        return "ObjDir::Maximize"
    return "ObjDir::Minimize"
def compile_and_solve(
    compute_obj: str,
    compute_penalty: str = "return 0.0f;",
    data: Optional[Dict[str, np.ndarray]] = None,
    int_data: Optional[Dict[str, np.ndarray]] = None,
    encoding: str = "permutation",
    dim1: int = 1,
    dim2: int = 64,
    n: Optional[int] = None,
    row_mode: str = "single",
    total_elements: int = 0,
    cross_row_prob: float = 0.0,
    perm_repeat_count: int = 1,
    value_lower: int = 0,
    value_upper: int = 1,
    objectives: Optional[List[Tuple[str, float]]] = None,
    shared_mem: Optional[str] = None,
    load_shared: Optional[str] = None,
    pop_size: int = 0,
    max_gen: int = 1000,
    time_limit: float = 0.0,
    seed: int = 42,
    use_aos: bool = False,
    sa_temp_init: float = 0.0,
    verbose: bool = False,
    framework_root: Optional[str] = None,
    cuda_arch: Optional[str] = None,
    custom_operators: Optional[List] = None,
) -> Dict[str, Any]:
    """
    JIT-compile and solve a custom optimization problem.
    Args:
        compute_obj: CUDA code for the compute_obj function body.
            Available variables: idx (objective index), sol (const Sol&),
            and any data fields you declared.
        compute_penalty: CUDA code for compute_penalty body.
        data: Dict of name -> numpy float32 array for problem data.
        int_data: Dict of name -> numpy int32 array for problem data.
        encoding: "permutation", "binary", or "integer".
        dim1: Number of rows in solution (1 for most problems).
        dim2: Max columns per row.
        n: Problem size (number of elements). If None, inferred from data.
        row_mode: "single", "fixed", or "partition".
        total_elements: For partition mode, total elements across all rows.
        cross_row_prob: Probability of cross-row operations.
        perm_repeat_count: For multiset permutation (JSP-style).
        value_lower, value_upper: Bounds for integer encoding.
        objectives: List of (direction, weight) tuples. Default: [("minimize", 1.0)].
        shared_mem: Expression for shared_mem_bytes() return value.
            Use '_n' for problem size. E.g. "(size_t)_n * _n * sizeof(float)"
        load_shared: CUDA code for load_shared body.
        pop_size, max_gen, time_limit, seed, use_aos, sa_temp_init, verbose:
            Solver configuration parameters.
        framework_root: Path to cuGenOpt framework. Auto-detected if None.
        cuda_arch: CUDA architecture (e.g. "sm_75"). Auto-detected if None.
    Returns:
        Dict with keys: objective, penalty, solution, elapsed_ms, generations,
        stop_reason, objectives.
    Example:
        >>> result = compile_and_solve(
        ...     compute_obj='''
        ...         if (idx != 0) return 0.0f;
        ...         float total = 0.0f;
        ...         const int* route = sol.data[0];
        ...         int size = sol.dim2_sizes[0];
        ...         for (int i = 0; i < size; i++)
        ...             total += d_dist[route[i] * _n + route[(i+1) % size]];
        ...         return total;
        ...     ''',
        ...     data={"d_dist": dist_matrix},
        ...     encoding="permutation", dim2=64, n=20,
        ...     time_limit=5.0,
        ... )
    """
    # Normalize the mutable/optional defaults (avoids shared mutable defaults).
    if data is None:
        data = {}
    if int_data is None:
        int_data = {}
    if objectives is None:
        objectives = [("minimize", 1.0)]
    # --- Input validation ---
    compute_obj = validate_cuda_snippet(compute_obj, "compute_obj")
    # The default penalty body is a trusted constant; only validate user-supplied code.
    if compute_penalty != "return 0.0f;":
        compute_penalty = validate_cuda_snippet(compute_penalty, "compute_penalty")
    encoding = validate_encoding(encoding)
    dim1 = validate_positive_int(dim1, "dim1")
    dim2 = validate_positive_int(dim2, "dim2")
    max_gen = validate_positive_int(max_gen, "max_gen")
    if data:
        data = validate_data_dict(data, "float")
    if int_data:
        int_data = validate_data_dict(int_data, "int")
    # Infer n from data if not provided: first dimension of the first
    # 1-D or 2-D float array found; fall back to dim2.
    if n is None:
        for arr in data.values():
            if arr.ndim == 2:
                n = arr.shape[0]
                break
            elif arr.ndim == 1:
                n = arr.shape[0]
                break
    if n is None:
        n = dim2
    n = validate_positive_int(n, "n")
    # Framework root
    fw_root = Path(framework_root) if framework_root else _detect_framework_root()
    # Read template
    template = _TEMPLATE_PATH.read_text()
    # Build data fields: one `const float*`/`const int*` struct member per array.
    all_data = []
    data_fields_lines = []
    for name, arr in data.items():
        all_data.append(DataArray(name, arr, "float"))
        data_fields_lines.append(f"    const float* {name};")
    for name, arr in int_data.items():
        all_data.append(DataArray(name, arr, "int"))
        data_fields_lines.append(f"    const int* {name};")
    data_fields = "\n".join(data_fields_lines) if data_fields_lines else "    // no data fields"
    # Build OBJ_DEFS initializer entries, one per (direction, weight) objective.
    obj_defs_parts = []
    for direction, weight in objectives:
        obj_defs_parts.append(f"{{{_build_obj_dir_str(direction)}, {weight}f, 0.0f}}")
    obj_defs = ", ".join(obj_defs_parts)
    # Build shared memory expression: default sums all array sizes and
    # returns 0 when over the 48 KB static shared-memory budget.
    if shared_mem is None:
        total_bytes_parts = []
        for da in all_data:
            elem_size = "sizeof(float)" if da.dtype == "float" else "sizeof(int)"
            total_bytes_parts.append(f"(size_t){da.size} * {elem_size}")
        if total_bytes_parts:
            total_expr = " + ".join(total_bytes_parts)
            shared_mem_expr = f"size_t need = {total_expr};\n    return (need <= 48 * 1024) ? need : 0;"
        else:
            shared_mem_expr = "return 0;"
    else:
        shared_mem_expr = f"size_t need = {shared_mem};\n    return (need <= 48 * 1024) ? need : 0;"
    # Build load_shared: default packs arrays back-to-back in shared memory,
    # copies each cooperatively, then repoints the struct member at the copy.
    if load_shared is None:
        load_lines = []
        offset = "smem"
        for i, da in enumerate(all_data):
            ctype = "float" if da.dtype == "float" else "int"
            ptr_name = f"s_{da.name}"
            if i == 0:
                load_lines.append(f"    {ctype}* {ptr_name} = reinterpret_cast<{ctype}*>(smem);")
            else:
                prev = all_data[i - 1]
                prev_ptr = f"s_{prev.name}"
                load_lines.append(f"    {ctype}* {ptr_name} = reinterpret_cast<{ctype}*>({prev_ptr} + {prev.size});")
            load_lines.append(f"    for (int i = tid; i < {da.size}; i += bsz) {ptr_name}[i] = {da.name}[i];")
            load_lines.append(f"    {da.name} = {ptr_name};")
        load_shared_body = "\n".join(load_lines) if load_lines else "    // no data to load"
    else:
        load_shared_body = load_shared
    # Build destroy body: cudaFree every uploaded array.
    destroy_lines = []
    for da in all_data:
        ctype = "float" if da.dtype == "float" else "int"
        destroy_lines.append(f"    if ({da.name}) cudaFree(const_cast<{ctype}*>({da.name}));")
    destroy_body = "\n".join(destroy_lines) if destroy_lines else "    // nothing to free"
    # Build data load body (main function: read binary files, cudaMalloc, cudaMemcpy)
    data_load_lines = []
    for da in all_data:
        if da.dtype == "float":
            data_load_lines.append(f'    snprintf(path, sizeof(path), "%s/{da.name}.bin", data_dir);')
            data_load_lines.append(f'    float* h_{da.name} = read_binary_floats(path, {da.size});')
            data_load_lines.append(f'    float* d_{da.name}; CUDA_CHECK(cudaMalloc(&d_{da.name}, sizeof(float) * {da.size}));')
            data_load_lines.append(f'    CUDA_CHECK(cudaMemcpy(d_{da.name}, h_{da.name}, sizeof(float) * {da.size}, cudaMemcpyHostToDevice));')
            data_load_lines.append(f'    prob.{da.name} = d_{da.name};')
            data_load_lines.append(f'    delete[] h_{da.name};')
        else:
            data_load_lines.append(f'    snprintf(path, sizeof(path), "%s/{da.name}.bin", data_dir);')
            data_load_lines.append(f'    int* h_{da.name} = read_binary_ints(path, {da.size});')
            data_load_lines.append(f'    int* d_{da.name}; CUDA_CHECK(cudaMalloc(&d_{da.name}, sizeof(int) * {da.size}));')
            data_load_lines.append(f'    CUDA_CHECK(cudaMemcpy(d_{da.name}, h_{da.name}, sizeof(int) * {da.size}, cudaMemcpyHostToDevice));')
            data_load_lines.append(f'    prob.{da.name} = d_{da.name};')
            data_load_lines.append(f'    delete[] h_{da.name};')
    data_load_body = "\n".join(data_load_lines) if data_load_lines else "    // no data to load"
    # Build solver config assignments; optional knobs only emitted when set.
    config_lines = [
        f"    cfg.pop_size = {pop_size};",
        f"    cfg.max_gen = {max_gen};",
        f"    cfg.seed = {seed};",
        f"    cfg.verbose = {'true' if verbose else 'false'};",
        f"    cfg.use_aos = {'true' if use_aos else 'false'};",
    ]
    if time_limit > 0:
        config_lines.append(f"    cfg.time_limit_sec = {time_limit}f;")
    if sa_temp_init > 0:
        config_lines.append(f"    cfg.sa_temp_init = {sa_temp_init}f;")
    solver_config = "\n".join(config_lines)
    # In single-row mode every solution row holds all n elements.
    dim2_default = n if row_mode.lower() == "single" else 0
    if total_elements == 0 and row_mode.lower() == "partition":
        total_elements = n
    # Process custom operators: generate the switch/registry blocks and the
    # enabling #define only when at least one operator survives filtering.
    custom_op_defines_block = ""
    custom_op_switch = ""
    custom_op_registry = ""
    if custom_operators:
        from cugenopt.operators import generate_custom_operator_cuda
        switch_block, registry_block, filtered = generate_custom_operator_cuda(
            custom_operators, encoding
        )
        if filtered:
            custom_op_switch = switch_block
            custom_op_registry = registry_block
            custom_op_defines_block = _build_custom_op_defines(switch_block, registry_block)
    # Fill template: plain string replacement of each {{PLACEHOLDER}} token.
    source = template
    replacements = {
        "{{D1}}": str(dim1),
        "{{D2}}": str(dim2),
        "{{DATA_FIELDS}}": data_fields,
        "{{OBJ_DEFS}}": obj_defs,
        "{{COMPUTE_OBJ}}": compute_obj,
        "{{COMPUTE_PENALTY}}": compute_penalty,
        "{{ENCODING}}": _build_encoding_str(encoding),
        "{{DIM1}}": str(dim1),
        "{{DIM2_DEFAULT}}": str(dim2_default),
        "{{ROW_MODE}}": _build_row_mode_str(row_mode),
        "{{TOTAL_ELEMENTS}}": str(total_elements),
        "{{CROSS_ROW_PROB}}": f"{cross_row_prob}f",
        "{{PERM_REPEAT_COUNT}}": str(perm_repeat_count),
        "{{VALUE_LOWER}}": str(value_lower),
        "{{VALUE_UPPER}}": str(value_upper),
        "{{SHARED_MEM_EXPR}}": shared_mem_expr,
        "{{LOAD_SHARED_BODY}}": load_shared_body,
        "{{DESTROY_BODY}}": destroy_body,
        "{{DATA_LOAD_BODY}}": data_load_body,
        "{{SOLVER_CONFIG}}": solver_config,
        "{{NUM_OBJ}}": str(len(objectives)),
        "{{CUSTOM_OP_DEFINES}}": custom_op_defines_block,
        "{{CUSTOM_OP_SWITCH}}": custom_op_switch,
        "{{CUSTOM_OP_REGISTRY}}": custom_op_registry,
    }
    for key, val in replacements.items():
        source = source.replace(key, val)
    # Hash for caching: identical generated source reuses the cached binary.
    h = _content_hash(source)
    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
    cu_path = _CACHE_DIR / f"custom_{h}.cu"
    exe_suffix = ".exe" if _is_windows() else ""
    exe_path = _CACHE_DIR / f"custom_{h}{exe_suffix}"
    # Compile if needed
    if not exe_path.exists():
        cu_path.write_text(source)
        if cuda_arch is None:
            cuda_arch = _detect_cuda_arch()
        nvcc = _find_nvcc()
        cmd = [
            nvcc, "-O2", "-std=c++17", "--extended-lambda", "--expt-relaxed-constexpr",
            f"-arch={cuda_arch}",
            f"-I{fw_root}",
            f"-I{fw_root / 'core'}",
            f"-I{fw_root / 'problems'}",
            str(cu_path), "-o", str(exe_path),
        ]
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode != 0:
            if custom_operators and len(custom_operators) > 0:
                # cmd[:-3] strips [cu_path, "-o", exe_path] so the fallback
                # can append its own baseline source/output paths.
                source, exe_path, cu_path = _fallback_compile_without_bad_ops(
                    custom_operators, encoding, template, replacements,
                    cmd[:-3], fw_root, proc.stderr,
                )
            else:
                raise CuGenOptCompileError(proc.stderr, str(cu_path))
    # Write data to temp dir
    with tempfile.TemporaryDirectory(prefix="cugenopt_") as tmpdir:
        # Write n
        with open(os.path.join(tmpdir, "n.bin"), "wb") as f:
            f.write(struct.pack("i", n))
        # Write data arrays
        for da in all_data:
            bin_path = os.path.join(tmpdir, f"{da.name}.bin")
            if da.dtype == "float":
                _write_binary_float(bin_path, da.data)
            else:
                _write_binary_int(bin_path, da.data)
        # Run the compiled solver; timeout generously exceeds the solver's
        # own time limit so slow JIT warm-up doesn't get killed.
        proc = subprocess.run(
            [str(exe_path), tmpdir],
            capture_output=True, text=True, timeout=max(300, time_limit * 3 + 60)
        )
        if proc.returncode != 0:
            raise RuntimeError(f"Execution failed:\n{proc.stderr}")
    # Parse JSON output (find the last line that starts with '{')
    output_lines = proc.stdout.strip().split("\n")
    json_line = None
    for line in reversed(output_lines):
        line = line.strip()
        if line.startswith("{"):
            json_line = line
            break
    if json_line is None:
        raise RuntimeError(
            f"No JSON output found.\nstdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
        )
    result = json.loads(json_line)
    # Convert solution lists to numpy arrays
    if "solution" in result:
        result["solution"] = [np.array(row, dtype=np.int32) for row in result["solution"]]
    return result
def clear_cache():
    """Delete every cached JIT build (generated sources and executables)."""
    cache = _CACHE_DIR
    if cache.exists():
        shutil.rmtree(cache)
    print(f"Cleared JIT cache: {cache}")

View file

@ -0,0 +1,204 @@
/**
* JIT template — auto-generated by cugenopt.solve_custom()
*
* Placeholders (replaced by Python at runtime):
* {{D1}}, {{D2}} — Solution dimensions
* {{ENCODING}} — EncodingType enum value
* {{ROW_MODE}} — RowMode enum value
* {{DIM1}}, {{DIM2_DEFAULT}}, {{TOTAL_ELEMENTS}}
* {{VALUE_LOWER}}, {{VALUE_UPPER}}
* {{CROSS_ROW_PROB}}, {{PERM_REPEAT_COUNT}}
* {{OBJ_DEFS}} — ObjDef array initializer
* {{DATA_FIELDS}} — struct field declarations
* {{COMPUTE_OBJ}} — user's compute_obj body
* {{COMPUTE_PENALTY}} — user's compute_penalty body
* {{SHARED_MEM_EXPR}} — shared_mem_bytes() return expression
* {{LOAD_SHARED_BODY}} — load_shared() body
* {{DESTROY_BODY}} — destroy() body
* {{DATA_LOAD_BODY}} — main() data loading code
* {{SOLVER_CONFIG}} — SolverConfig field assignments
* {{NUM_OBJ}} — number of objectives
* {{CUSTOM_OP_DEFINES}} — #define CUGENOPT_HAS_CUSTOM_OPS (or empty)
* {{CUSTOM_OP_SWITCH}} — switch cases for execute_custom_op (or empty)
* {{CUSTOM_OP_REGISTRY}} — add() calls for register_custom_operators (or empty)
*/
{{CUSTOM_OP_DEFINES}}
#include "types.cuh"
#include "cuda_utils.cuh"
struct CustomProblem;
#ifdef CUGENOPT_HAS_CUSTOM_OPS
template<typename Sol>
__device__ inline bool execute_custom_op(int seq_id, Sol& sol, int dim1,
EncodingType encoding, curandState* rng,
int val_lb, int val_ub,
const void* prob_data);
#endif
#include "solver.cuh"
#include <cstdio>
#include <cstdlib>
#include <cstring>
// Problem type assembled from user-supplied CUDA snippets; every {{...}}
// placeholder below is filled in by cugenopt.jit.compile_and_solve().
struct CustomProblem : ProblemBase<CustomProblem, {{D1}}, {{D2}}> {
    // Device data pointers declared by the user (uploaded in main()).
{{DATA_FIELDS}}
    int _n;  // problem size, read from n.bin
    // Objective directions/weights; must correspond 1:1 with compute_obj.
    static constexpr ObjDef OBJ_DEFS[] = {
        {{OBJ_DEFS}}
    };
    // User objective body: `idx` selects the objective, `sol` is the candidate.
    __device__ float compute_obj(int idx, const Sol& sol) const {
        {{COMPUTE_OBJ}}
    }
    // User constraint-violation penalty body (default "return 0.0f;").
    __device__ float compute_penalty(const Sol& sol) const {
        {{COMPUTE_PENALTY}}
    }
    // Encoding/layout configuration generated from the Python arguments.
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = {{ENCODING}};
        cfg.dim1 = {{DIM1}};
        cfg.dim2_default = {{DIM2_DEFAULT}};
        cfg.row_mode = {{ROW_MODE}};
        cfg.total_elements = {{TOTAL_ELEMENTS}};
        cfg.cross_row_prob = {{CROSS_ROW_PROB}};
        cfg.perm_repeat_count = {{PERM_REPEAT_COUNT}};
        cfg.value_lower_bound = {{VALUE_LOWER}};
        cfg.value_upper_bound = {{VALUE_UPPER}};
        fill_obj_config(cfg);
        return cfg;
    }
    // Bytes of shared memory requested (0 = stay in global memory).
    size_t shared_mem_bytes() const {
        {{SHARED_MEM_EXPR}}
    }
    // Cooperative copy of device data into shared memory.
    __device__ void load_shared(char* smem, int tid, int bsz) {
        {{LOAD_SHARED_BODY}}
    }
    // Free the device buffers allocated in main().
    void destroy() {
        {{DESTROY_BODY}}
    }
};
#ifdef CUGENOPT_HAS_CUSTOM_OPS
// Dispatch a user-supplied operator by sequence id; returns false for
// unknown ids so the framework falls through to its default handling.
// `prob_data` is the CustomProblem instance passed as void* through the
// framework hook and downcast here.
template<typename Sol>
__device__ inline bool execute_custom_op(int seq_id, Sol& sol, int dim1,
                                         EncodingType encoding, curandState* rng,
                                         int val_lb, int val_ub,
                                         const void* prob_data) {
    const CustomProblem* prob = static_cast<const CustomProblem*>(prob_data);
    switch (seq_id) {
{{CUSTOM_OP_SWITCH}}
        default:
            return false;
    }
}
#endif
#ifdef CUGENOPT_HAS_CUSTOM_OPS
// Register user operators into the sequence registry, then normalize the
// weights of ALL registered entries so they sum to 1.
inline void register_custom_operators(SeqRegistry& reg) {
    auto add = [&](int id, float w, float cap) {
        if (reg.count >= MAX_SEQ) return;  // registry full: silently drop
        reg.ids[reg.count] = id;
        reg.weights[reg.count] = w;
        reg.max_w[reg.count] = cap;
        reg.count++;
    };
{{CUSTOM_OP_REGISTRY}}
    float sum = 0.0f;
    for (int i = 0; i < reg.count; i++) sum += reg.weights[i];
    if (sum > 0.0f) {
        for (int i = 0; i < reg.count; i++) reg.weights[i] /= sum;
    }
}
#endif
// Read exactly `count` float32 values from a raw binary file into a
// heap-allocated buffer (caller owns, delete[]). Exits the process with a
// diagnostic on open failure or short read.
static float* read_binary_floats(const char* path, int count) {
    FILE* f = fopen(path, "rb");
    if (f == nullptr) {
        fprintf(stderr, "Cannot open %s\n", path);
        exit(1);
    }
    float* buf = new float[count];
    size_t got = fread(buf, sizeof(float), count, f);
    fclose(f);
    if ((int)got != count) {
        fprintf(stderr, "Short read %s: %d/%d\n", path, (int)got, count);
        exit(1);
    }
    return buf;
}
// Read exactly `count` int32 values from a raw binary file into a
// heap-allocated buffer (caller owns, delete[]). Exits the process with a
// diagnostic on open failure or short read.
static int* read_binary_ints(const char* path, int count) {
    FILE* f = fopen(path, "rb");
    if (f == nullptr) {
        fprintf(stderr, "Cannot open %s\n", path);
        exit(1);
    }
    int* buf = new int[count];
    size_t got = fread(buf, sizeof(int), count, f);
    fclose(f);
    if ((int)got != count) {
        fprintf(stderr, "Short read %s: %d/%d\n", path, (int)got, count);
        exit(1);
    }
    return buf;
}
// Entry point: argv[1] is a directory containing n.bin plus one .bin file
// per declared data array. Loads the problem, runs the solver, and prints a
// single-line JSON result on stdout (the Python wrapper parses the last
// line starting with '{').
int main(int argc, char** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <data_dir>\n", argv[0]);
        return 1;
    }
    const char* data_dir = argv[1];
    char path[512];
    // Read problem size. Fail loudly on a missing/truncated n.bin instead
    // of dereferencing a NULL FILE* or using an uninitialized n.
    snprintf(path, sizeof(path), "%s/n.bin", data_dir);
    int n = 0;
    {
        FILE* f = fopen(path, "rb");
        if (!f) { fprintf(stderr, "Cannot open %s\n", path); return 1; }
        size_t got = fread(&n, sizeof(int), 1, f);
        fclose(f);
        if (got != 1) { fprintf(stderr, "Short read %s\n", path); return 1; }
    }
    // Read data arrays and upload to GPU
    CustomProblem prob;
    prob._n = n;
{{DATA_LOAD_BODY}}
    // Configure solver
    SolverConfig cfg;
{{SOLVER_CONFIG}}
    // Solve
#ifdef CUGENOPT_HAS_CUSTOM_OPS
    auto result = solve(prob, cfg, nullptr, 0, register_custom_operators);
#else
    auto result = solve(prob, cfg);
#endif
    // Output JSON result
    auto& best = result.best_solution;
    printf("{\"objective\":%.10g", best.objectives[0]);
    printf(",\"penalty\":%.10g", best.penalty);
    printf(",\"elapsed_ms\":%.4f", result.elapsed_ms);
    printf(",\"generations\":%d", result.generations);
    printf(",\"stop_reason\":\"%s\"",
        result.stop_reason == StopReason::TimeLimit ? "time_limit" :
        result.stop_reason == StopReason::Stagnation ? "stagnation" : "max_gen");
    printf(",\"objectives\":[");
    for (int i = 0; i < {{NUM_OBJ}}; i++) {
        if (i > 0) printf(",");
        printf("%.10g", best.objectives[i]);
    }
    printf("]");
    printf(",\"solution\":[");
    for (int r = 0; r < {{DIM1}}; r++) {
        if (r > 0) printf(",");
        printf("[");
        int len = best.dim2_sizes[r];
        for (int c = 0; c < len; c++) {
            if (c > 0) printf(",");
            printf("%d", best.data[r][c]);
        }
        printf("]");
    }
    printf("]}\n");
    prob.destroy();
    return 0;
}

View file

@ -0,0 +1,19 @@
"""
Built-in operator packs for standard problems.
Each pack provides problem-aware custom operators that can significantly
improve solution quality for specific problem types. Users load them like:
from cugenopt.operator_packs import tsp_ops
result = cugenopt.solve_custom(..., custom_operators=tsp_ops)
Or with built-in solvers:
result = cugenopt.solve_tsp(dist, custom_operators=tsp_ops)
"""
from cugenopt.operator_packs.tsp import tsp_ops
from cugenopt.operator_packs.knapsack import knapsack_ops
from cugenopt.operator_packs.graph_color import graph_color_ops
__all__ = ["tsp_ops", "knapsack_ops", "graph_color_ops"]

View file

@ -0,0 +1,92 @@
"""
Graph Coloring specific custom operators.
These operators exploit graph structure (adjacency awareness) to
reduce conflicts more effectively than random integer mutations.
"""
from cugenopt.operators import CustomOperator
_GC_CONFLICT_RECOLOR = CustomOperator(
name="gc_conflict_recolor",
code=r"""
int row = 0;
int sz = sol.dim2_sizes[row];
if (sz < 2) return false;
int n = prob->_n;
const int* adj = prob->d_adj;
int node = rand_int(rng, sz);
int my_color = sol.data[row][node];
bool has_conflict = false;
for (int j = 0; j < sz; j++) {
if (j != node && adj[node * n + j] && sol.data[row][j] == my_color) {
has_conflict = true;
break;
}
}
if (!has_conflict) return false;
int range = val_ub - val_lb + 1;
int best_color = my_color;
int best_conflicts = sz;
for (int t = 0; t < 5; t++) {
int c = val_lb + rand_int(rng, range);
if (c == my_color) continue;
int conflicts = 0;
for (int j = 0; j < sz; j++)
if (j != node && adj[node * n + j] && sol.data[row][j] == c)
conflicts++;
if (conflicts < best_conflicts) {
best_conflicts = conflicts;
best_color = c;
}
}
if (best_color == my_color) return false;
sol.data[row][node] = best_color;
return true;
""",
encoding="integer",
initial_weight=1.5,
)
# Kempe-chain style swap: pick a node (color c1) and a second color c2, then
# BFS through the subgraph induced by colors {c1, c2}, swapping the two
# colors along the chain. Fixed-size device buffers truncate the walk at
# 64 queued nodes / 16 swaps, and nodes at index >= 512 are never visited.
_GC_KEMPE_CHAIN = CustomOperator(
    name="gc_kempe_swap",
    code=r"""
int row = 0;
int sz = sol.dim2_sizes[row];
if (sz < 3) return false;
int n = prob->_n;
const int* adj = prob->d_adj;
int range = val_ub - val_lb + 1;
// Fix: with a single-color palette there is no second color to swap to
// (the original drew rand_int(rng, 0) in that case).
if (range < 2) return false;
int node = rand_int(rng, sz);
// Fix: visited[] holds 512 entries; starting from a node beyond that wrote
// out of bounds when sz > 512.
if (node >= 512) return false;
int c1 = sol.data[row][node];
int c2 = val_lb + rand_int(rng, range - 1);
if (c2 >= c1) c2++;
int queue[64];
bool visited[512];
for (int i = 0; i < sz && i < 512; i++) visited[i] = false;
int head = 0, tail = 0;
queue[tail++] = node;
visited[node] = true;
int chain_len = 0;
while (head < tail && tail < 64 && chain_len < 16) {
    int v = queue[head++];
    int vc = sol.data[row][v];
    sol.data[row][v] = (vc == c1) ? c2 : c1;
    chain_len++;
    for (int j = 0; j < sz; j++) {
        // Fix: the j < 512 bounds check must precede the visited[j] read
        // (the original read visited[j] first, out of bounds for j >= 512).
        if (j < 512 && !visited[j] && adj[v * n + j]
            && (sol.data[row][j] == c1 || sol.data[row][j] == c2)
            && tail < 64) {
            visited[j] = true;
            queue[tail++] = j;
        }
    }
}
return chain_len > 0;
""",
    encoding="integer",  # active only for integer-encoded problems
    initial_weight=1.0,
)

# Operators exported by this pack.
graph_color_ops = [_GC_CONFLICT_RECOLOR, _GC_KEMPE_CHAIN]

View file

@ -0,0 +1,71 @@
"""
Knapsack-specific custom operators.
These operators exploit knapsack structure (value/weight ratio awareness)
to make more informed bit-flip decisions.
"""
from cugenopt.operators import CustomOperator
_KNAPSACK_GREEDY_FLIP = CustomOperator(
name="knapsack_greedy_flip",
code=r"""
int row = 0;
int sz = sol.dim2_sizes[row];
if (sz < 2) return false;
const float* weights = prob->d_weights;
const float* values = prob->d_values;
int pos = rand_int(rng, sz);
if (sol.data[row][pos] == 0) {
float ratio = (weights[pos] > 0.001f)
? values[pos] / weights[pos] : 0.0f;
if (ratio > 0.5f || curand_uniform(rng) < 0.3f) {
sol.data[row][pos] = 1;
return true;
}
} else {
float ratio = (weights[pos] > 0.001f)
? values[pos] / weights[pos] : 1e6f;
if (ratio < 0.5f || curand_uniform(rng) < 0.3f) {
sol.data[row][pos] = 0;
return true;
}
}
return false;
""",
encoding="binary",
initial_weight=0.8,
)
_KNAPSACK_SWAP_RATIO = CustomOperator(
name="knapsack_swap_ratio",
code=r"""
int row = 0;
int sz = sol.dim2_sizes[row];
if (sz < 2) return false;
const float* weights = prob->d_weights;
const float* values = prob->d_values;
int in_item = -1, out_item = -1;
for (int t = 0; t < 8; t++) {
int p = rand_int(rng, sz);
if (sol.data[row][p] == 1 && in_item < 0) in_item = p;
if (sol.data[row][p] == 0 && out_item < 0) out_item = p;
if (in_item >= 0 && out_item >= 0) break;
}
if (in_item < 0 || out_item < 0) return false;
float in_ratio = (weights[in_item] > 0.001f)
? values[in_item] / weights[in_item] : 1e6f;
float out_ratio = (weights[out_item] > 0.001f)
? values[out_item] / weights[out_item] : 0.0f;
if (out_ratio > in_ratio || curand_uniform(rng) < 0.2f) {
sol.data[row][in_item] = 0;
sol.data[row][out_item] = 1;
return true;
}
return false;
""",
encoding="binary",
initial_weight=0.8,
)
knapsack_ops = [_KNAPSACK_GREEDY_FLIP, _KNAPSACK_SWAP_RATIO]

View file

@ -0,0 +1,108 @@
"""
TSP-specific custom operators.
These operators exploit TSP structure (distance matrix, route continuity)
to perform more effective local search than generic permutation operators.
"""
from cugenopt.operators import CustomOperator
# Delta-evaluated 2-opt: pick two cut points, compare the cost of the two
# edges removed against the two edges created, and reverse the intermediate
# segment only when the tour strictly improves.
_TSP_2OPT_DELTA = CustomOperator(
    name="tsp_2opt_delta",
    code=r"""
int row = 0;
int sz = sol.dim2_sizes[row];
if (sz < 4) return false;
int n = prob->_n;
const float* dist = prob->d_dist;
int i = rand_int(rng, sz);
int j = rand_int(rng, sz - 2);
if (j >= i) j += 2; else if (j == i - 1) j = (i + sz - 1) % sz;
if (i > j) { int t = i; i = j; j = t; }
if (j - i < 2 || j - i >= sz - 1) return false;
int a = sol.data[row][i], b = sol.data[row][(i+1) % sz];
int c = sol.data[row][j], d = sol.data[row][(j+1) % sz];
float old_cost = dist[a * n + b] + dist[c * n + d];
float new_cost = dist[a * n + c] + dist[b * n + d];
if (new_cost >= old_cost) return false;
ops::perm_reverse(sol.data[row], i + 1, j);
return true;
""",
    encoding="permutation",  # route encodings only
    initial_weight=1.5,      # relative AOS weight before normalization
)
_TSP_OR_OPT_DELTA = CustomOperator(
name="tsp_or_opt_delta",
code=r"""
int row = 0;
int sz = sol.dim2_sizes[row];
if (sz < 5) return false;
int n = prob->_n;
const float* dist = prob->d_dist;
int seg_len = 1 + rand_int(rng, 3);
if (seg_len >= sz - 1) return false;
int from_pos = rand_int(rng, sz - seg_len);
int prev = sol.data[row][(from_pos + sz - 1) % sz];
int seg_start = sol.data[row][from_pos];
int seg_end = sol.data[row][from_pos + seg_len - 1];
int next = sol.data[row][(from_pos + seg_len) % sz];
float remove_cost = dist[prev * n + seg_start]
+ dist[seg_end * n + next]
- dist[prev * n + next];
int best_to = -1;
float best_delta = 0.0f;
for (int t = 0; t < 8; t++) {
int to = rand_int(rng, sz - seg_len);
if (to >= from_pos && to < from_pos + seg_len) continue;
int tp = sol.data[row][to];
int tn = sol.data[row][(to + 1) % sz];
float insert_cost = dist[tp * n + seg_start]
+ dist[seg_end * n + tn]
- dist[tp * n + tn];
float delta = insert_cost - remove_cost;
if (delta < best_delta) { best_delta = delta; best_to = to; }
}
if (best_to < 0) return false;
ops::perm_or_opt(sol.data[row], sz, from_pos, best_to, seg_len);
return true;
""",
encoding="permutation",
initial_weight=1.0,
)
_TSP_NODE_INSERT_DELTA = CustomOperator(
name="tsp_node_insert_delta",
code=r"""
int row = 0;
int sz = sol.dim2_sizes[row];
if (sz < 4) return false;
int n = prob->_n;
const float* dist = prob->d_dist;
int from_pos = rand_int(rng, sz);
int node = sol.data[row][from_pos];
int prev = sol.data[row][(from_pos + sz - 1) % sz];
int next = sol.data[row][(from_pos + 1) % sz];
float remove_save = dist[prev * n + node] + dist[node * n + next]
- dist[prev * n + next];
int best_to = -1;
float best_delta = 0.0f;
for (int t = 0; t < 8; t++) {
int to = rand_int(rng, sz - 1);
if (to >= from_pos) to++;
int tp = sol.data[row][to];
int tn = sol.data[row][(to + 1) % sz];
float insert_cost = dist[tp * n + node] + dist[node * n + tn]
- dist[tp * n + tn];
float delta = insert_cost - remove_save;
if (delta < best_delta) { best_delta = delta; best_to = to; }
}
if (best_to < 0) return false;
ops::perm_insert(sol.data[row], from_pos, best_to, sz);
return true;
""",
encoding="permutation",
initial_weight=1.0,
)
tsp_ops = [_TSP_2OPT_DELTA, _TSP_OR_OPT_DELTA, _TSP_NODE_INSERT_DELTA]

View file

@ -0,0 +1,136 @@
"""
Custom operator registration for cuGenOpt.
Allows users to inject problem-specific CUDA search operators into the
JIT-compiled solver. Custom operators participate in AOS weight competition
alongside built-in operators.
Two entry points (same underlying mechanism):
1. Python users: pass custom_operators=[CustomOperator(...)] to solve_custom()
2. CUDA developers: write the same code body in .cuh and call register_custom_operators()
Operator contract:
The code body has access to:
- sol: Solution reference (sol.data[row][col], sol.dim2_sizes[row])
- rng: curandState* for random number generation
- dim1, encoding, val_lb, val_ub: problem parameters
- prob: const CustomProblem* (access data via prob->d_dist, prob->_n, etc.)
Must return bool (true if solution was modified, false for no-op).
Available primitives: ops::perm_swap, ops::perm_reverse, ops::perm_insert,
ops::bin_flip, ops::rand_int, etc.
"""
from dataclasses import dataclass, field
from typing import List, Optional
from cugenopt.validation import validate_cuda_snippet, CuGenOptValidationError
@dataclass
class CustomOperator:
    """A user-defined search operator.

    Args:
        name: Human-readable name (used in logs and AOS stats).
        code: CUDA code body that modifies `sol` in-place.
            Available variables:
            - sol: Solution reference (sol.data[row][col], sol.dim2_sizes[row])
            - rng: curandState* for random number generation
            - dim1: number of active rows
            - prob: const CustomProblem* access problem data via prob->field
              (e.g. prob->d_dist, prob->_n, prob->d_weights)
            Must return bool (true if sol was modified).
        encoding: Which encoding this operator targets ("permutation", "binary",
            "integer", or "any"). Operators are only active when the problem
            encoding matches.
        initial_weight: Starting AOS weight (relative, will be normalized).
            Higher = more likely to be sampled initially.
        weight_cap: Maximum AOS weight after normalization (0 = use global cap).
    """
    name: str
    code: str
    encoding: str = "any"
    initial_weight: float = 0.5
    weight_cap: float = 0.0

    def __post_init__(self) -> None:
        # Fail fast at construction time so bad operators never reach nvcc.
        if not self.name or not self.name.strip():
            raise CuGenOptValidationError("CustomOperator name cannot be empty")
        # Strips whitespace and rejects dangerous host calls (system/exec/...).
        self.code = validate_cuda_snippet(self.code, f"operator '{self.name}'")
        valid_enc = {"permutation", "binary", "integer", "any"}
        # Matching is case-insensitive, but the original casing is preserved.
        if self.encoding.lower() not in valid_enc:
            raise CuGenOptValidationError(
                f"CustomOperator encoding must be one of {valid_enc}, "
                f"got '{self.encoding}'"
            )
        if self.initial_weight <= 0:
            raise CuGenOptValidationError(
                f"CustomOperator initial_weight must be > 0, got {self.initial_weight}"
            )
# SeqID range for custom operators: 100..123 (MAX_SEQ=24 custom slots).
# NOTE(review): ids below this base are presumably reserved for built-in
# operators in the CUDA registry — confirm against the solver template.
CUSTOM_SEQ_ID_BASE = 100
def generate_custom_operator_cuda(
    operators: "List[CustomOperator]",
    problem_encoding: str,
) -> tuple:
    """Generate CUDA code to inject custom operators into execute_custom_op.

    Args:
        operators: User-supplied operators (may be empty or None).
        problem_encoding: Encoding of the problem being compiled.

    Returns:
        (switch_block, registry_block, filtered_operators):
            - switch_block: CUDA switch cases for {{CUSTOM_OP_SWITCH}}
            - registry_block: add() calls for {{CUSTOM_OP_REGISTRY}}
            - filtered_operators: list of operators that matched the encoding
        Empty strings / empty list if no operators match.
    """
    # Bug fix: the early-exit paths returned "" for the third element,
    # contradicting the documented contract of an operator *list*.
    if not operators:
        return "", "", []
    filtered = _filter_by_encoding(operators, problem_encoding)
    if not filtered:
        return "", "", []
    switch_cases = []
    registry_adds = []
    for i, op in enumerate(filtered):
        # Custom operators occupy the dedicated seq-id range 100..123.
        seq_id = CUSTOM_SEQ_ID_BASE + i
        switch_cases.append(_generate_switch_case(seq_id, op))
        registry_adds.append(
            f"    add({seq_id}, {op.initial_weight}f, {op.weight_cap}f); "
            f"// custom: {op.name}"
        )
    return "\n".join(switch_cases), "\n".join(registry_adds), filtered
def _filter_by_encoding(
    operators: List[CustomOperator],
    problem_encoding: str,
) -> List[CustomOperator]:
    """Keep only operators whose target encoding matches the problem's."""
    target = problem_encoding.lower()
    matches = []
    for op in operators:
        # "any" operators run regardless of the problem encoding.
        if op.encoding.lower() in ("any", target):
            matches.append(op)
    return matches
def _generate_switch_case(seq_id: int, op: CustomOperator) -> str:
    """Generate a single switch case for execute_sequence.

    The operator body is wrapped in its own braces so its locals stay scoped
    to this case; per the operator contract the body returns true/false, so
    no break/fallthrough handling is needed.
    """
    return f"""\
        case {seq_id}: {{ // custom: {op.name}
{_indent(op.code, 12)}
        }}"""
def _indent(text: str, spaces: int) -> str:
"""Indent each line of text."""
prefix = " " * spaces
lines = text.strip().split("\n")
return "\n".join(prefix + line for line in lines)

View file

@ -0,0 +1,261 @@
"""
Input validation and friendly error translation for cuGenOpt.
Two responsibilities:
1. Validate numpy arrays before JIT compilation (dtype, shape, NaN/Inf, contiguity)
2. Translate nvcc compilation errors into actionable Python messages
"""
import re
from typing import Dict, Optional, Sequence
import numpy as np
class CuGenOptValidationError(ValueError):
    """Raised when input data fails validation (dtype, shape, NaN/Inf, etc.)."""
    pass
class CuGenOptCompileError(RuntimeError):
    """Raised when nvcc compilation fails, with a friendly summary."""

    def __init__(self, raw_stderr: str, source_path: str):
        # Keep the raw output and source path around for programmatic access.
        self.raw_stderr = raw_stderr
        self.source_path = source_path
        self.friendly = _translate_nvcc_error(raw_stderr)
        message = "\n\n".join([
            self.friendly,
            f"[raw nvcc output]\n{_truncate(raw_stderr, 1200)}",
            f"Source saved at: {source_path}",
        ])
        super().__init__(message)
# ============================================================
# Array validation
# ============================================================
def validate_array(
    arr: np.ndarray,
    name: str,
    *,
    expected_dtype: Optional[np.dtype] = None,
    expected_ndim: Optional[int] = None,
    expected_shape: Optional[tuple] = None,
    min_size: int = 1,
    allow_nan: bool = False,
    allow_inf: bool = False,
) -> np.ndarray:
    """Validate a single numpy array and return a contiguous copy if needed.

    Args:
        arr: Array to validate.
        name: Label used in error messages.
        expected_dtype: If given, convert the array to this dtype.
        expected_ndim: If given, required number of dimensions.
        expected_shape: If given, per-axis size requirements (None = any size).
        min_size: Minimum total element count.
        allow_nan / allow_inf: Permit non-finite values in float arrays.

    Returns:
        A C-contiguous array (converted to expected_dtype when requested).

    Raises:
        CuGenOptValidationError: with a clear message on failure.
    """
    if not isinstance(arr, np.ndarray):
        raise CuGenOptValidationError(
            f"'{name}' must be a numpy array, got {type(arr).__name__}"
        )
    if expected_ndim is not None and arr.ndim != expected_ndim:
        raise CuGenOptValidationError(
            f"'{name}' must be {expected_ndim}D, got {arr.ndim}D with shape {arr.shape}"
        )
    if expected_shape is not None:
        # Bug fix: zip() silently truncated, so an array with fewer dims than
        # expected_shape could pass the shape check. Validate the rank first.
        if arr.ndim != len(expected_shape):
            raise CuGenOptValidationError(
                f"'{name}' must be {len(expected_shape)}D, "
                f"got {arr.ndim}D with shape {arr.shape}"
            )
        for i, (actual, expect) in enumerate(zip(arr.shape, expected_shape)):
            if expect is not None and actual != expect:
                raise CuGenOptValidationError(
                    f"'{name}' shape mismatch at axis {i}: "
                    f"expected {expected_shape}, got {arr.shape}"
                )
    if arr.size < min_size:
        raise CuGenOptValidationError(
            f"'{name}' is too small: size={arr.size}, minimum={min_size}"
        )
    if expected_dtype is not None:
        # Converts dtype and makes the result C-contiguous in one step.
        arr = np.ascontiguousarray(arr, dtype=expected_dtype)
    if np.issubdtype(arr.dtype, np.floating):
        if not allow_nan:
            nan_mask = np.isnan(arr)  # computed once (was evaluated twice)
            if nan_mask.any():
                raise CuGenOptValidationError(
                    f"'{name}' contains {int(nan_mask.sum())} NaN value(s). "
                    f"Clean your data or set allow_nan=True."
                )
        if not allow_inf:
            inf_mask = np.isinf(arr)  # computed once (was evaluated twice)
            if inf_mask.any():
                raise CuGenOptValidationError(
                    f"'{name}' contains {int(inf_mask.sum())} Inf value(s). "
                    f"Clean your data or set allow_inf=True."
                )
    return np.ascontiguousarray(arr)
def validate_square_matrix(arr: np.ndarray, name: str, dtype=np.float32) -> np.ndarray:
    """Validate a square 2D matrix."""
    checked = validate_array(arr, name, expected_ndim=2, expected_dtype=dtype)
    rows, cols = checked.shape
    if rows != cols:
        raise CuGenOptValidationError(
            f"'{name}' must be square, got shape {checked.shape}"
        )
    return checked
def validate_1d(arr: np.ndarray, name: str, *, length: Optional[int] = None,
                dtype=np.float32) -> np.ndarray:
    """Validate a 1D array with optional length check."""
    checked = validate_array(arr, name, expected_ndim=1, expected_dtype=dtype)
    if length is None:
        return checked
    if checked.shape[0] != length:
        raise CuGenOptValidationError(
            f"'{name}' length mismatch: expected {length}, got {checked.shape[0]}"
        )
    return checked
def validate_data_dict(data: Dict[str, np.ndarray], dtype_tag: str) -> Dict[str, np.ndarray]:
    """Validate a dict of name -> array for compile_and_solve data/int_data."""
    # "float" arrays become float32; anything else is treated as int32.
    target_dtype = np.float32 if dtype_tag == "float" else np.int32
    validated = {}
    for key, value in data.items():
        label = f"data['{key}']"
        if not isinstance(value, np.ndarray):
            raise CuGenOptValidationError(
                f"data['{key}'] must be a numpy array, got {type(value).__name__}"
            )
        validated[key] = validate_array(value, label, expected_dtype=target_dtype)
    return validated
def validate_encoding(encoding: str) -> str:
    """Validate encoding string and return its normalized (lowercase) form."""
    allowed = {"permutation", "binary", "integer"}
    normalized = encoding.lower().strip()
    if normalized in allowed:
        return normalized
    raise CuGenOptValidationError(
        f"Unknown encoding '{encoding}'. Must be one of: {', '.join(sorted(allowed))}"
    )
def validate_positive_int(value, name: str, *, allow_zero: bool = False) -> int:
    """Coerce *value* to int and check it is positive (or >= 0 if allow_zero)."""
    try:
        coerced = int(value)
    except (TypeError, ValueError):
        raise CuGenOptValidationError(
            f"'{name}' must be an integer, got {type(value).__name__}: {value!r}"
        )
    lower_bound = 0 if allow_zero else 1
    if coerced < lower_bound:
        raise CuGenOptValidationError(f"'{name}' must be >= {lower_bound}, got {coerced}")
    return coerced
def validate_cuda_snippet(code: str, name: str) -> str:
    """Basic sanity check on a CUDA code snippet; returns the stripped code."""
    trimmed = code.strip()
    if not trimmed:
        raise CuGenOptValidationError(f"'{name}' CUDA code snippet is empty")
    # Substring blocklist for host-side calls that have no place in device code.
    for call in ("system(", "popen(", "exec(", "fork(", "unlink("):
        if call in trimmed:
            raise CuGenOptValidationError(
                f"'{name}' contains potentially dangerous call: '{call}'"
            )
    return trimmed
# ============================================================
# nvcc error translation
# ============================================================
# Known nvcc error signatures mapped to actionable advice. Each entry is a
# (compiled regex, formatter) pair; the formatter receives the regex match
# and returns a user-facing message.
_NVCC_PATTERNS = [
    # Undefined identifier -- usually a data-field name typo in user snippets.
    (
        re.compile(r"error:\s*identifier\s+\"(\w+)\"\s+is\s+undefined", re.I),
        lambda m: f"Undefined identifier '{m.group(1)}'. "
                  f"Check that all data field names in compute_obj/compute_penalty "
                  f"match the keys in your data dict."
    ),
    # Generic syntax error (missing token).
    (
        re.compile(r"error:\s*expected\s+a\s+\"([^\"]+)\"", re.I),
        lambda m: f"Syntax error: expected '{m.group(1)}'. "
                  f"Check for missing semicolons, braces, or parentheses."
    ),
    # Implicit conversion failure between incompatible types.
    (
        re.compile(r"error:\s*no\s+suitable\s+conversion\s+function\s+from\s+\"([^\"]+)\"\s+to\s+\"([^\"]+)\"", re.I),
        lambda m: f"Type mismatch: cannot convert '{m.group(1)}' to '{m.group(2)}'. "
                  f"Ensure you're using the correct types (float/int)."
    ),
    # Arity mismatch in a call.
    (
        re.compile(r"error:\s*too\s+(?:few|many)\s+arguments", re.I),
        lambda m: f"Wrong number of arguments in a function call. "
                  f"Check the function signature."
    ),
    # Access to a nonexistent member (typically on the Solution struct).
    (
        re.compile(r"error:\s*class\s+\"(\w+)\"\s+has\s+no\s+member\s+\"(\w+)\"", re.I),
        lambda m: f"'{m.group(1)}' has no member '{m.group(2)}'. "
                  f"Available solution members: data[row][col], dim2_sizes[row]."
    ),
    # Dynamic sizes where device code needs compile-time constants.
    (
        re.compile(r"error:\s*expression\s+must\s+have\s+a\s+constant\s+value", re.I),
        lambda m: f"Non-constant expression where a constant is required. "
                  f"CUDA device code cannot use dynamic allocation; "
                  f"use fixed-size arrays."
    ),
    # ptxas-stage failure: kernel exceeds the shared-memory budget.
    (
        re.compile(r"ptxas\s+error\s*:\s*Entry\s+function.*uses\s+too\s+much\s+shared\s+data", re.I),
        lambda m: f"Shared memory overflow. Your problem data is too large for GPU "
                  f"shared memory. Try reducing problem size or data arrays."
    ),
    # Toolkit too old for the requested compute capability.
    (
        re.compile(r"nvcc\s+fatal\s*:\s*Unsupported\s+gpu\s+architecture\s+'compute_(\d+)'", re.I),
        lambda m: f"GPU architecture sm_{m.group(1)} is not supported by your nvcc. "
                  f"Try specifying cuda_arch='sm_75' or update your CUDA toolkit."
    ),
    # User snippet returns the wrong type (compute_obj must return float).
    (
        re.compile(r"error:\s*return\s+value\s+type\s+does\s+not\s+match", re.I),
        lambda m: f"Return type mismatch. compute_obj must return float. "
                  f"Make sure all code paths return a float value."
    ),
]
def _translate_nvcc_error(stderr: str) -> str:
    """Extract the most relevant error from nvcc output and provide a friendly message."""
    # Pass 1: known error signatures with tailored advice.
    friendly = []
    for pattern, formatter in _NVCC_PATTERNS:
        hit = pattern.search(stderr)
        if hit is not None:
            friendly.append(formatter(hit))
    if friendly:
        bullets = "\n".join(f"  - {m}" for m in friendly)
        return "nvcc compilation failed. Likely cause(s):\n" + bullets
    # Pass 2: fall back to the first raw error line (preprocessor lines excluded).
    raw_errors = []
    for line in stderr.split("\n"):
        if "error" in line.lower() and not line.strip().startswith("#"):
            raw_errors.append(line.strip())
    if raw_errors:
        return (
            f"nvcc compilation failed:\n  {raw_errors[0]}\n\n"
            f"Tip: Check your CUDA code snippets for syntax errors. "
            f"Common issues: missing semicolons, undefined variables, "
            f"wrong data field names."
        )
    # Pass 3: nothing recognizable at all.
    return (
        "nvcc compilation failed with an unknown error.\n"
        "Check the raw output below for details."
    )
def _truncate(text: str, max_len: int) -> str:
if len(text) <= max_len:
return text
return text[:max_len] + f"\n... ({len(text) - max_len} chars truncated)"