trustgraph/trustgraph_configurator/templates/2.1/model-hosting/intel-xpu-vllm.jsonnet
elpresidank 74cc8a4685 Squashed 'ai-context/trustgraph-templates/' content from commit 42a5fd1b
git-subtree-dir: ai-context/trustgraph-templates
git-subtree-split: 42a5fd1b678f32be378062e30451e2052ccb95dd
2026-04-05 21:09:49 -05:00

97 lines
3.3 KiB
Jsonnet

local images = import "values/images.jsonnet";
{
// Returns this object extended with one overridden setting: the hidden
// field named "vllm-service-<key>" is set to `value`; everything else is
// inherited unchanged.
with:: function(key, value)
    local field = "vllm-service-" + key;
    self { [field]:: value },
// Default settings for the vLLM service. Every entry is a hidden field
// (`::`) named "vllm-service-<key>", which is the naming scheme that
// with(key, value) overrides.

// Model identifier passed to the server's --model argument.
"vllm-service-model":: "teknium/OpenHermes-2.5-Mistral-7B",
// CPU figure handed to both with_limits and with_reservations.
"vllm-service-cpus":: "8.0",
// Memory figure handed to both with_limits and with_reservations.
"vllm-service-memory":: "16G",
// Size requested for the "vllm-storage" volume.
"vllm-service-storage":: "20G",
// Value passed to the server's --dtype argument.
"vllm-service-datatype":: "float16",
// Value passed to --max-model-len (converted to a string first).
"vllm-service-max-model-len":: 4096,
// Value passed to --max-num-seqs (converted to a string first).
"vllm-service-max-num-seqs":: 16,
// Optional token; when not null it is exported to the container as the
// HF_TOKEN environment variable, otherwise no such variable is set.
"vllm-service-hf-token":: null,
// vLLM model-hosting service targeting an Intel XPU device ("--device xpu").
// Declared with `+:` so it is merged into an inherited "vllm-service"
// object rather than replacing it.
"vllm-service" +: {

    // Builds the deployment resources for the service.
    //   engine: deployment backend object; this block relies only on its
    //           volume / container / containers / service / resources
    //           builders.
    // Returns engine.resources([...]) holding the storage volume, the
    // container set and the service, in that order.
    create:: function(engine)

        // The listening port is needed by the server command line and by
        // both port mappings; define it once so the three cannot drift.
        local port = 7000;

        // Volume sized from the "vllm-service-storage" setting.
        local vol = engine.volume("vllm-storage")
            .with_size($["vllm-service-storage"]);

        local container =
            engine.container("vllm-service")
                .with_image(images["vllm-service-intel-xpu"])
                .with_command([
                    "python",
                    "-m",
                    "vllm.entrypoints.openai.api_server",
                    "--model",
                    $["vllm-service-model"],
                    "--served-model-name",
                    "model",
                    "--host",
                    "0.0.0.0",
                    "--port",
                    std.toString(port),
                    "--device",
                    "xpu",
                    "--dtype",
                    $["vllm-service-datatype"],
                    "--enforce-eager",
                    "--max-model-len",
                    std.toString($["vllm-service-max-model-len"]),
                    "--max-num-seqs",
                    std.toString($["vllm-service-max-num-seqs"]),
                    "--block-size",
                    "64",
                    // Full option name as documented by vLLM. The former
                    // abbreviated spelling ("--gpu-memory-util") depended
                    // on the argument parser's prefix matching.
                    "--gpu-memory-utilization",
                    "0.85",
                    "--trust-remote-code",
                    "--disable-sliding-window",
                ])
                // HF_TOKEN is added only when a token has been configured.
                .with_environment({
                    VLLM_USE_V1: "1",
                    VLLM_WORKER_MULTIPROC_METHOD: "spawn",
                } + (
                    if $["vllm-service-hf-token"] != null
                    then { HF_TOKEN: $["vllm-service-hf-token"] }
                    else {}
                ))
                // Privileged container with /dev/dri passed through and the
                // video/render groups added.
                // NOTE(review): presumably required for access to the Intel
                // GPU device nodes - confirm whether privileged mode is
                // still needed alongside the explicit device mapping.
                .with_privileged(true)
                .with_device("/dev/dri", "/dev/dri")
                .with_ipc("host")
                .with_group("video")
                .with_group("render")
                .with_capability("SYS_NICE")
                // Limits and reservations use the same CPU/memory settings.
                .with_limits(
                    $["vllm-service-cpus"], $["vllm-service-memory"]
                )
                .with_reservations(
                    $["vllm-service-cpus"], $["vllm-service-memory"]
                )
                .with_port(port, port, "vllm")
                .with_bind_mount("/dev/dri/by-path", "/dev/dri/by-path")
                .with_volume_mount(vol, "/root/.cache/huggingface");

        local containerSet = engine.containers(
            "vllm-service", [ container ]
        );

        local service =
            engine.service(containerSet)
                .with_port(port, port, "vllm");

        engine.resources([
            vol,
            containerSet,
            service,
        ])

},
}