trustgraph/trustgraph_configurator/templates/1.8/model-hosting/nvidia-gpu-vllm.jsonnet

local images = import "values/images.jsonnet";

{

    with:: function(key, value)
        self + {
            ["vllm-service-" + key]:: value,
        },

    "vllm-service-model":: "mistralai/Mistral-7B-Instruct-v0.3",
    "vllm-service-cpus":: "0.5",
    "vllm-service-memory":: "1G",
    "vllm-service-storage":: "50G",
    "vllm-service-hf-token":: null,

    "vllm-service" +: {

        create:: function(engine)

            local vol = engine.volume("vllm-storage")
                .with_size($["vllm-service-storage"]);

            local container =
                engine.container("vllm-service")
                    .with_image(images["vllm-service-nvidia"])
                    .with_command([
                        "--model",
                        $["vllm-service-model"],
                        "--served-model-name",
                        "model",
                        "--host",
                        "0.0.0.0",
                        "--port",
                        "7000",
                    ])
                    .with_runtime("nvidia")
                    .with_environment({
                        VLLM_SKIP_WARMUP: "true",
                    } + (
                        if $["vllm-service-hf-token"] != null
                            then { HF_TOKEN: $["vllm-service-hf-token"] }
                            else {}
                    ))
                    .with_privileged(true)
                    .with_ipc("host")
                    .with_capability("SYS_NICE")
                    .with_limits(
                        $["vllm-service-cpus"], $["vllm-service-memory"]
                    )
                    .with_reservations(
                        $["vllm-service-cpus"], $["vllm-service-memory"]
                    )
                    .with_port(7000, 7000, "vllm")
                    .with_volume_mount(vol, "/root/.cache/huggingface");

            local containerSet = engine.containers(
                "vllm-service", [ container ]
            );

            local service =
                engine.service(containerSet)
                .with_port(7000, 7000, "vllm");

            engine.resources([
                vol,
                containerSet,
                service,
            ])

    },

}