This commit is contained in:
Adil Hafeez 2024-11-07 10:13:18 -08:00
parent 23ed25342a
commit 7b99379275
11 changed files with 761 additions and 392 deletions

View file

@ -13,6 +13,20 @@ Content-Type: application/json
]
}
### prompt gateway request default target
POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "hello"
}
]
}
### prompt gateway request (streaming)
POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json

View file

@ -26,4 +26,4 @@ COPY arch/envoy.template.yaml .
COPY arch/arch_config_schema.yaml .
ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:trace"]
ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug"]

View file

@ -3,6 +3,7 @@ services:
image: katanemo/archgw:latest
ports:
- "10000:10000"
- "10001:10001"
- "11000:11000"
- "12000:12000"
- "19901:9901"

View file

@ -3,6 +3,7 @@ services:
image: katanemo/archgw:latest
ports:
- "10000:10000"
- "10001:10001"
- "11000:11000"
- "12000:12000"
- "19901:9901"

View file

@ -3,6 +3,7 @@ services:
image: katanemo/archgw:latest
ports:
- "10000:10000"
- "10001:10001"
- "11000:11000"
- "12000:12000"
- "19901:9901"

View file

@ -98,12 +98,18 @@ static_resources:
domains:
- "*"
routes:
{% for provider in arch_llm_providers %}
- match:
prefix: "/"
headers:
- name: "x-arch-llm-provider"
string_match:
exact: {{ provider.name }}
route:
auto_host_rewrite: true
cluster: arch_llm_listener
cluster: {{ provider.provider }}
timeout: 60s
{% endfor %}
http_filters:
- name: envoy.filters.http.compressor
typed_config:
@ -131,6 +137,23 @@ static_resources:
code:
local:
filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm"
- name: envoy.filters.http.wasm
typed_config:
"@type": type.googleapis.com/udpa.type.v1.TypedStruct
type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm
value:
config:
name: "http_config"
root_id: llm_gateway
configuration:
"@type": "type.googleapis.com/google.protobuf.StringValue"
value: |
{{ arch_llm_config | indent(32) }}
vm_config:
runtime: "envoy.wasm.runtime.v8"
code:
local:
filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm"
- name: envoy.filters.http.decompressor
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor
@ -146,7 +169,6 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_internal
address:
socket_address:
@ -457,22 +479,6 @@ static_resources:
port_value: 10001
hostname: arch_prompt_gateway_listener
- name: arch_llm_listener
connect_timeout: 5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: arch_llm_listener
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 0.0.0.0
port_value: 12000
hostname: arch_llm_listener
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
- name: opentelemetry_collector
type: STRICT_DNS

View file

@ -1095,7 +1095,17 @@ impl StreamContext {
{
let default_target_response_str = if self.streaming_response {
let chat_completion_response =
serde_json::from_slice::<ChatCompletionsResponse>(&body).unwrap();
match serde_json::from_slice::<ChatCompletionsResponse>(&body) {
Ok(chat_completion_response) => chat_completion_response,
Err(e) => {
warn!(
"error deserializing default target response: {}, body str: {}",
e,
String::from_utf8(body).unwrap()
);
return self.send_server_error(ServerError::Deserialization(e), None);
}
};
let chunks = vec![
ChatCompletionStreamResponse::new(

View file

@ -74,4 +74,4 @@ prompt_targets:
tracing:
random_sampling: 100
# trace_arch: true
trace_arch: true

View file

@ -1,3 +1,4 @@
import os
import time
import torch
import app.commons.utilities as utils
@ -15,12 +16,38 @@ from app.function_calling.model_utils import (
)
from unittest.mock import patch
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource
# OpenTelemetry resource handed to the tracer provider below; it carries the
# service.name attribute "model-server".
resource = Resource.create(
{
"service.name": "model-server",
}
)
# Initialize the tracer provider
# Installs a TracerProvider built from the resource above as the global provider.
trace.set_tracer_provider(TracerProvider(resource=resource))
# Module-level tracer, named after this module.
tracer = trace.get_tracer(__name__)
logger = utils.get_model_server_logger()
logger.info(f"Ready to serve traffic. available device: {glb.DEVICE}")
app = FastAPI()
# Apply the OpenTelemetry FastAPI instrumentation to this app instance.
FastAPIInstrumentor().instrument_app(app)
# Configure the OTLP exporter (Jaeger, Zipkin, etc.)
# The exporter endpoint is read from the OLTP_HOST environment variable and
# falls back to http://localhost:4317 when the variable is unset.
# NOTE(review): the variable is spelled "OLTP_HOST", not "OTLP_HOST" -- this
# looks like a transposition; confirm against the deployment configuration
# before renaming. The F821 (undefined name) suppression also looks unneeded
# if `os` is imported at the top of the module -- verify.
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OLTP_HOST", "http://localhost:4317") # noqa: F821
)
# Register a batching span processor that sends spans through the exporter above.
trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(otlp_exporter))
class EmbeddingRequest(BaseModel):
input: str

1047
model_server/poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -32,6 +32,10 @@ onnxruntime = "1.19.2"
httpx = "*"
pytest-asyncio = "*"
pytest = "*"
opentelemetry-api = "^1.28.0"
opentelemetry-sdk = "^1.28.0"
opentelemetry-exporter-otlp = "^1.28.0"
opentelemetry-instrumentation-fastapi = "^0.49b0"
[tool.poetry.scripts]
archgw_modelserver = "app.cli:run_server"