mirror of
https://github.com/katanemo/plano.git
synced 2026-06-17 15:25:17 +02:00
wip
This commit is contained in:
parent
23ed25342a
commit
7b99379275
11 changed files with 761 additions and 392 deletions
|
|
@ -13,6 +13,20 @@ Content-Type: application/json
|
|||
]
|
||||
}
|
||||
|
||||
### prompt gateway request default target
|
||||
POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
### prompt gateway request (streaming)
|
||||
POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
|
|
|||
|
|
@ -26,4 +26,4 @@ COPY arch/envoy.template.yaml .
|
|||
COPY arch/arch_config_schema.yaml .
|
||||
|
||||
|
||||
ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:trace"]
|
||||
ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug"]
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ services:
|
|||
image: katanemo/archgw:latest
|
||||
ports:
|
||||
- "10000:10000"
|
||||
- "10001:10001"
|
||||
- "11000:11000"
|
||||
- "12000:12000"
|
||||
- "19901:9901"
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ services:
|
|||
image: katanemo/archgw:latest
|
||||
ports:
|
||||
- "10000:10000"
|
||||
- "10001:10001"
|
||||
- "11000:11000"
|
||||
- "12000:12000"
|
||||
- "19901:9901"
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ services:
|
|||
image: katanemo/archgw:latest
|
||||
ports:
|
||||
- "10000:10000"
|
||||
- "10001:10001"
|
||||
- "11000:11000"
|
||||
- "12000:12000"
|
||||
- "19901:9901"
|
||||
|
|
|
|||
|
|
@ -98,12 +98,18 @@ static_resources:
|
|||
domains:
|
||||
- "*"
|
||||
routes:
|
||||
{% for provider in arch_llm_providers %}
|
||||
- match:
|
||||
prefix: "/"
|
||||
headers:
|
||||
- name: "x-arch-llm-provider"
|
||||
string_match:
|
||||
exact: {{ provider.name }}
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: arch_llm_listener
|
||||
cluster: {{ provider.provider }}
|
||||
timeout: 60s
|
||||
{% endfor %}
|
||||
http_filters:
|
||||
- name: envoy.filters.http.compressor
|
||||
typed_config:
|
||||
|
|
@ -131,6 +137,23 @@ static_resources:
|
|||
code:
|
||||
local:
|
||||
filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm"
|
||||
- name: envoy.filters.http.wasm
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/udpa.type.v1.TypedStruct
|
||||
type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm
|
||||
value:
|
||||
config:
|
||||
name: "http_config"
|
||||
root_id: llm_gateway
|
||||
configuration:
|
||||
"@type": "type.googleapis.com/google.protobuf.StringValue"
|
||||
value: |
|
||||
{{ arch_llm_config | indent(32) }}
|
||||
vm_config:
|
||||
runtime: "envoy.wasm.runtime.v8"
|
||||
code:
|
||||
local:
|
||||
filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm"
|
||||
- name: envoy.filters.http.decompressor
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor
|
||||
|
|
@ -146,7 +169,6 @@ static_resources:
|
|||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
|
||||
|
||||
|
||||
- name: arch_internal
|
||||
address:
|
||||
socket_address:
|
||||
|
|
@ -457,22 +479,6 @@ static_resources:
|
|||
port_value: 10001
|
||||
hostname: arch_prompt_gateway_listener
|
||||
|
||||
- name: arch_llm_listener
|
||||
connect_timeout: 5s
|
||||
type: LOGICAL_DNS
|
||||
dns_lookup_family: V4_ONLY
|
||||
lb_policy: ROUND_ROBIN
|
||||
load_assignment:
|
||||
cluster_name: arch_llm_listener
|
||||
endpoints:
|
||||
- lb_endpoints:
|
||||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: 0.0.0.0
|
||||
port_value: 12000
|
||||
hostname: arch_llm_listener
|
||||
|
||||
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
|
||||
- name: opentelemetry_collector
|
||||
type: STRICT_DNS
|
||||
|
|
|
|||
|
|
@ -1095,7 +1095,17 @@ impl StreamContext {
|
|||
{
|
||||
let default_target_response_str = if self.streaming_response {
|
||||
let chat_completion_response =
|
||||
serde_json::from_slice::<ChatCompletionsResponse>(&body).unwrap();
|
||||
match serde_json::from_slice::<ChatCompletionsResponse>(&body) {
|
||||
Ok(chat_completion_response) => chat_completion_response,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"error deserializing default target response: {}, body str: {}",
|
||||
e,
|
||||
String::from_utf8(body).unwrap()
|
||||
);
|
||||
return self.send_server_error(ServerError::Deserialization(e), None);
|
||||
}
|
||||
};
|
||||
|
||||
let chunks = vec![
|
||||
ChatCompletionStreamResponse::new(
|
||||
|
|
|
|||
|
|
@ -74,4 +74,4 @@ prompt_targets:
|
|||
|
||||
tracing:
|
||||
random_sampling: 100
|
||||
# trace_arch: true
|
||||
trace_arch: true
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import os
|
||||
import time
|
||||
import torch
|
||||
import app.commons.utilities as utils
|
||||
|
|
@ -15,12 +16,38 @@ from app.function_calling.model_utils import (
|
|||
)
|
||||
from unittest.mock import patch
|
||||
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
||||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
|
||||
resource = Resource.create(
|
||||
{
|
||||
"service.name": "model-server",
|
||||
}
|
||||
)
|
||||
|
||||
# Initialize the tracer provider
|
||||
trace.set_tracer_provider(TracerProvider(resource=resource))
|
||||
tracer = trace.get_tracer(__name__)
|
||||
|
||||
|
||||
logger = utils.get_model_server_logger()
|
||||
|
||||
logger.info(f"Ready to serve traffic. available device: {glb.DEVICE}")
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
FastAPIInstrumentor().instrument_app(app)
|
||||
|
||||
# Configure the OTLP exporter (Jaeger, Zipkin, etc.)
|
||||
otlp_exporter = OTLPSpanExporter(
|
||||
endpoint=os.getenv("OLTP_HOST", "http://localhost:4317") # noqa: F821
|
||||
)
|
||||
trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(otlp_exporter))
|
||||
|
||||
|
||||
class EmbeddingRequest(BaseModel):
|
||||
input: str
|
||||
|
|
|
|||
1047
model_server/poetry.lock
generated
1047
model_server/poetry.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -32,6 +32,10 @@ onnxruntime = "1.19.2"
|
|||
httpx = "*"
|
||||
pytest-asyncio = "*"
|
||||
pytest = "*"
|
||||
opentelemetry-api = "^1.28.0"
|
||||
opentelemetry-sdk = "^1.28.0"
|
||||
opentelemetry-exporter-otlp = "^1.28.0"
|
||||
opentelemetry-instrumentation-fastapi = "^0.49b0"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
archgw_modelserver = "app.cli:run_server"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue