mirror of
https://github.com/katanemo/plano.git
synced 2026-06-23 15:38:07 +02:00
simplify developer getting started experience (#102)
* Fixed the build. We now have a bare-bones version of the docker-compose file with only two services, archgw and archgw-model-server. Tested using the CLI.
* Some pre-commit fixes.
* Fixed cargo formatting issues.
* Fixed model server conflict changes.

---------

Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-261.local>
This commit is contained in:
parent
41cdef590a
commit
8654d3d5c5
20 changed files with 53 additions and 407 deletions
|
|
@ -10,8 +10,17 @@ COPY public_types /public_types
|
|||
RUN cargo build --release --target wasm32-wasi
|
||||
|
||||
# copy built filter into envoy image
|
||||
FROM envoyproxy/envoy:v1.30-latest
|
||||
FROM envoyproxy/envoy:v1.30-latest as envoy
|
||||
|
||||
#Build config generator, so that we have a single build image for both Rust and Python
|
||||
FROM python:3-slim as arch
|
||||
COPY --from=builder /arch/target/wasm32-wasi/release/intelligent_prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/intelligent_prompt_gateway.wasm
|
||||
# CMD ["envoy", "-c", "/etc/envoy/envoy.yaml"]
|
||||
# CMD ["envoy", "-c", "/etc/envoy/envoy.yaml", "--log-level", "debug"]
|
||||
CMD ["envoy", "-c", "/etc/envoy/envoy.yaml", "--component-log-level", "wasm:debug"]
|
||||
COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy
|
||||
WORKDIR /config
|
||||
COPY arch/requirements.txt .
|
||||
RUN pip install -r requirements.txt
|
||||
COPY arch/config_generator.py .
|
||||
COPY arch/envoy.template.yaml .
|
||||
COPY arch/arch_config_schema.yaml .
|
||||
|
||||
CMD ["sh", "-c", "python config_generator.py && envoy -c /etc/envoy/envoy.yaml --component-log-level wasm:debug"]
|
||||
|
|
|
|||
147
arch/arch_config_schema.yaml
Normal file
147
arch/arch_config_schema.yaml
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
$schema: "http://json-schema.org/draft-07/schema#"
|
||||
type: object
|
||||
properties:
|
||||
version:
|
||||
type: string
|
||||
listener:
|
||||
type: object
|
||||
properties:
|
||||
address:
|
||||
type: string
|
||||
port:
|
||||
type: integer
|
||||
message_format:
|
||||
type: string
|
||||
connect_timeout:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- address
|
||||
- port
|
||||
endpoints:
|
||||
type: object
|
||||
patternProperties:
|
||||
"^.*$":
|
||||
type: object
|
||||
properties:
|
||||
endpoint:
|
||||
type: string
|
||||
connect_timeout:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- endpoint
|
||||
llm_providers:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
access_key:
|
||||
type: string
|
||||
model:
|
||||
type: string
|
||||
default:
|
||||
type: boolean
|
||||
additionalProperties: false
|
||||
required:
|
||||
- name
|
||||
- access_key
|
||||
- model
|
||||
overrides:
|
||||
type: object
|
||||
properties:
|
||||
prompt_target_intent_matching_threshold:
|
||||
type: number
|
||||
system_prompt:
|
||||
type: string
|
||||
prompt_targets:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
default:
|
||||
type: boolean
|
||||
description:
|
||||
type: string
|
||||
parameters:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
type: boolean
|
||||
default:
|
||||
type: string
|
||||
description:
|
||||
type: string
|
||||
type:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- name
|
||||
- description
|
||||
- type
|
||||
endpoint:
|
||||
type: object
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
path:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- name
|
||||
- path
|
||||
system_prompt:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- name
|
||||
- description
|
||||
ratelimits:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
provider:
|
||||
type: string
|
||||
selector:
|
||||
type: object
|
||||
properties:
|
||||
key:
|
||||
type: string
|
||||
value:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- key
|
||||
- value
|
||||
limit:
|
||||
type: object
|
||||
properties:
|
||||
tokens:
|
||||
type: integer
|
||||
unit:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- tokens
|
||||
- unit
|
||||
additionalProperties: false
|
||||
required:
|
||||
- provider
|
||||
- selector
|
||||
- limit
|
||||
additionalProperties: false
|
||||
required:
|
||||
- version
|
||||
- listener
|
||||
- llm_providers
|
||||
- prompt_targets
|
||||
66
arch/config_generator.py
Normal file
66
arch/config_generator.py
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
"""Render the Envoy configuration from the arch config file.

Steps, in order:
  1. Load the arch config and its JSON schema (both YAML) and validate the
     config against the schema; exit with status 1 on failure.
  2. Build the set of upstream clusters: one per distinct endpoint name
     referenced by a prompt target, then merged with / extended by the
     entries of the optional top-level ``endpoints`` mapping.
  3. Render the Jinja2 template with the raw config text and the clusters,
     and write the result to ``ENVOY_CONFIG_FILE_RENDERED``.

All file locations can be overridden through environment variables of the
same name as the constants below.
"""
import os
import sys

from jinja2 import Environment, FileSystemLoader
import yaml
from jsonschema import validate

ENVOY_CONFIG_TEMPLATE_FILE = os.getenv('ENVOY_CONFIG_TEMPLATE_FILE', 'envoy.template.yaml')
ARCH_CONFIG_FILE = os.getenv('ARCH_CONFIG_FILE', '/config/arch_config.yaml')
ENVOY_CONFIG_FILE_RENDERED = os.getenv('ENVOY_CONFIG_FILE_RENDERED', '/etc/envoy/envoy.yaml')
ARCH_CONFIG_SCHEMA_FILE = os.getenv('ARCH_CONFIG_SCHEMA_FILE', 'arch_config_schema.yaml')

# Port used for a cluster whose endpoint does not carry an explicit ":port".
DEFAULT_CLUSTER_PORT = 80

env = Environment(loader=FileSystemLoader('./'))
# Honour the ENVOY_CONFIG_TEMPLATE_FILE override; the default value keeps the
# previous behaviour of loading 'envoy.template.yaml'.
template = env.get_template(ENVOY_CONFIG_TEMPLATE_FILE)

with open(ARCH_CONFIG_FILE, 'r') as file:
    katanemo_config = file.read()

with open(ARCH_CONFIG_SCHEMA_FILE, 'r') as file:
    arch_config_schema = file.read()

config_yaml = yaml.safe_load(katanemo_config)
config_schema_yaml = yaml.safe_load(arch_config_schema)

try:
    validate(config_yaml, config_schema_yaml)
except Exception as e:
    # jsonschema errors expose a short `.message`; any other exception does
    # not, so fall back to str(e) instead of failing inside the handler.
    reason = getattr(e, "message", str(e))
    print(f"Error validating arch_config file: {ARCH_CONFIG_FILE}, error: {reason}")
    sys.exit(1)

inferred_clusters = {}

# One cluster per distinct endpoint name referenced by a prompt target.
for prompt_target in config_yaml["prompt_targets"]:
    name = prompt_target.get("endpoint", {}).get("name", "")
    if not name:
        # `endpoint` is optional on a prompt target; without a name there is
        # no upstream to create, and an empty cluster name would be emitted.
        continue
    inferred_clusters.setdefault(name, {"name": name, "port": DEFAULT_CLUSTER_PORT})

print(inferred_clusters)

endpoints = config_yaml.get("endpoints", {})

# override the inferred clusters with the ones defined in the config
for name, endpoint_details in endpoints.items():
    if name in inferred_clusters:
        print("updating cluster", endpoint_details)
    # Endpoints that no prompt target references get the same defaults as the
    # inferred ones: the template iterates over the values only, so every
    # cluster dict has to carry its own name and port.
    cluster = inferred_clusters.setdefault(
        name, {"name": name, "port": DEFAULT_CLUSTER_PORT}
    )
    cluster.update(endpoint_details)
    # Split "host:port" into its two fields; a bare host keeps the default port.
    parts = cluster['endpoint'].split(':')
    if len(parts) > 1:
        cluster['endpoint'] = parts[0]
        cluster['port'] = int(parts[1])

# NOTE(review): a cluster inferred from a prompt target that has no matching
# entry under `endpoints` still has no 'endpoint' key here - confirm that the
# template copes with that.
print("updated clusters", inferred_clusters)

data = {
    'katanemo_config': katanemo_config,
    'arch_clusters': inferred_clusters
}

rendered = template.render(data)
print(rendered)
print(ENVOY_CONFIG_FILE_RENDERED)
with open(ENVOY_CONFIG_FILE_RENDERED, 'w') as file:
    file.write(rendered)
|
||||
|
|
@ -1,43 +1,28 @@
|
|||
services:
|
||||
envoy:
|
||||
image: envoyproxy/envoy:v1.30-latest
|
||||
hostname: envoy
|
||||
archgw:
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: arch/Dockerfile
|
||||
ports:
|
||||
- "10000:10000"
|
||||
- "19901:9901"
|
||||
- "18080:9901"
|
||||
volumes:
|
||||
- ./envoy.yaml:/etc/envoy/envoy.yaml
|
||||
- ./target/wasm32-wasi/release:/etc/envoy/proxy-wasm-plugins
|
||||
- ${ARCH_CONFIG_FILE}:/config/arch_config.yaml
|
||||
- /etc/ssl/cert.pem:/etc/ssl/cert.pem
|
||||
- ./arch_log:/var/log/
|
||||
depends_on:
|
||||
qdrant:
|
||||
condition: service_started
|
||||
embeddingserver:
|
||||
archgw_model_server:
|
||||
condition: service_healthy
|
||||
|
||||
embeddingserver:
|
||||
archgw_model_server:
|
||||
build:
|
||||
context: ../embedding-server
|
||||
context: ../model_server
|
||||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18080:80"
|
||||
- "18081:80"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl" ,"http://localhost:80/healthz"]
|
||||
test: ["CMD", "curl" ,"http://localhost/healthz"]
|
||||
interval: 5s
|
||||
retries: 20
|
||||
|
||||
qdrant:
|
||||
image: qdrant/qdrant
|
||||
hostname: vector-db
|
||||
ports:
|
||||
- 16333:6333
|
||||
- 16334:6334
|
||||
|
||||
chatbot-ui:
|
||||
build:
|
||||
context: ../chatbot-ui
|
||||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18080:8080"
|
||||
environment:
|
||||
- CHAT_COMPLETION_ENDPOINT=http://envoy:10000/v1
|
||||
volumes:
|
||||
- ~/.cache/huggingface:/root/.cache/huggingface
|
||||
|
|
|
|||
|
|
@ -132,20 +132,20 @@ static_resources:
|
|||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
|
||||
sni: api.mistral.ai
|
||||
- name: model_server
|
||||
- name: archgw_model_server
|
||||
connect_timeout: 5s
|
||||
type: STRICT_DNS
|
||||
lb_policy: ROUND_ROBIN
|
||||
load_assignment:
|
||||
cluster_name: model_server
|
||||
cluster_name: archgw_model_server
|
||||
endpoints:
|
||||
- lb_endpoints:
|
||||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: model_server
|
||||
address: archgw_model_server
|
||||
port_value: 80
|
||||
hostname: "model_server"
|
||||
hostname: "archgw_model_server"
|
||||
- name: mistral_7b_instruct
|
||||
connect_timeout: 5s
|
||||
type: STRICT_DNS
|
||||
|
|
@ -171,7 +171,7 @@ static_resources:
|
|||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: model_server
|
||||
address: archgw_model_server
|
||||
port_value: 80
|
||||
hostname: "arch_fc"
|
||||
{% for _, cluster in arch_clusters.items() %}
|
||||
|
|
|
|||
|
|
@ -1,16 +0,0 @@
|
|||
#!/bin/sh
|
||||
|
||||
echo 'Deleting prompt_vector_store collection'
|
||||
curl -X DELETE http://localhost:16333/collections/prompt_vector_store
|
||||
echo
|
||||
echo 'Creating prompt_vector_store collection'
|
||||
curl -X PUT 'http://localhost:16333/collections/prompt_vector_store' \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"vectors": {
|
||||
"size": 1024,
|
||||
"distance": "Cosine"
|
||||
}
|
||||
}'
|
||||
echo
|
||||
echo 'Created prompt_vector_store collection'
|
||||
3
arch/requirements.txt
Normal file
3
arch/requirements.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
jinja2
|
||||
pyyaml
|
||||
jsonschema
|
||||
|
|
@ -7,6 +7,6 @@ pub const USER_ROLE: &str = "user";
|
|||
pub const GPT_35_TURBO: &str = "gpt-3.5-turbo";
|
||||
pub const ARC_FC_CLUSTER: &str = "arch_fc";
|
||||
pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 120000; // 2 minutes
|
||||
pub const MODEL_SERVER_NAME: &str = "model_server";
|
||||
pub const MODEL_SERVER_NAME: &str = "archgw_model_server";
|
||||
pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
|
||||
pub const ARCH_MESSAGES_KEY: &str = "arch_messages";
|
||||
|
|
|
|||
|
|
@ -141,7 +141,10 @@ impl FilterContext {
|
|||
) {
|
||||
Ok(token_id) => token_id,
|
||||
Err(e) => {
|
||||
panic!("Error dispatching HTTP call: {:?}", e);
|
||||
panic!(
|
||||
"Error dispatching HTTP call: {}, error: {:?}",
|
||||
MODEL_SERVER_NAME, e
|
||||
);
|
||||
}
|
||||
};
|
||||
token_id
|
||||
|
|
|
|||
|
|
@ -104,7 +104,7 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
|
|||
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
|
||||
.returning(Some(chat_completions_request_body))
|
||||
// The actual call is not important in this test, we just need to grab the token_id
|
||||
.expect_http_call(Some("model_server"), None, None, None, None)
|
||||
.expect_http_call(Some("archgw_model_server"), None, None, None, None)
|
||||
.returning(Some(1))
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_metric_increment("active_http_calls", 1)
|
||||
|
|
@ -136,7 +136,7 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
|
|||
.returning(Some(&embeddings_response_buffer))
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_http_call(Some("model_server"), None, None, None, None)
|
||||
.expect_http_call(Some("archgw_model_server"), None, None, None, None)
|
||||
.returning(Some(2))
|
||||
.expect_metric_increment("active_http_calls", 1)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
|
|
@ -313,7 +313,7 @@ fn successful_request_to_open_ai_chat_completions() {
|
|||
.returning(Some(chat_completions_request_body))
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Info), None)
|
||||
.expect_http_call(Some("model_server"), None, None, None, None)
|
||||
.expect_http_call(Some("archgw_model_server"), None, None, None, None)
|
||||
.returning(Some(4))
|
||||
.expect_metric_increment("active_http_calls", 1)
|
||||
.execute_and_expect(ReturnType::Action(Action::Pause))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue