simplify developer getting started experience (#102)

* Fixed the build. The docker-compose file is now a bare-bones version with only two services, archgw and archgw_model_server. Tested using the CLI.

* some pre-commit fixes

* fixed cargo formatting issues

* fixed model server conflict changes

---------

Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-261.local>
This commit is contained in:
Salman Paracha 2024-10-01 10:02:23 -07:00 committed by GitHub
parent 41cdef590a
commit 8654d3d5c5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 53 additions and 407 deletions

View file

@ -10,8 +10,17 @@ COPY public_types /public_types
RUN cargo build --release --target wasm32-wasi
# copy built filter into envoy image
FROM envoyproxy/envoy:v1.30-latest
FROM envoyproxy/envoy:v1.30-latest as envoy
#Build config generator, so that we have a single build image for both Rust and Python
FROM python:3-slim as arch
COPY --from=builder /arch/target/wasm32-wasi/release/intelligent_prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/intelligent_prompt_gateway.wasm
# CMD ["envoy", "-c", "/etc/envoy/envoy.yaml"]
# CMD ["envoy", "-c", "/etc/envoy/envoy.yaml", "--log-level", "debug"]
CMD ["envoy", "-c", "/etc/envoy/envoy.yaml", "--component-log-level", "wasm:debug"]
COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy
WORKDIR /config
COPY arch/requirements.txt .
RUN pip install -r requirements.txt
COPY arch/config_generator.py .
COPY arch/envoy.template.yaml .
COPY arch/arch_config_schema.yaml .
CMD ["sh", "-c", "python config_generator.py && envoy -c /etc/envoy/envoy.yaml --component-log-level wasm:debug"]

View file

@ -0,0 +1,147 @@
# JSON Schema (draft-07) for the arch_config file.
# NOTE(review): indentation is not visible in this view, so the nesting described
# in the comments below is inferred from key order — confirm against the real file.
$schema: "http://json-schema.org/draft-07/schema#"
type: object
properties:
# Config format version (string); listed as required at the end of the schema.
version:
type: string
# Listener settings: address (string) and port (integer) are required;
# message_format and connect_timeout are optional strings.
listener:
type: object
properties:
address:
type: string
port:
type: integer
message_format:
type: string
connect_timeout:
type: string
additionalProperties: false
required:
- address
- port
# Named endpoints: the "^.*$" pattern accepts any key; each entry must
# provide `endpoint` and may provide `connect_timeout`.
endpoints:
type: object
patternProperties:
"^.*$":
type: object
properties:
endpoint:
type: string
connect_timeout:
type: string
additionalProperties: false
required:
- endpoint
# LLM providers: each item requires name, access_key and model; `default` is
# an optional boolean.
llm_providers:
type: array
items:
type: object
properties:
name:
type: string
access_key:
type: string
model:
type: string
default:
type: boolean
additionalProperties: false
required:
- name
- access_key
- model
# Optional tuning overrides (no additionalProperties restriction is declared here).
overrides:
type: object
properties:
prompt_target_intent_matching_threshold:
type: number
system_prompt:
type: string
# Prompt targets: each item requires name and description; parameters,
# endpoint, default and system_prompt are optional.
prompt_targets:
type: array
items:
type: object
properties:
name:
type: string
default:
type: boolean
description:
type: string
# Parameters of a prompt target: each requires name, description and type.
parameters:
type: array
items:
type: object
properties:
name:
type: string
# NOTE(review): this additionalProperties entry sits between the `name` and
# `required` property definitions, ahead of the item-level one further down —
# it looks misplaced; confirm whether it should be removed.
additionalProperties: false
# Here `required` is a parameter property (boolean), not the JSON Schema keyword.
required:
type: boolean
default:
type: string
description:
type: string
type:
type: string
additionalProperties: false
required:
- name
- description
- type
# Endpoint a prompt target refers to: name and path are both required.
endpoint:
type: object
properties:
name:
type: string
path:
type: string
additionalProperties: false
required:
- name
- path
system_prompt:
type: string
additionalProperties: false
required:
- name
- description
# Rate limits: each item requires provider, a key/value selector and a
# limit made of tokens (integer) and unit (string).
ratelimits:
type: array
items:
type: object
properties:
provider:
type: string
selector:
type: object
properties:
key:
type: string
value:
type: string
additionalProperties: false
required:
- key
- value
limit:
type: object
properties:
tokens:
type: integer
unit:
type: string
additionalProperties: false
required:
- tokens
- unit
additionalProperties: false
required:
- provider
- selector
- limit
# Presumably the top-level constraints: no unknown keys, and these four sections
# must be present — confirm nesting.
additionalProperties: false
required:
- version
- listener
- llm_providers
- prompt_targets

66
arch/config_generator.py Normal file
View file

@ -0,0 +1,66 @@
import os
from jinja2 import Environment, FileSystemLoader
import yaml
from jsonschema import validate
# Renders the Envoy configuration from the arch config file.
#
# Steps: load the arch config and its JSON schema, validate the config, derive
# the upstream clusters from `prompt_targets` and `endpoints`, render the Jinja
# template with that data, and write the result to ENVOY_CONFIG_FILE_RENDERED.
# Every path below can be overridden through the environment.
ENVOY_CONFIG_TEMPLATE_FILE = os.getenv('ENVOY_CONFIG_TEMPLATE_FILE', 'envoy.template.yaml')
ARCH_CONFIG_FILE = os.getenv('ARCH_CONFIG_FILE', '/config/arch_config.yaml')
ENVOY_CONFIG_FILE_RENDERED = os.getenv('ENVOY_CONFIG_FILE_RENDERED', '/etc/envoy/envoy.yaml')
ARCH_CONFIG_SCHEMA_FILE = os.getenv('ARCH_CONFIG_SCHEMA_FILE', 'arch_config_schema.yaml')

DEFAULT_CLUSTER_PORT = 80


def _new_cluster(name):
    """Return a fresh cluster entry for ``name`` using the default port."""
    return {"name": name, "port": DEFAULT_CLUSTER_PORT}


def _split_host_port(cluster):
    """Split a ``host:port`` value stored under ``endpoint`` into two keys.

    Mutates ``cluster`` in place: ``endpoint`` keeps only the host part and
    ``port`` receives the integer port. An endpoint without a colon is left
    untouched, so the port already present in ``cluster`` stays in effect.
    """
    parts = cluster['endpoint'].split(':')
    if len(parts) > 1:
        cluster['endpoint'] = parts[0]
        cluster['port'] = int(parts[1])


env = Environment(loader=FileSystemLoader('./'))
# Honour the ENVOY_CONFIG_TEMPLATE_FILE override instead of a hard-coded name.
template = env.get_template(ENVOY_CONFIG_TEMPLATE_FILE)

with open(ARCH_CONFIG_FILE, 'r') as file:
    katanemo_config = file.read()

with open(ARCH_CONFIG_SCHEMA_FILE, 'r') as file:
    arch_config_schema = file.read()

config_yaml = yaml.safe_load(katanemo_config)
config_schema_yaml = yaml.safe_load(arch_config_schema)

try:
    validate(config_yaml, config_schema_yaml)
except Exception as e:
    # Not every exception carries a `.message` attribute; fall back to str(e)
    # so the real failure is reported instead of an AttributeError.
    reason = getattr(e, 'message', str(e))
    print(f"Error validating arch_config file: {ARCH_CONFIG_FILE}, error: {reason}")
    # SystemExit works even when the `site`-provided `exit` builtin is absent.
    raise SystemExit(1)

inferred_clusters = {}

# Every endpoint name referenced by a prompt target becomes a cluster.
for prompt_target in config_yaml["prompt_targets"]:
    name = prompt_target.get("endpoint", {}).get("name", "")
    if not name:
        # A prompt target without an endpoint must not create a nameless cluster.
        continue
    inferred_clusters.setdefault(name, _new_cluster(name))

print(inferred_clusters)

endpoints = config_yaml.get("endpoints", {})

# Override the inferred clusters with the ones defined in the config. Endpoints
# that no prompt target references get the same shape (name, default port,
# host/port split) as the inferred ones.
for name, endpoint_details in endpoints.items():
    print("updating cluster", endpoint_details)
    cluster = inferred_clusters.setdefault(name, _new_cluster(name))
    cluster.update(endpoint_details)
    _split_host_port(cluster)

print("updated clusters", inferred_clusters)

data = {
    'katanemo_config': katanemo_config,
    'arch_clusters': inferred_clusters
}

rendered = template.render(data)
print(rendered)
print(ENVOY_CONFIG_FILE_RENDERED)

with open(ENVOY_CONFIG_FILE_RENDERED, 'w') as file:
    file.write(rendered)

View file

@ -1,43 +1,28 @@
services:
envoy:
image: envoyproxy/envoy:v1.30-latest
hostname: envoy
archgw:
build:
context: ../
dockerfile: arch/Dockerfile
ports:
- "10000:10000"
- "19901:9901"
- "18080:9901"
volumes:
- ./envoy.yaml:/etc/envoy/envoy.yaml
- ./target/wasm32-wasi/release:/etc/envoy/proxy-wasm-plugins
- ${ARCH_CONFIG_FILE}:/config/arch_config.yaml
- /etc/ssl/cert.pem:/etc/ssl/cert.pem
- ./arch_log:/var/log/
depends_on:
qdrant:
condition: service_started
embeddingserver:
archgw_model_server:
condition: service_healthy
embeddingserver:
archgw_model_server:
build:
context: ../embedding-server
context: ../model_server
dockerfile: Dockerfile
ports:
- "18080:80"
- "18081:80"
healthcheck:
test: ["CMD", "curl" ,"http://localhost:80/healthz"]
test: ["CMD", "curl" ,"http://localhost/healthz"]
interval: 5s
retries: 20
qdrant:
image: qdrant/qdrant
hostname: vector-db
ports:
- 16333:6333
- 16334:6334
chatbot-ui:
build:
context: ../chatbot-ui
dockerfile: Dockerfile
ports:
- "18080:8080"
environment:
- CHAT_COMPLETION_ENDPOINT=http://envoy:10000/v1
volumes:
- ~/.cache/huggingface:/root/.cache/huggingface

View file

@ -132,20 +132,20 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
sni: api.mistral.ai
- name: model_server
- name: archgw_model_server
connect_timeout: 5s
type: STRICT_DNS
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: model_server
cluster_name: archgw_model_server
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: model_server
address: archgw_model_server
port_value: 80
hostname: "model_server"
hostname: "archgw_model_server"
- name: mistral_7b_instruct
connect_timeout: 5s
type: STRICT_DNS
@ -171,7 +171,7 @@ static_resources:
- endpoint:
address:
socket_address:
address: model_server
address: archgw_model_server
port_value: 80
hostname: "arch_fc"
{% for _, cluster in arch_clusters.items() %}

View file

@ -1,16 +0,0 @@
#!/bin/sh
# Recreates the `prompt_vector_store` collection on the HTTP service at
# localhost:16333: the existing collection is deleted, then created again.
# Neither curl call checks the response, so failures are only visible in the
# printed output.

# Drop the existing collection.
echo 'Deleting prompt_vector_store collection'
curl -X DELETE http://localhost:16333/collections/prompt_vector_store
echo
# Create the collection with 1024-dimensional vectors and Cosine distance.
echo 'Creating prompt_vector_store collection'
curl -X PUT 'http://localhost:16333/collections/prompt_vector_store' \
-H 'Content-Type: application/json' \
--data-raw '{
"vectors": {
"size": 1024,
"distance": "Cosine"
}
}'
echo
echo 'Created prompt_vector_store collection'

3
arch/requirements.txt Normal file
View file

@ -0,0 +1,3 @@
jinja2
pyyaml
jsonschema

View file

@ -7,6 +7,6 @@ pub const USER_ROLE: &str = "user";
pub const GPT_35_TURBO: &str = "gpt-3.5-turbo";
pub const ARC_FC_CLUSTER: &str = "arch_fc";
pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 120000; // 2 minutes
pub const MODEL_SERVER_NAME: &str = "model_server";
pub const MODEL_SERVER_NAME: &str = "archgw_model_server";
pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
pub const ARCH_MESSAGES_KEY: &str = "arch_messages";

View file

@ -141,7 +141,10 @@ impl FilterContext {
) {
Ok(token_id) => token_id,
Err(e) => {
panic!("Error dispatching HTTP call: {:?}", e);
panic!(
"Error dispatching HTTP call: {}, error: {:?}",
MODEL_SERVER_NAME, e
);
}
};
token_id

View file

@ -104,7 +104,7 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_http_call(Some("model_server"), None, None, None, None)
.expect_http_call(Some("archgw_model_server"), None, None, None, None)
.returning(Some(1))
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_increment("active_http_calls", 1)
@ -136,7 +136,7 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
.returning(Some(&embeddings_response_buffer))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_http_call(Some("model_server"), None, None, None, None)
.expect_http_call(Some("archgw_model_server"), None, None, None, None)
.returning(Some(2))
.expect_metric_increment("active_http_calls", 1)
.expect_log(Some(LogLevel::Debug), None)
@ -313,7 +313,7 @@ fn successful_request_to_open_ai_chat_completions() {
.returning(Some(chat_completions_request_body))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_http_call(Some("model_server"), None, None, None, None)
.expect_http_call(Some("archgw_model_server"), None, None, None, None)
.returning(Some(4))
.expect_metric_increment("active_http_calls", 1)
.execute_and_expect(ReturnType::Action(Action::Pause))