mirror of
https://github.com/katanemo/plano.git
synced 2026-04-25 00:36:34 +02:00
Add support for local llm (mistral 7b) (#31)
This commit is contained in:
parent
b49fc2f264
commit
445b1ea210
24 changed files with 703 additions and 51 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -7,4 +7,6 @@ grafana-data
|
|||
prom_data
|
||||
.env
|
||||
qdrant_data
|
||||
demos/weather-forecast/generated/
|
||||
generated
|
||||
.DS_Store
|
||||
*.gguf
|
||||
|
|
|
|||
16
chatbot-ui/.vscode/launch.json
vendored
Normal file
16
chatbot-ui/.vscode/launch.json
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "chatbot-ui",
|
||||
"cwd": "${workspaceFolder}/app",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "run.py",
|
||||
"console": "integratedTerminal",
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -19,18 +19,21 @@ class Message(BaseModel):
|
|||
role: str
|
||||
content: str
|
||||
|
||||
async def make_completion(messages:List[Message], nb_retries:int=3, delay:int=30) -> Optional[str]:
|
||||
async def make_completion(messages:List[Message], nb_retries:int=3, delay:int=120) -> Optional[str]:
|
||||
"""
|
||||
Sends a request to the ChatGPT API to retrieve a response based on a list of previous messages.
|
||||
"""
|
||||
header = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {OPENAI_API_KEY}"
|
||||
}
|
||||
|
||||
if OPENAI_API_KEY is not None and OPENAI_API_KEY != "":
|
||||
header["Authorization"] = f"Bearer {OPENAI_API_KEY}"
|
||||
|
||||
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
|
||||
logger.error("No OpenAI API Key found. Please create .env file and set OPENAI_API_KEY env var !")
|
||||
return None
|
||||
if CHAT_COMPLETION_ENDPOINT.startswith("https://api.openai.com"):
|
||||
logger.error("No OpenAI API Key found. Please create .env file and set OPENAI_API_KEY env var !")
|
||||
return None
|
||||
try:
|
||||
async with async_timeout.timeout(delay=delay):
|
||||
async with httpx.AsyncClient(headers=header) as aio_client:
|
||||
|
|
@ -44,7 +47,8 @@ async def make_completion(messages:List[Message], nb_retries:int=3, delay:int=30
|
|||
json = {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": messages
|
||||
}
|
||||
},
|
||||
timeout=delay
|
||||
)
|
||||
logger.debug(f"Status Code : {resp.status_code}")
|
||||
if resp.status_code == 200:
|
||||
|
|
@ -66,7 +70,8 @@ async def predict(input, history):
|
|||
"""
|
||||
history.append({"role": "user", "content": input})
|
||||
response = await make_completion(history)
|
||||
history.append({"role": "assistant", "content": response})
|
||||
if response is not None:
|
||||
history.append({"role": "assistant", "content": response})
|
||||
messages = [(history[i]["content"], history[i+1]["content"]) for i in range(0, len(history)-1, 2)]
|
||||
return messages, history
|
||||
|
||||
|
|
|
|||
22
demos/weather-forecast-local-llm/README.md
Normal file
22
demos/weather-forecast-local-llm/README.md
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
# Weather forecasting
|
||||
This demo shows how you can use intelligent prompt gateway to provide realtime weather forecast using Mistral LLM locally hosted using llama.cpp as LLM Hosting Service.
|
||||
|
||||
# Starting the demo
|
||||
1. Ensure that submodule is up to date
|
||||
```sh
|
||||
git submodule sync --recursive
|
||||
```
|
||||
1. Download mistral 7b model using following shell command
|
||||
```sh
|
||||
sh download_mistral_7b.sh
|
||||
```
|
||||
2. Start services
|
||||
```sh
|
||||
docker compose up
|
||||
```
|
||||
3. Navigate to http://localhost:18080/
|
||||
4. You can type in queries like "how is the weather in Seattle"
|
||||
1. You can also ask follow up questions like "show me sunny days"
|
||||
5. To see metrics navigate to "http://localhost:3000/" (use admin/grafana for login)
|
||||
1. Open up dashboard named "Intelligent Gateway Overview"
|
||||
2. On this dashboard you can see request latency and number of requests
|
||||
88
demos/weather-forecast-local-llm/docker-compose.yaml
Normal file
88
demos/weather-forecast-local-llm/docker-compose.yaml
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
services:
|
||||
config-generator:
|
||||
build:
|
||||
context: ../../
|
||||
dockerfile: config_generator/Dockerfile
|
||||
volumes:
|
||||
- ./katanemo-config.yaml:/usr/src/app/katanemo-config.yaml
|
||||
- ./generated:/usr/src/app/out
|
||||
envoy:
|
||||
build:
|
||||
context: ../../
|
||||
dockerfile: envoyfilter/Dockerfile
|
||||
hostname: envoy
|
||||
ports:
|
||||
- "10000:10000"
|
||||
- "19901:9901"
|
||||
volumes:
|
||||
- ./generated/envoy.yaml:/etc/envoy/envoy.yaml
|
||||
- /etc/ssl/cert.pem:/etc/ssl/cert.pem
|
||||
depends_on:
|
||||
config-generator:
|
||||
condition: service_completed_successfully
|
||||
embeddingserver:
|
||||
condition: service_healthy
|
||||
|
||||
embeddingserver:
|
||||
build:
|
||||
context: ../../embedding-server
|
||||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18081:80"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl" ,"http://localhost:80/healthz"]
|
||||
interval: 5s
|
||||
retries: 20
|
||||
volumes:
|
||||
- ~/.cache/huggingface:/root/.cache/huggingface
|
||||
qdrant:
|
||||
image: qdrant/qdrant
|
||||
hostname: vector-db
|
||||
ports:
|
||||
- 16333:6333
|
||||
- 16334:6334
|
||||
|
||||
chatbot-ui:
|
||||
build:
|
||||
context: ../../chatbot-ui
|
||||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18080:8080"
|
||||
environment:
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
- CHAT_COMPLETION_ENDPOINT=http://envoy:10000/v1/chat/completions
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus
|
||||
container_name: prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yaml'
|
||||
ports:
|
||||
- 9090:9090
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
- ./prom_data:/prometheus
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana
|
||||
container_name: grafana
|
||||
ports:
|
||||
- 3000:3000
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
- GF_SECURITY_ADMIN_PASSWORD=grafana
|
||||
volumes:
|
||||
- ./grafana:/etc/grafana/provisioning/datasources
|
||||
- ./grafana/dashboard.yaml:/etc/grafana/provisioning/dashboards/main.yaml
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
|
||||
mistral_7b_instruct:
|
||||
image: ghcr.io/ggerganov/llama.cpp:server
|
||||
hostname: mistral_7b_instruct
|
||||
ports:
|
||||
- "10001:10001"
|
||||
volumes:
|
||||
- ./mistral-7b-instruct-v0.2.Q4_K_M.gguf:/models/model.gguf
|
||||
command: ["--host", "0.0.0.0", "--port", "10001", "-m", "/models/model.gguf"]
|
||||
1
demos/weather-forecast-local-llm/download_mistral_7b.sh
Normal file
1
demos/weather-forecast-local-llm/download_mistral_7b.sh
Normal file
|
|
@ -0,0 +1 @@
|
|||
huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
|
||||
12
demos/weather-forecast-local-llm/grafana/dashboard.yaml
Normal file
12
demos/weather-forecast-local-llm/grafana/dashboard.yaml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: "Dashboard provider"
|
||||
orgId: 1
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: false
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
foldersFromFilesStructure: true
|
||||
|
|
@ -0,0 +1,355 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "avg(rate(envoy_cluster_internal_upstream_rq_time_sum[1m]) / rate(envoy_cluster_internal_upstream_rq_time_count[1m])) by (envoy_cluster_name)",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "request latency - internal (ms)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "avg(rate(envoy_cluster_external_upstream_rq_time_sum[1m]) / rate(envoy_cluster_external_upstream_rq_time_count[1m])) by (envoy_cluster_name)",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "request latency - external (ms)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "avg(rate(envoy_cluster_internal_upstream_rq_completed[1m])) by (envoy_cluster_name)",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "avg(rate(envoy_cluster_external_upstream_rq_completed[1m])) by (envoy_cluster_name)",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "B",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "Upstream request count",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Intelligent Gateway Overview",
|
||||
"uid": "adt6uhx5lk8aob",
|
||||
"version": 3,
|
||||
"weekStart": ""
|
||||
}
|
||||
9
demos/weather-forecast-local-llm/grafana/datasource.yaml
Normal file
9
demos/weather-forecast-local-llm/grafana/datasource.yaml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
access: proxy
|
||||
editable: true
|
||||
46
demos/weather-forecast-local-llm/katanemo-config.yaml
Normal file
46
demos/weather-forecast-local-llm/katanemo-config.yaml
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
default_prompt_endpoint: "127.0.0.1"
|
||||
load_balancing: "round_robin"
|
||||
timeout_ms: 5000
|
||||
|
||||
embedding_provider:
|
||||
name: "SentenceTransformer"
|
||||
model: "all-MiniLM-L6-v2"
|
||||
|
||||
llm_providers:
|
||||
|
||||
- name: open-ai-gpt-4
|
||||
api_key: "$OPEN_AI_API_KEY"
|
||||
model: gpt-4
|
||||
|
||||
- name: mistral_7b_instruct
|
||||
model: mistral-7b-instruct
|
||||
endpoint: http://mistral_7b_instruct:10001/v1/chat/completions
|
||||
default: true
|
||||
|
||||
prompt_targets:
|
||||
|
||||
- type: context_resolver
|
||||
name: weather_forecast
|
||||
few_shot_examples:
|
||||
- what is the weather in New York?
|
||||
- how is the weather in San Francisco?
|
||||
- what is the forecast in Chicago?
|
||||
entities:
|
||||
- name: city
|
||||
required: true
|
||||
- name: days
|
||||
endpoint:
|
||||
cluster: weatherhost
|
||||
path: /weather
|
||||
system_prompt: |
|
||||
You are a helpful weather forecaster. Use weather data that is provided to you. Please follow these guidelines when responding to user queries:
|
||||
- Use Fahrenheit for temperature
|
||||
- Use miles per hour for wind speed
|
||||
|
||||
#TODO: add support for adding custom clusters e.g.
|
||||
# clusters:
|
||||
# qdrant:
|
||||
# options:
|
||||
# - address: "qdrant"
|
||||
# - address: "weatherhost"
|
||||
# - port: 6333
|
||||
23
demos/weather-forecast-local-llm/prometheus/prometheus.yaml
Normal file
23
demos/weather-forecast-local-llm/prometheus/prometheus.yaml
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
global:
|
||||
scrape_interval: 15s
|
||||
scrape_timeout: 10s
|
||||
evaluation_interval: 15s
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: []
|
||||
scheme: http
|
||||
timeout: 10s
|
||||
api_version: v1
|
||||
scrape_configs:
|
||||
- job_name: envoy
|
||||
honor_timestamps: true
|
||||
scrape_interval: 15s
|
||||
scrape_timeout: 10s
|
||||
metrics_path: /stats
|
||||
scheme: http
|
||||
static_configs:
|
||||
- targets:
|
||||
- envoy:9901
|
||||
params:
|
||||
format: ['prometheus']
|
||||
|
|
@ -4,12 +4,12 @@ This demo shows how you can use intelligent prompt gateway to provide realtime w
|
|||
# Starting the demo
|
||||
1. Ensure that submodule is up to date
|
||||
```sh
|
||||
$ git submodule sync --recursive
|
||||
git submodule sync --recursive
|
||||
```
|
||||
1. Create `.env` file and set OpenAI key using env var `OPENAI_API_KEY`
|
||||
1. Start services
|
||||
```sh
|
||||
$ docker compose up
|
||||
docker compose up
|
||||
```
|
||||
1. Navigate to http://localhost:18080/
|
||||
1. You can type in queries like "how is the weather in Seattle"
|
||||
|
|
|
|||
|
|
@ -17,8 +17,6 @@ services:
|
|||
volumes:
|
||||
- ./generated/envoy.yaml:/etc/envoy/envoy.yaml
|
||||
- /etc/ssl/cert.pem:/etc/ssl/cert.pem
|
||||
networks:
|
||||
- envoymesh
|
||||
depends_on:
|
||||
config-generator:
|
||||
condition: service_completed_successfully
|
||||
|
|
@ -35,17 +33,14 @@ services:
|
|||
test: ["CMD", "curl" ,"http://localhost:80/healthz"]
|
||||
interval: 5s
|
||||
retries: 20
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
volumes:
|
||||
- ~/.cache/huggingface:/root/.cache/huggingface
|
||||
qdrant:
|
||||
image: qdrant/qdrant
|
||||
hostname: vector-db
|
||||
ports:
|
||||
- 16333:6333
|
||||
- 16334:6334
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
chatbot-ui:
|
||||
build:
|
||||
|
|
@ -53,8 +48,6 @@ services:
|
|||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18080:8080"
|
||||
networks:
|
||||
- envoymesh
|
||||
environment:
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
- CHAT_COMPLETION_ENDPOINT=http://envoy:10000/v1/chat/completions
|
||||
|
|
@ -70,8 +63,6 @@ services:
|
|||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
- ./prom_data:/prometheus
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana
|
||||
|
|
@ -86,9 +77,3 @@ services:
|
|||
- ./grafana:/etc/grafana/provisioning/datasources
|
||||
- ./grafana/dashboard.yaml:/etc/grafana/provisioning/dashboards/main.yaml
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
# - ./grafana-data:/var/lib/grafana
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
networks:
|
||||
envoymesh: {}
|
||||
|
|
|
|||
|
|
@ -8,9 +8,10 @@ embedding_provider:
|
|||
|
||||
llm_providers:
|
||||
|
||||
- name: "open-ai-gpt-4"
|
||||
api_key: "$OPEN_AI_API_KEY"
|
||||
- name: open-ai-gpt-4
|
||||
api_key: $OPEN_AI_API_KEY
|
||||
model: gpt-4
|
||||
default: true
|
||||
|
||||
prompt_targets:
|
||||
|
||||
|
|
|
|||
16
embedding-server/.vscode/launch.json
vendored
Normal file
16
embedding-server/.vscode/launch.json
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "embedding server",
|
||||
"cwd": "${workspaceFolder}/app",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "uvicorn",
|
||||
"args": ["main:app","--reload", "--port", "8000"],
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -35,9 +35,11 @@ RUN apt-get update && apt-get install -y \
|
|||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN python install.py
|
||||
# comment it out for now as we don't want to download the model every time we build the image
|
||||
# we will mount host cache to docker image to avoid downloading the model every time
|
||||
# see docker-compose file for more details
|
||||
|
||||
# RUN python install.py && \
|
||||
# find /root/.cache/torch/sentence_transformers/ -name onnx -exec rm -rf {} +
|
||||
|
||||
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"]
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import os
|
|||
import sentence_transformers
|
||||
from gliner import GLiNER
|
||||
|
||||
def load_transformers(models = os.getenv("MODELS", "sentence-transformers/all-MiniLM-L6-v2")):
|
||||
def load_transformers(models = os.getenv("MODELS", "BAAI/bge-large-en-v1.5")):
|
||||
transformers = {}
|
||||
|
||||
for model in models.split(','):
|
||||
|
|
|
|||
|
|
@ -9,12 +9,13 @@ services:
|
|||
- ./envoy.yaml:/etc/envoy/envoy.yaml
|
||||
- ./target/wasm32-wasi/release:/etc/envoy/proxy-wasm-plugins
|
||||
- /etc/ssl/cert.pem:/etc/ssl/cert.pem
|
||||
networks:
|
||||
- envoymesh
|
||||
depends_on:
|
||||
qdrant:
|
||||
condition: service_started
|
||||
embeddingserver:
|
||||
condition: service_healthy
|
||||
|
||||
|
||||
embeddingserver:
|
||||
build:
|
||||
context: ../embedding-server
|
||||
|
|
@ -25,8 +26,6 @@ services:
|
|||
test: ["CMD", "curl" ,"http://localhost:80/healthz"]
|
||||
interval: 5s
|
||||
retries: 20
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
qdrant:
|
||||
image: qdrant/qdrant
|
||||
|
|
@ -34,10 +33,12 @@ services:
|
|||
ports:
|
||||
- 16333:6333
|
||||
- 16334:6334
|
||||
volumes:
|
||||
- ./qdrant_data:/qdrant/storage
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
networks:
|
||||
envoymesh: {}
|
||||
chatbot-ui:
|
||||
build:
|
||||
context: ../chatbot-ui
|
||||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18080:8080"
|
||||
environment:
|
||||
- CHAT_COMPLETION_ENDPOINT=http://envoy:10000/v1/chat/completions
|
||||
|
|
|
|||
1
envoyfilter/download_mistral_7b.sh
Normal file
1
envoyfilter/download_mistral_7b.sh
Normal file
|
|
@ -0,0 +1 @@
|
|||
huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
|
||||
|
|
@ -34,9 +34,19 @@ static_resources:
|
|||
routes:
|
||||
- match:
|
||||
prefix: "/v1/chat/completions"
|
||||
headers:
|
||||
name: "Authorization"
|
||||
present_match: true
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: openai
|
||||
timeout: 60s
|
||||
- match:
|
||||
prefix: "/v1/chat/completions"
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: mistral_7b_instruct
|
||||
timeout: 60s
|
||||
- match:
|
||||
prefix: "/embeddings"
|
||||
route:
|
||||
|
|
@ -156,3 +166,17 @@ static_resources:
|
|||
address: qdrant
|
||||
port_value: 6333
|
||||
hostname: "qdrant"
|
||||
- name: mistral_7b_instruct
|
||||
connect_timeout: 5s
|
||||
type: STRICT_DNS
|
||||
lb_policy: ROUND_ROBIN
|
||||
load_assignment:
|
||||
cluster_name: qdrant
|
||||
endpoints:
|
||||
- lb_endpoints:
|
||||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: mistral_7b_instruct
|
||||
port_value: 10001
|
||||
hostname: "mistral_7b_instruct"
|
||||
|
|
|
|||
|
|
@ -28,15 +28,26 @@ static_resources:
|
|||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: openai
|
||||
timeout: 60s
|
||||
- name: local_service
|
||||
domains:
|
||||
- "*"
|
||||
routes:
|
||||
- match:
|
||||
prefix: "/v1/chat/completions"
|
||||
headers:
|
||||
name: "Authorization"
|
||||
present_match: true
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: openai
|
||||
timeout: 60s
|
||||
- match:
|
||||
prefix: "/v1/chat/completions"
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: mistral_7b_instruct
|
||||
timeout: 60s
|
||||
- match:
|
||||
prefix: "/embeddings"
|
||||
route:
|
||||
|
|
@ -68,10 +79,16 @@ static_resources:
|
|||
|
||||
llm_providers:
|
||||
|
||||
- name: "open-ai-gpt-4"
|
||||
- name: open-ai-gpt-4
|
||||
api_key: "$OPEN_AI_API_KEY"
|
||||
model: gpt-4
|
||||
|
||||
- name: mistral_7b_instruct
|
||||
model: mistral-7b-instruct
|
||||
endpoint: http://mistral_7b_instruct:10001/v1/chat/completions
|
||||
default: true
|
||||
|
||||
|
||||
prompt_targets:
|
||||
|
||||
- type: context_resolver
|
||||
|
|
@ -131,7 +148,6 @@ static_resources:
|
|||
tls_params:
|
||||
tls_minimum_protocol_version: TLSv1_2
|
||||
tls_maximum_protocol_version: TLSv1_3
|
||||
|
||||
- name: embeddingserver
|
||||
connect_timeout: 5s
|
||||
type: STRICT_DNS
|
||||
|
|
@ -143,8 +159,8 @@ static_resources:
|
|||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: embeddingserver
|
||||
port_value: 80
|
||||
address: host.docker.internal
|
||||
port_value: 8000
|
||||
hostname: "embeddingserver"
|
||||
- name: weatherhost
|
||||
connect_timeout: 5s
|
||||
|
|
@ -157,8 +173,8 @@ static_resources:
|
|||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: embeddingserver
|
||||
port_value: 80
|
||||
address: host.docker.internal
|
||||
port_value: 8000
|
||||
hostname: "embeddingserver"
|
||||
- name: nerhost
|
||||
connect_timeout: 5s
|
||||
|
|
@ -171,8 +187,8 @@ static_resources:
|
|||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: embeddingserver
|
||||
port_value: 80
|
||||
address: host.docker.internal
|
||||
port_value: 8000
|
||||
hostname: "embeddingserver"
|
||||
- name: qdrant
|
||||
connect_timeout: 5s
|
||||
|
|
@ -188,3 +204,17 @@ static_resources:
|
|||
address: qdrant
|
||||
port_value: 6333
|
||||
hostname: "qdrant"
|
||||
- name: mistral_7b_instruct
|
||||
connect_timeout: 5s
|
||||
type: STRICT_DNS
|
||||
lb_policy: ROUND_ROBIN
|
||||
load_assignment:
|
||||
cluster_name: qdrant
|
||||
endpoints:
|
||||
- lb_endpoints:
|
||||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: mistral_7b_instruct
|
||||
port_value: 10001
|
||||
hostname: "mistral_7b_instruct"
|
||||
|
|
|
|||
|
|
@ -30,8 +30,16 @@ pub struct EmbeddingProviver {
|
|||
//TODO: use enum for model, but if there is a new model, we need to update the code
|
||||
pub struct LlmProvider {
|
||||
pub name: String,
|
||||
pub api_key: String,
|
||||
pub api_key: Option<String>,
|
||||
pub model: String,
|
||||
pub default: Option<bool>,
|
||||
pub endpoint: Option<EnpointType>,
|
||||
}
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum EnpointType {
|
||||
String(String),
|
||||
Struct(Endpoint),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ impl StreamContext {
|
|||
// However, a missing Content-Length header is not grounds for bad requests given that intermediary hops could
|
||||
// manipulate the body in benign ways e.g., compression.
|
||||
self.set_http_request_header("content-length", None);
|
||||
// self.set_http_request_header("authorization", None);
|
||||
}
|
||||
|
||||
fn modify_path_header(&mut self) {
|
||||
|
|
@ -330,7 +331,7 @@ impl StreamContext {
|
|||
return;
|
||||
}
|
||||
};
|
||||
info!("sending request to openai: msg len: {}", json_string.len());
|
||||
info!("sending request to openai: msg {}", json_string);
|
||||
self.set_http_request_body(0, json_string.len(), &json_string.into_bytes());
|
||||
self.resume_http_request();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,6 +23,10 @@
|
|||
{
|
||||
"name": "demos/weather-forecast",
|
||||
"path": "./demos/weather-forecast",
|
||||
},
|
||||
{
|
||||
"name": "demos/weather-forecast-local-llm",
|
||||
"path": "./demos/weather-forecast-local-llm",
|
||||
}
|
||||
],
|
||||
"settings": {}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue