mirror of
https://github.com/katanemo/plano.git
synced 2026-04-25 00:36:34 +02:00
Add support for local llm (mistral 7b) (#31)
This commit is contained in:
parent
b49fc2f264
commit
445b1ea210
24 changed files with 703 additions and 51 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -7,4 +7,6 @@ grafana-data
|
|||
prom_data
|
||||
.env
|
||||
qdrant_data
|
||||
demos/weather-forecast/generated/
|
||||
generated
|
||||
.DS_Store
|
||||
*.gguf
|
||||
|
|
|
|||
16
chatbot-ui/.vscode/launch.json
vendored
Normal file
16
chatbot-ui/.vscode/launch.json
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "chatbot-ui",
|
||||
"cwd": "${workspaceFolder}/app",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "run.py",
|
||||
"console": "integratedTerminal",
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -19,18 +19,21 @@ class Message(BaseModel):
|
|||
role: str
|
||||
content: str
|
||||
|
||||
async def make_completion(messages:List[Message], nb_retries:int=3, delay:int=30) -> Optional[str]:
|
||||
async def make_completion(messages:List[Message], nb_retries:int=3, delay:int=120) -> Optional[str]:
|
||||
"""
|
||||
Sends a request to the ChatGPT API to retrieve a response based on a list of previous messages.
|
||||
"""
|
||||
header = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {OPENAI_API_KEY}"
|
||||
}
|
||||
|
||||
if OPENAI_API_KEY is not None and OPENAI_API_KEY != "":
|
||||
header["Authorization"] = f"Bearer {OPENAI_API_KEY}"
|
||||
|
||||
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
|
||||
logger.error("No OpenAI API Key found. Please create .env file and set OPENAI_API_KEY env var !")
|
||||
return None
|
||||
if CHAT_COMPLETION_ENDPOINT.startswith("https://api.openai.com"):
|
||||
logger.error("No OpenAI API Key found. Please create .env file and set OPENAI_API_KEY env var !")
|
||||
return None
|
||||
try:
|
||||
async with async_timeout.timeout(delay=delay):
|
||||
async with httpx.AsyncClient(headers=header) as aio_client:
|
||||
|
|
@ -44,7 +47,8 @@ async def make_completion(messages:List[Message], nb_retries:int=3, delay:int=30
|
|||
json = {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": messages
|
||||
}
|
||||
},
|
||||
timeout=delay
|
||||
)
|
||||
logger.debug(f"Status Code : {resp.status_code}")
|
||||
if resp.status_code == 200:
|
||||
|
|
@ -66,7 +70,8 @@ async def predict(input, history):
|
|||
"""
|
||||
history.append({"role": "user", "content": input})
|
||||
response = await make_completion(history)
|
||||
history.append({"role": "assistant", "content": response})
|
||||
if response is not None:
|
||||
history.append({"role": "assistant", "content": response})
|
||||
messages = [(history[i]["content"], history[i+1]["content"]) for i in range(0, len(history)-1, 2)]
|
||||
return messages, history
|
||||
|
||||
|
|
|
|||
22
demos/weather-forecast-local-llm/README.md
Normal file
22
demos/weather-forecast-local-llm/README.md
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
# Weather forecasting
|
||||
This demo shows how you can use intelligent prompt gateway to provide realtime weather forecast using Mistral LLM locally hosted using llama.cpp as LLM Hosting Service.
|
||||
|
||||
# Starting the demo
|
||||
1. Ensure that submodule is up to date
|
||||
```sh
|
||||
git submodule sync --recursive
|
||||
```
|
||||
1. Download mistral 7b model using following shell command
|
||||
```sh
|
||||
sh download_mistral_7b.sh
|
||||
```
|
||||
2. Start services
|
||||
```sh
|
||||
docker compose up
|
||||
```
|
||||
3. Navigate to http://localhost:18080/
|
||||
4. You can type in queries like "how is the weather in Seattle"
|
||||
1. You can also ask follow up questions like "show me sunny days"
|
||||
5. To see metrics navigate to "http://localhost:3000/" (use admin/grafana for login)
|
||||
1. Open up dashboard named "Intelligent Gateway Overview"
|
||||
2. On this dashboard you can see request latency and number of requests
|
||||
88
demos/weather-forecast-local-llm/docker-compose.yaml
Normal file
88
demos/weather-forecast-local-llm/docker-compose.yaml
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
services:
|
||||
config-generator:
|
||||
build:
|
||||
context: ../../
|
||||
dockerfile: config_generator/Dockerfile
|
||||
volumes:
|
||||
- ./katanemo-config.yaml:/usr/src/app/katanemo-config.yaml
|
||||
- ./generated:/usr/src/app/out
|
||||
envoy:
|
||||
build:
|
||||
context: ../../
|
||||
dockerfile: envoyfilter/Dockerfile
|
||||
hostname: envoy
|
||||
ports:
|
||||
- "10000:10000"
|
||||
- "19901:9901"
|
||||
volumes:
|
||||
- ./generated/envoy.yaml:/etc/envoy/envoy.yaml
|
||||
- /etc/ssl/cert.pem:/etc/ssl/cert.pem
|
||||
depends_on:
|
||||
config-generator:
|
||||
condition: service_completed_successfully
|
||||
embeddingserver:
|
||||
condition: service_healthy
|
||||
|
||||
embeddingserver:
|
||||
build:
|
||||
context: ../../embedding-server
|
||||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18081:80"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl" ,"http://localhost:80/healthz"]
|
||||
interval: 5s
|
||||
retries: 20
|
||||
volumes:
|
||||
- ~/.cache/huggingface:/root/.cache/huggingface
|
||||
qdrant:
|
||||
image: qdrant/qdrant
|
||||
hostname: vector-db
|
||||
ports:
|
||||
- 16333:6333
|
||||
- 16334:6334
|
||||
|
||||
chatbot-ui:
|
||||
build:
|
||||
context: ../../chatbot-ui
|
||||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18080:8080"
|
||||
environment:
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
- CHAT_COMPLETION_ENDPOINT=http://envoy:10000/v1/chat/completions
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus
|
||||
container_name: prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yaml'
|
||||
ports:
|
||||
- 9090:9090
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
- ./prom_data:/prometheus
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana
|
||||
container_name: grafana
|
||||
ports:
|
||||
- 3000:3000
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
- GF_SECURITY_ADMIN_PASSWORD=grafana
|
||||
volumes:
|
||||
- ./grafana:/etc/grafana/provisioning/datasources
|
||||
- ./grafana/dashboard.yaml:/etc/grafana/provisioning/dashboards/main.yaml
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
|
||||
mistral_7b_instruct:
|
||||
image: ghcr.io/ggerganov/llama.cpp:server
|
||||
hostname: mistral_7b_instruct
|
||||
ports:
|
||||
- "10001:10001"
|
||||
volumes:
|
||||
- ./mistral-7b-instruct-v0.2.Q4_K_M.gguf:/models/model.gguf
|
||||
command: ["--host", "0.0.0.0", "--port", "10001", "-m", "/models/model.gguf"]
|
||||
1
demos/weather-forecast-local-llm/download_mistral_7b.sh
Normal file
1
demos/weather-forecast-local-llm/download_mistral_7b.sh
Normal file
|
|
@ -0,0 +1 @@
|
|||
huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
|
||||
12
demos/weather-forecast-local-llm/grafana/dashboard.yaml
Normal file
12
demos/weather-forecast-local-llm/grafana/dashboard.yaml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: "Dashboard provider"
|
||||
orgId: 1
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: false
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
foldersFromFilesStructure: true
|
||||
|
|
@ -0,0 +1,355 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "avg(rate(envoy_cluster_internal_upstream_rq_time_sum[1m]) / rate(envoy_cluster_internal_upstream_rq_time_count[1m])) by (envoy_cluster_name)",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "request latency - internal (ms)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "avg(rate(envoy_cluster_external_upstream_rq_time_sum[1m]) / rate(envoy_cluster_external_upstream_rq_time_count[1m])) by (envoy_cluster_name)",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "request latency - external (ms)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "avg(rate(envoy_cluster_internal_upstream_rq_completed[1m])) by (envoy_cluster_name)",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "avg(rate(envoy_cluster_external_upstream_rq_completed[1m])) by (envoy_cluster_name)",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "B",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "Upstream request count",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Intelligent Gateway Overview",
|
||||
"uid": "adt6uhx5lk8aob",
|
||||
"version": 3,
|
||||
"weekStart": ""
|
||||
}
|
||||
9
demos/weather-forecast-local-llm/grafana/datasource.yaml
Normal file
9
demos/weather-forecast-local-llm/grafana/datasource.yaml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
access: proxy
|
||||
editable: true
|
||||
46
demos/weather-forecast-local-llm/katanemo-config.yaml
Normal file
46
demos/weather-forecast-local-llm/katanemo-config.yaml
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
default_prompt_endpoint: "127.0.0.1"
|
||||
load_balancing: "round_robin"
|
||||
timeout_ms: 5000
|
||||
|
||||
embedding_provider:
|
||||
name: "SentenceTransformer"
|
||||
model: "all-MiniLM-L6-v2"
|
||||
|
||||
llm_providers:
|
||||
|
||||
- name: open-ai-gpt-4
|
||||
api_key: "$OPEN_AI_API_KEY"
|
||||
model: gpt-4
|
||||
|
||||
- name: mistral_7b_instruct
|
||||
model: mistral-7b-instruct
|
||||
endpoint: http://mistral_7b_instruct:10001/v1/chat/completions
|
||||
default: true
|
||||
|
||||
prompt_targets:
|
||||
|
||||
- type: context_resolver
|
||||
name: weather_forecast
|
||||
few_shot_examples:
|
||||
- what is the weather in New York?
|
||||
- how is the weather in San Francisco?
|
||||
- what is the forecast in Chicago?
|
||||
entities:
|
||||
- name: city
|
||||
required: true
|
||||
- name: days
|
||||
endpoint:
|
||||
cluster: weatherhost
|
||||
path: /weather
|
||||
system_prompt: |
|
||||
You are a helpful weather forecaster. Use weather data that is provided to you. Please follow these guidelines when responding to user queries:
|
||||
- Use Fahrenheit for temperature
|
||||
- Use miles per hour for wind speed
|
||||
|
||||
#TODO: add support for adding custom clusters e.g.
|
||||
# clusters:
|
||||
# qdrant:
|
||||
# options:
|
||||
# - address: "qdrant"
|
||||
# - address: "weatherhost"
|
||||
# - port: 6333
|
||||
23
demos/weather-forecast-local-llm/prometheus/prometheus.yaml
Normal file
23
demos/weather-forecast-local-llm/prometheus/prometheus.yaml
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
global:
|
||||
scrape_interval: 15s
|
||||
scrape_timeout: 10s
|
||||
evaluation_interval: 15s
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: []
|
||||
scheme: http
|
||||
timeout: 10s
|
||||
api_version: v1
|
||||
scrape_configs:
|
||||
- job_name: envoy
|
||||
honor_timestamps: true
|
||||
scrape_interval: 15s
|
||||
scrape_timeout: 10s
|
||||
metrics_path: /stats
|
||||
scheme: http
|
||||
static_configs:
|
||||
- targets:
|
||||
- envoy:9901
|
||||
params:
|
||||
format: ['prometheus']
|
||||
|
|
@ -4,12 +4,12 @@ This demo shows how you can use intelligent prompt gateway to provide realtime w
|
|||
# Starting the demo
|
||||
1. Ensure that submodule is up to date
|
||||
```sh
|
||||
$ git submodule sync --recursive
|
||||
git submodule sync --recursive
|
||||
```
|
||||
1. Create `.env` file and set OpenAI key using env var `OPENAI_API_KEY`
|
||||
1. Start services
|
||||
```sh
|
||||
$ docker compose up
|
||||
docker compose up
|
||||
```
|
||||
1. Navigate to http://localhost:18080/
|
||||
1. You can type in queries like "how is the weather in Seattle"
|
||||
|
|
|
|||
|
|
@ -17,8 +17,6 @@ services:
|
|||
volumes:
|
||||
- ./generated/envoy.yaml:/etc/envoy/envoy.yaml
|
||||
- /etc/ssl/cert.pem:/etc/ssl/cert.pem
|
||||
networks:
|
||||
- envoymesh
|
||||
depends_on:
|
||||
config-generator:
|
||||
condition: service_completed_successfully
|
||||
|
|
@ -35,17 +33,14 @@ services:
|
|||
test: ["CMD", "curl" ,"http://localhost:80/healthz"]
|
||||
interval: 5s
|
||||
retries: 20
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
volumes:
|
||||
- ~/.cache/huggingface:/root/.cache/huggingface
|
||||
qdrant:
|
||||
image: qdrant/qdrant
|
||||
hostname: vector-db
|
||||
ports:
|
||||
- 16333:6333
|
||||
- 16334:6334
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
chatbot-ui:
|
||||
build:
|
||||
|
|
@ -53,8 +48,6 @@ services:
|
|||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18080:8080"
|
||||
networks:
|
||||
- envoymesh
|
||||
environment:
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
- CHAT_COMPLETION_ENDPOINT=http://envoy:10000/v1/chat/completions
|
||||
|
|
@ -70,8 +63,6 @@ services:
|
|||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
- ./prom_data:/prometheus
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana
|
||||
|
|
@ -86,9 +77,3 @@ services:
|
|||
- ./grafana:/etc/grafana/provisioning/datasources
|
||||
- ./grafana/dashboard.yaml:/etc/grafana/provisioning/dashboards/main.yaml
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
# - ./grafana-data:/var/lib/grafana
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
networks:
|
||||
envoymesh: {}
|
||||
|
|
|
|||
|
|
@ -8,9 +8,10 @@ embedding_provider:
|
|||
|
||||
llm_providers:
|
||||
|
||||
- name: "open-ai-gpt-4"
|
||||
api_key: "$OPEN_AI_API_KEY"
|
||||
- name: open-ai-gpt-4
|
||||
api_key: $OPEN_AI_API_KEY
|
||||
model: gpt-4
|
||||
default: true
|
||||
|
||||
prompt_targets:
|
||||
|
||||
|
|
|
|||
16
embedding-server/.vscode/launch.json
vendored
Normal file
16
embedding-server/.vscode/launch.json
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "embedding server",
|
||||
"cwd": "${workspaceFolder}/app",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "uvicorn",
|
||||
"args": ["main:app","--reload", "--port", "8000"],
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -35,9 +35,11 @@ RUN apt-get update && apt-get install -y \
|
|||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN python install.py
|
||||
# comment it out for now as we don't want to download the model every time we build the image
|
||||
# we will mount host cache to docker image to avoid downloading the model every time
|
||||
# see docker-compose file for more details
|
||||
|
||||
# RUN python install.py && \
|
||||
# find /root/.cache/torch/sentence_transformers/ -name onnx -exec rm -rf {} +
|
||||
|
||||
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"]
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import os
|
|||
import sentence_transformers
|
||||
from gliner import GLiNER
|
||||
|
||||
def load_transformers(models = os.getenv("MODELS", "sentence-transformers/all-MiniLM-L6-v2")):
|
||||
def load_transformers(models = os.getenv("MODELS", "BAAI/bge-large-en-v1.5")):
|
||||
transformers = {}
|
||||
|
||||
for model in models.split(','):
|
||||
|
|
|
|||
|
|
@ -9,12 +9,13 @@ services:
|
|||
- ./envoy.yaml:/etc/envoy/envoy.yaml
|
||||
- ./target/wasm32-wasi/release:/etc/envoy/proxy-wasm-plugins
|
||||
- /etc/ssl/cert.pem:/etc/ssl/cert.pem
|
||||
networks:
|
||||
- envoymesh
|
||||
depends_on:
|
||||
qdrant:
|
||||
condition: service_started
|
||||
embeddingserver:
|
||||
condition: service_healthy
|
||||
|
||||
|
||||
embeddingserver:
|
||||
build:
|
||||
context: ../embedding-server
|
||||
|
|
@ -25,8 +26,6 @@ services:
|
|||
test: ["CMD", "curl" ,"http://localhost:80/healthz"]
|
||||
interval: 5s
|
||||
retries: 20
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
qdrant:
|
||||
image: qdrant/qdrant
|
||||
|
|
@ -34,10 +33,12 @@ services:
|
|||
ports:
|
||||
- 16333:6333
|
||||
- 16334:6334
|
||||
volumes:
|
||||
- ./qdrant_data:/qdrant/storage
|
||||
networks:
|
||||
- envoymesh
|
||||
|
||||
networks:
|
||||
envoymesh: {}
|
||||
chatbot-ui:
|
||||
build:
|
||||
context: ../chatbot-ui
|
||||
dockerfile: Dockerfile
|
||||
ports:
|
||||
- "18080:8080"
|
||||
environment:
|
||||
- CHAT_COMPLETION_ENDPOINT=http://envoy:10000/v1/chat/completions
|
||||
|
|
|
|||
1
envoyfilter/download_mistral_7b.sh
Normal file
1
envoyfilter/download_mistral_7b.sh
Normal file
|
|
@ -0,0 +1 @@
|
|||
huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
|
||||
|
|
@ -34,9 +34,19 @@ static_resources:
|
|||
routes:
|
||||
- match:
|
||||
prefix: "/v1/chat/completions"
|
||||
headers:
|
||||
name: "Authorization"
|
||||
present_match: true
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: openai
|
||||
timeout: 60s
|
||||
- match:
|
||||
prefix: "/v1/chat/completions"
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: mistral_7b_instruct
|
||||
timeout: 60s
|
||||
- match:
|
||||
prefix: "/embeddings"
|
||||
route:
|
||||
|
|
@ -156,3 +166,17 @@ static_resources:
|
|||
address: qdrant
|
||||
port_value: 6333
|
||||
hostname: "qdrant"
|
||||
- name: mistral_7b_instruct
|
||||
connect_timeout: 5s
|
||||
type: STRICT_DNS
|
||||
lb_policy: ROUND_ROBIN
|
||||
load_assignment:
|
||||
cluster_name: qdrant
|
||||
endpoints:
|
||||
- lb_endpoints:
|
||||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: mistral_7b_instruct
|
||||
port_value: 10001
|
||||
hostname: "mistral_7b_instruct"
|
||||
|
|
|
|||
|
|
@ -28,15 +28,26 @@ static_resources:
|
|||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: openai
|
||||
timeout: 60s
|
||||
- name: local_service
|
||||
domains:
|
||||
- "*"
|
||||
routes:
|
||||
- match:
|
||||
prefix: "/v1/chat/completions"
|
||||
headers:
|
||||
name: "Authorization"
|
||||
present_match: true
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: openai
|
||||
timeout: 60s
|
||||
- match:
|
||||
prefix: "/v1/chat/completions"
|
||||
route:
|
||||
auto_host_rewrite: true
|
||||
cluster: mistral_7b_instruct
|
||||
timeout: 60s
|
||||
- match:
|
||||
prefix: "/embeddings"
|
||||
route:
|
||||
|
|
@ -68,10 +79,16 @@ static_resources:
|
|||
|
||||
llm_providers:
|
||||
|
||||
- name: "open-ai-gpt-4"
|
||||
- name: open-ai-gpt-4
|
||||
api_key: "$OPEN_AI_API_KEY"
|
||||
model: gpt-4
|
||||
|
||||
- name: mistral_7b_instruct
|
||||
model: mistral-7b-instruct
|
||||
endpoint: http://mistral_7b_instruct:10001/v1/chat/completions
|
||||
default: true
|
||||
|
||||
|
||||
prompt_targets:
|
||||
|
||||
- type: context_resolver
|
||||
|
|
@ -131,7 +148,6 @@ static_resources:
|
|||
tls_params:
|
||||
tls_minimum_protocol_version: TLSv1_2
|
||||
tls_maximum_protocol_version: TLSv1_3
|
||||
|
||||
- name: embeddingserver
|
||||
connect_timeout: 5s
|
||||
type: STRICT_DNS
|
||||
|
|
@ -143,8 +159,8 @@ static_resources:
|
|||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: embeddingserver
|
||||
port_value: 80
|
||||
address: host.docker.internal
|
||||
port_value: 8000
|
||||
hostname: "embeddingserver"
|
||||
- name: weatherhost
|
||||
connect_timeout: 5s
|
||||
|
|
@ -157,8 +173,8 @@ static_resources:
|
|||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: embeddingserver
|
||||
port_value: 80
|
||||
address: host.docker.internal
|
||||
port_value: 8000
|
||||
hostname: "embeddingserver"
|
||||
- name: nerhost
|
||||
connect_timeout: 5s
|
||||
|
|
@ -171,8 +187,8 @@ static_resources:
|
|||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: embeddingserver
|
||||
port_value: 80
|
||||
address: host.docker.internal
|
||||
port_value: 8000
|
||||
hostname: "embeddingserver"
|
||||
- name: qdrant
|
||||
connect_timeout: 5s
|
||||
|
|
@ -188,3 +204,17 @@ static_resources:
|
|||
address: qdrant
|
||||
port_value: 6333
|
||||
hostname: "qdrant"
|
||||
- name: mistral_7b_instruct
|
||||
connect_timeout: 5s
|
||||
type: STRICT_DNS
|
||||
lb_policy: ROUND_ROBIN
|
||||
load_assignment:
|
||||
cluster_name: qdrant
|
||||
endpoints:
|
||||
- lb_endpoints:
|
||||
- endpoint:
|
||||
address:
|
||||
socket_address:
|
||||
address: mistral_7b_instruct
|
||||
port_value: 10001
|
||||
hostname: "mistral_7b_instruct"
|
||||
|
|
|
|||
|
|
@ -30,8 +30,16 @@ pub struct EmbeddingProviver {
|
|||
//TODO: use enum for model, but if there is a new model, we need to update the code
|
||||
pub struct LlmProvider {
|
||||
pub name: String,
|
||||
pub api_key: String,
|
||||
pub api_key: Option<String>,
|
||||
pub model: String,
|
||||
pub default: Option<bool>,
|
||||
pub endpoint: Option<EnpointType>,
|
||||
}
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum EnpointType {
|
||||
String(String),
|
||||
Struct(Endpoint),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ impl StreamContext {
|
|||
// However, a missing Content-Length header is not grounds for bad requests given that intermediary hops could
|
||||
// manipulate the body in benign ways e.g., compression.
|
||||
self.set_http_request_header("content-length", None);
|
||||
// self.set_http_request_header("authorization", None);
|
||||
}
|
||||
|
||||
fn modify_path_header(&mut self) {
|
||||
|
|
@ -330,7 +331,7 @@ impl StreamContext {
|
|||
return;
|
||||
}
|
||||
};
|
||||
info!("sending request to openai: msg len: {}", json_string.len());
|
||||
info!("sending request to openai: msg {}", json_string);
|
||||
self.set_http_request_body(0, json_string.len(), &json_string.into_bytes());
|
||||
self.resume_http_request();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,6 +23,10 @@
|
|||
{
|
||||
"name": "demos/weather-forecast",
|
||||
"path": "./demos/weather-forecast",
|
||||
},
|
||||
{
|
||||
"name": "demos/weather-forecast-local-llm",
|
||||
"path": "./demos/weather-forecast-local-llm",
|
||||
}
|
||||
],
|
||||
"settings": {}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue