Add function calling support using bolt-fc-1b (#35)

2026-04-28 02:23:56 +02:00 · 2024-09-10 14:24:46 -07:00 · 2024-09-10 14:24:46 -07:00 · 7b5203a2ce
commit 7b5203a2ce
parent fdfad87347
39 changed files with 1763 additions and 416 deletions
--- a/demos/function-calling/Bolt-FC-1B-Q3_K_L.model_file
+++ b/demos/function-calling/Bolt-FC-1B-Q3_K_L.model_file
@ -0,0 +1,25 @@
+FROM Bolt-Function-Calling-1B-Q3_K_L.gguf
+
+# Context window size used when generating the next token.
+# Larger alternative, currently disabled: PARAMETER num_ctx 16384
+PARAMETER num_ctx 4096
+
+# Sampling and output-length parameters for response generation
+PARAMETER num_predict 1024
+PARAMETER temperature 0.1
+PARAMETER top_p 0.5
+PARAMETER top_k 32022
+PARAMETER repeat_penalty 1.0
+PARAMETER stop "<|EOT|>"
+
+# Fixed random number seed to use for generation
+PARAMETER seed 42
+
+# Prompt template: optional system text and "### Instruction:" section, then "### Response:" ending with <|EOT|>
+TEMPLATE """{{ if .System }}<｜begin▁of▁sentence｜>
+{{ .System }}
+{{ end }}{{ if .Prompt }}### Instruction:
+{{ .Prompt }}
+{{ end }}### Response:
+{{ .Response }}
+<|EOT|>"""
--- a/demos/function-calling/Bolt-FC-1B-Q4_K_M.model_file
+++ b/demos/function-calling/Bolt-FC-1B-Q4_K_M.model_file
@ -0,0 +1,24 @@
+FROM Bolt-Function-Calling-1B-Q4_K_M.gguf
+
+# Context window size used when generating the next token
+PARAMETER num_ctx 4096
+
+# Sampling and output-length parameters for response generation
+PARAMETER num_predict 1024
+PARAMETER temperature 0.1
+PARAMETER top_p 0.5
+PARAMETER top_k 32022
+PARAMETER repeat_penalty 1.0
+PARAMETER stop "<|EOT|>"
+
+# Fixed random number seed to use for generation
+PARAMETER seed 42
+
+# Prompt template: optional system text and "### Instruction:" section, then "### Response:" ending with <|EOT|>
+TEMPLATE """{{ if .System }}<｜begin▁of▁sentence｜>
+{{ .System }}
+{{ end }}{{ if .Prompt }}### Instruction:
+{{ .Prompt }}
+{{ end }}### Response:
+{{ .Response }}
+<|EOT|>"""
--- a/demos/function-calling/README.md
+++ b/demos/function-calling/README.md
@ -0,0 +1,24 @@
+# Function calling
+This demo shows how you can use the intelligent prompt gateway to do function calling. It assumes Ollama is running natively on the host. If you want to run Ollama inside Docker instead, update the Ollama endpoint in the docker-compose file.
+
+# Starting the demo
+1. Ensure that submodule is up to date
+   ```sh
+   git submodule sync --recursive
+   ```
+1. Create `.env` file and set OpenAI key using env var `OPENAI_API_KEY`
+1. Start services
+   ```sh
+   docker compose up
+   ```
+1. Download Bolt-FC model. This demo assumes we have downloaded `Bolt-Function-Calling-1B:Q4_K_M` to local folder
+2. Create model file in ollama repository
+   ```sh
+   ollama create Bolt-Function-Calling-1B:Q4_K_M -f Bolt-FC-1B-Q4_K_M.model_file
+   ```
+3. Navigate to http://localhost:18080/
+4. You can type in queries like "how is the weather in Seattle"
+   1. You can also ask follow up questions like "show me sunny days"
+5. To see metrics navigate to "http://localhost:3000/" (use admin/grafana for login)
+   1. Open up the dashboard named "Intelligent Gateway Overview"
+   2. On this dashboard you can see request latency and number of requests
--- a/demos/function-calling/docker-compose.yaml
+++ b/demos/function-calling/docker-compose.yaml
@ -0,0 +1,115 @@
+
+services:
+
+  config-generator:  # envoy waits for this service to complete successfully
+    volumes:
+      - '../../envoyfilter/envoy.template.yaml:/usr/src/app/envoy.template.yaml'
+      - './katanemo-config.yaml:/usr/src/app/katanemo-config.yaml'
+      - './generated:/usr/src/app/out'  # envoy mounts ./generated/envoy.yaml from this host dir
+    build:
+      dockerfile: 'config_generator/Dockerfile'
+      context: '../../'
+
+  envoy:
+    hostname: envoy
+    build:
+      dockerfile: 'envoyfilter/Dockerfile'
+      context: '../../'
+    environment:
+      LOG_LEVEL: 'debug'
+    depends_on:
+      embeddingserver:
+        condition: service_healthy  # start only after the embedding server healthcheck passes
+      config-generator:
+        condition: service_completed_successfully  # presumably produces ./generated/envoy.yaml
+    volumes:
+      - './generated/envoy.yaml:/etc/envoy/envoy.yaml'
+      - '/etc/ssl/cert.pem:/etc/ssl/cert.pem'  # host certificate file exposed at the same path
+    ports:
+      - '10000:10000'  # chatbot-ui sends chat completions to envoy:10000
+      - '19901:9901'  # prometheus scrapes envoy:9901 on the compose network
+
+  embeddingserver:  # envoy waits for this service to report healthy
+    build:
+      context: ../../embedding-server
+      dockerfile: Dockerfile
+    ports:
+      - "18081:80"
+    healthcheck:
+      test: ["CMD", "curl", "--fail", "http://localhost:80/healthz"]  # --fail: HTTP >= 400 fails the check
+      interval: 5s
+      retries: 20
+    volumes:
+      - ~/.cache/huggingface:/root/.cache/huggingface  # share the host Hugging Face cache directory
+
+  functionresolver:
+    build:
+      context: ../../function_resolver
+      dockerfile: Dockerfile
+    ports:
+      - "18082:80"
+    healthcheck:
+      test: ["CMD", "curl", "--fail", "http://localhost:80/healthz"]  # --fail: HTTP >= 400 fails the check
+      interval: 5s
+      retries: 20
+    volumes:
+      - ~/.cache/huggingface:/root/.cache/huggingface  # share the host Hugging Face cache directory
+    environment:
+      # use ollama endpoint that is hosted by host machine (no virtualization)
+      - OLLAMA_ENDPOINT=host.docker.internal
+      # uncomment following line to use ollama endpoint that is hosted by docker
+      # - OLLAMA_ENDPOINT=ollama
+
+  ollama:
+    container_name: ollama
+    image: 'ollama/ollama'
+    profiles:
+      - manual  # only started when the "manual" profile is enabled
+    restart: unless-stopped
+    ports:
+      - "11434:11434"
+    volumes:
+      - './ollama:/root/.ollama'
+
+  qdrant:
+    hostname: vector-db  # reachable as "vector-db" on the compose network
+    image: 'qdrant/qdrant'
+    ports:
+      - '16333:6333'
+      - '16334:6334'
+
+  chatbot-ui:
+    environment:
+      CHAT_COMPLETION_ENDPOINT: 'http://envoy:10000/v1/chat/completions'  # routed through envoy
+      OPENAI_API_KEY: '${OPENAI_API_KEY}'  # interpolated by compose from the shell or .env
+    ports:
+      - '18080:8080'
+    build:
+      dockerfile: 'Dockerfile'
+      context: '../../chatbot-ui'
+
+  prometheus:
+    container_name: prometheus
+    image: 'prom/prometheus'
+    restart: unless-stopped
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yaml"
+    volumes:
+      - './prometheus:/etc/prometheus'  # directory mounted where --config.file looks for prometheus.yaml
+      - './prom_data:/prometheus'
+    ports:
+      - '9090:9090'
+
+  grafana:
+    image: grafana/grafana
+    container_name: grafana
+    ports:
+      - "3000:3000"
+    restart: unless-stopped
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_SECURITY_ADMIN_PASSWORD=grafana  # demo-only credentials; change before exposing this port
+    volumes:
+      - ./grafana/datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml  # datasource file only
+      - ./grafana/dashboard.yaml:/etc/grafana/provisioning/dashboards/main.yaml
+      - ./grafana/dashboards:/var/lib/grafana/dashboards  # path referenced by the dashboard provider
--- a/demos/function-calling/grafana/dashboard.yaml
+++ b/demos/function-calling/grafana/dashboard.yaml
@ -0,0 +1,12 @@
+apiVersion: 1
+
+providers:
+  - name: 'Dashboard provider'
+    type: file
+    orgId: 1
+    allowUiUpdates: false
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    options:
+      foldersFromFilesStructure: true
+      path: '/var/lib/grafana/dashboards'  # docker-compose mounts ./grafana/dashboards here
--- a/demos/function-calling/grafana/dashboards/envoy_overview.json
+++ b/demos/function-calling/grafana/dashboards/envoy_overview.json
@ -0,0 +1,355 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "links": [],
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
+      "id": 2,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "avg(rate(envoy_cluster_internal_upstream_rq_time_sum[1m]) / rate(envoy_cluster_internal_upstream_rq_time_count[1m])) by (envoy_cluster_name)",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "request latency - internal (ms)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
+      "id": 1,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "avg(rate(envoy_cluster_external_upstream_rq_time_sum[1m]) / rate(envoy_cluster_external_upstream_rq_time_count[1m])) by (envoy_cluster_name)",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "request latency - external (ms)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "id": 3,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "avg(rate(envoy_cluster_internal_upstream_rq_completed[1m])) by (envoy_cluster_name)",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "avg(rate(envoy_cluster_external_upstream_rq_completed[1m])) by (envoy_cluster_name)",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        }
+      ],
+      "title": "Upstream request count",
+      "type": "timeseries"
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": [],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-15m",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Intelligent Gateway Overview",
+  "uid": "adt6uhx5lk8aob",
+  "version": 3,
+  "weekStart": ""
+}
--- a/demos/function-calling/grafana/datasource.yaml
+++ b/demos/function-calling/grafana/datasource.yaml
@ -0,0 +1,9 @@
+apiVersion: 1
+
+datasources:
+  - name: 'Prometheus'
+    type: 'prometheus'
+    access: 'proxy'
+    url: 'http://prometheus:9090'  # the prometheus service on the compose network
+    isDefault: true
+    editable: true
--- a/demos/function-calling/katanemo-config.yaml
+++ b/demos/function-calling/katanemo-config.yaml
@ -0,0 +1,41 @@
+default_prompt_endpoint: "127.0.0.1"
+load_balancing: "round_robin"
+timeout_ms: 5000
+
+
+# NOTE: flagged by the author as a section that should not live in this file
+embedding_provider:
+  name: "bge-large-en-v1.5"
+  model: "BAAI/bge-large-en-v1.5"
+
+llm_providers:
+
+  - name: open-ai-gpt-4
+    api_key: $OPEN_AI_API_KEY  # NOTE(review): README and docker-compose use OPENAI_API_KEY; confirm which is read
+    model: gpt-4
+    default: true
+
+prompt_targets:
+
+  - type: function_resolver
+    name: weather_forecast
+    description: This function resolver provides weather forecast information for a given city.
+    few_shot_examples:
+      - what is the weather in New York?
+      - how is the weather in San Francisco?
+      - what is the forecast in Chicago?
+    parameters:
+      - name: city
+        required: true
+        description: The city for which the weather forecast is requested.
+      - name: days
+        description: The number of days for which the weather forecast is requested.
+      - name: units
+        description: The units in which the weather forecast is requested.
+    endpoint:
+      cluster: weatherhost
+      path: /weather
+    system_prompt: |
+      You are a helpful weather forecaster. Use the weather data that is provided to you. Please follow these guidelines when responding to user queries:
+      - Use Fahrenheit for temperature
+      - Use miles per hour for wind speed
--- a/demos/function-calling/prometheus/prometheus.yaml
+++ b/demos/function-calling/prometheus/prometheus.yaml
@ -0,0 +1,23 @@
+global:
+  scrape_interval: 15s
+  scrape_timeout: 10s
+  evaluation_interval: 15s
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets: []  # no Alertmanager targets configured
+    scheme: http
+    timeout: 10s
+    api_version: v2  # Prometheus 3.x rejects v1; the compose image tag is unpinned
+scrape_configs:
+- job_name: envoy
+  honor_timestamps: true
+  scrape_interval: 15s
+  scrape_timeout: 10s
+  metrics_path: /stats
+  scheme: http
+  static_configs:
+  - targets:
+    - envoy:9901  # envoy container port 9901 (published on the host as 19901)
+  params:
+    format: ['prometheus']  # sent as ?format=prometheus on /stats