Merge branch 'main' into adil/add_acm_demo

This commit is contained in:
Adil Hafeez 2025-04-15 15:12:46 -07:00
commit 6edad0870b
No known key found for this signature in database
GPG key ID: 9B18EF7691369645
125 changed files with 6680 additions and 2314 deletions

View file

@ -1,4 +1,4 @@
name: Publish Docker image
name: Publish docker image (latest)
env:
DOCKER_IMAGE: katanemo/archgw

View file

@ -0,0 +1,87 @@
# Publishes a versioned, multi-arch archgw image whenever a GitHub release is
# published: one job per architecture pushes `<version>-<arch>`, then a final
# job stitches both into a single `<version>` manifest.
name: Publish docker image (release)

# Shared by every job below. Without this definition `env.DOCKER_IMAGE`
# expands to an empty string and no image name is passed to any step.
env:
  DOCKER_IMAGE: katanemo/archgw

on:
  release:
    types: [published]

jobs:
  # Build ARM64 image on native ARM64 runner
  build-arm64:
    runs-on: [linux-arm64]
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_IMAGE }}

      - name: Build and Push ARM64 Image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./arch/Dockerfile
          platforms: linux/arm64
          push: true
          # `outputs.tags` may contain several newline-separated tags, so a
          # suffix appended to it only lands on the last one. Build a single
          # explicit tag from `outputs.version` instead.
          tags: ${{ env.DOCKER_IMAGE }}:${{ steps.meta.outputs.version }}-arm64

  # Build AMD64 image on GitHub's AMD64 runner
  build-amd64:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_IMAGE }}

      - name: Build and Push AMD64 Image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./arch/Dockerfile
          platforms: linux/amd64
          push: true
          # Same single-tag scheme as the ARM64 job so the manifest job can
          # locate both images.
          tags: ${{ env.DOCKER_IMAGE }}:${{ steps.meta.outputs.version }}-amd64

  # Combine ARM64 and AMD64 images into a multi-arch manifest
  create-manifest:
    runs-on: ubuntu-latest
    needs: [build-arm64, build-amd64] # Wait for both builds
    steps:
      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_IMAGE }}

      - name: Create Multi-Arch Manifest
        env:
          # Passed through the environment rather than interpolated into the
          # script text, so the shell sees it as data.
          VERSION: ${{ steps.meta.outputs.version }}
        run: |
          # Combine the architecture-specific images into a single manifest.
          # The source references must match the tags pushed by the build jobs.
          docker buildx imagetools create \
            -t "${DOCKER_IMAGE}:${VERSION}" \
            "${DOCKER_IMAGE}:${VERSION}-arm64" \
            "${DOCKER_IMAGE}:${VERSION}-amd64"

View file

@ -30,6 +30,7 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
run: |
docker compose up | tee &> archgw.logs &
@ -55,5 +56,6 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
run: |
docker compose down

View file

@ -32,6 +32,11 @@ jobs:
run: |
python -m venv venv
- name: install hurl
run: |
curl --location --remote-name https://github.com/Orange-OpenSource/hurl/releases/download/4.0.0/hurl_4.0.0_amd64.deb
sudo dpkg -i hurl_4.0.0_amd64.deb
- name: install model server, arch gateway and test dependencies
run: |
source venv/bin/activate
@ -43,6 +48,7 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
run: |
source venv/bin/activate
cd demos/shared/test_runner && sh run_demo_tests.sh

View file

@ -29,6 +29,7 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
run: |
python -mvenv venv
source venv/bin/activate && cd tests/e2e && bash run_e2e_tests.sh

View file

@ -0,0 +1,31 @@
# Builds the arch gateway image and runs the config validation script on every
# push to main and on every pull request.
name: arch config tests

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  validate_arch_config:
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: .
    steps:
      # Action versions aligned with the checkout@v4 used by the other
      # workflows in this repository.
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: "3.12"

      - name: build arch docker image
        run: |
          docker build -f arch/Dockerfile . -t katanemo/archgw

      - name: validate arch config
        run: |
          bash arch/validate_arch_config.sh

View file

@ -1,4 +1,5 @@
# Contribution
# Contribution
We would love feedback on our [Roadmap](https://github.com/orgs/katanemo/projects/1) and we welcome contributions to **Arch**!
Whether you're fixing bugs, adding new features, improving documentation, or creating tutorials, your help is much appreciated.
@ -22,7 +23,9 @@ $ cd arch
```
### 3. Create a branch
Use a descriptive name for your branch (e.g., fix-bug-123, add-feature-x).
```bash
$ git checkout -b <your-branch-name>
```
@ -32,6 +35,7 @@ $ git checkout -b <your-branch-name>
Make your changes in the relevant files. If you're adding new features or fixing bugs, please include tests where applicable.
### 5. Test your changes
```bash
cd arch
cargo test
@ -51,4 +55,4 @@ Contribution Guidelines
Follow the existing coding style.
Update documentation as needed.
To get in touch with us, please join our [discord server](https://discord.gg/rbjqVbpa). We will be monitoring that actively and offering support there.
To get in touch with us, please join our [discord server](https://discord.gg/pGZf2gcwEc). We will be monitoring that actively and offering support there.

View file

@ -3,7 +3,9 @@
</div>
<div align="center">
_Arch is an intelligent (edge and LLM) proxy designed for agentic applications - to help you protect, observe, and build agentic tasks by simply connecting (existing) APIs._
_The intelligent (edge and LLM) proxy server for agentic applications._<br><br>
Move faster by letting Arch handle the **pesky** heavy lifting in building agents: fast input clarification, agent routing, seamless integration of prompts with tools for common tasks, and unified access and observability of LLMs.
[Quickstart](#Quickstart) •
[Demos](#Demos) •
@ -16,26 +18,32 @@ _Arch is an intelligent (edge and LLM) proxy designed for agentic applications -
[![rust tests (prompt and llm gateway)](https://github.com/katanemo/arch/actions/workflows/rust_tests.yml/badge.svg)](https://github.com/katanemo/arch/actions/workflows/rust_tests.yml)
[![e2e tests](https://github.com/katanemo/arch/actions/workflows/e2e_tests.yml/badge.svg)](https://github.com/katanemo/arch/actions/workflows/e2e_tests.yml)
[![Build and Deploy Documentation](https://github.com/katanemo/arch/actions/workflows/static.yml/badge.svg)](https://github.com/katanemo/arch/actions/workflows/static.yml)
</div>
# Overview
<a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=light&period=daily" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
<a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=dark&period=daily&t=1742359429995" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 188px; height: 41px;" width="188" height="41" /></a>
Past the thrill of an AI demo, have you found yourself hitting these walls? You know, the all too familiar ones:
Arch Gateway was built by the contributors of [Envoy Proxy](https://www.envoyproxy.io/) with the belief that:
- You go from one BIG prompt to specialized prompts, but get stuck building **routing and handoff** code?
- You want to use new LLMs, but struggle to **quickly and safely add LLMs** without writing integration code?
- You're bogged down with prompt engineering just to **clarify user intent and validate inputs** effectively?
- You're wasting cycles choosing and integrating code for **observability** instead of it happening transparently?
>Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests including secure handling, intelligent routing, robust observability, and integration with backend (API) systems for personalization outside core business logic.*
And you think to yourself, can't I move faster by focusing on higher-level objectives in a language/framework agnostic way? Well, you can! **Arch Gateway** was built by the contributors of [Envoy Proxy](https://www.envoyproxy.io/) with the belief that:
Arch is engineered with purpose-built LLMs to handle critical but pesky tasks related to the handling and processing of prompts. This includes detecting and rejecting [jailbreak](https://github.com/verazuo/jailbreak_llms) attempts, intent-based routing for improved task accuracy, mapping user request into "backend" functions, and managing the observability of prompts and LLM API calls in a centralized way.
>Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests including secure handling, intelligent routing, robust observability, and integration with backend (API) systems to improve speed and accuracy for common agentic scenarios all outside core application logic.*
**Core Features**:
- **Intent-based prompt routing & fast ⚡ function-calling via APIs**. Engineered with purpose-built [LLMs](https://huggingface.co/collections/katanemo/arch-function-66f209a693ea8df14317ad68) to handle fast, cost-effective, and accurate prompt-based tasks like function/API calling, and parameter extraction from prompts to build more task-accurate agentic applications.
- **Prompt [Guard](https://huggingface.co/collections/katanemo/arch-guard-6702bdc08b889e4bce8f446d)**: Arch centralizes guardrails to prevent jailbreak attempts and ensure safe user interactions without writing a single line of code.
- **LLM Routing & Traffic Management**: Arch centralizes calls to LLMs used by your applications, offering smart retries, automatic cutover, and resilient upstream connections for continuous availability.
- **Observability**: Arch uses the W3C Trace Context standard to enable complete request tracing across applications, ensuring compatibility with observability tools, and provides metrics to monitor latency, token usage, and error rates, helping optimize AI application performance.
- **Built on [Envoy](https://envoyproxy.io)**: Arch runs alongside application servers as a separate containerized process, and builds on top of Envoy's proven HTTP management and scalability features to handle ingress and egress traffic related to prompts and LLMs.
- `🚦 Routing`. Engineered with purpose-built [LLMs](https://huggingface.co/collections/katanemo/arch-function-66f209a693ea8df14317ad68) for fast (<100ms) agent routing and hand-off scenarios
- `⚡ Tools Use`: For common agentic scenarios let Arch instantly clarify and convert prompts to tools/API calls
- `⛨ Guardrails`: Centrally configure and prevent harmful outcomes and ensure safe user interactions
- `🔗 Access to LLMs`: Centralize access and traffic to LLMs with smart retries for continuous availability
- `🕵 Observability`: W3C compatible request tracing and LLM metrics that instantly plug in with popular tools
- `🧱 Built on Envoy`: Arch runs alongside app servers as a containerized process, and builds on top of [Envoy's](https://envoyproxy.io) proven HTTP management and scalability features to handle ingress and egress traffic related to prompts and LLMs.
**High-Level Sequence Diagram**:
![alt text](docs/source/_static/img/arch_network_diagram_high_level.png)
@ -73,7 +81,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.
```console
$ python -m venv venv
$ source venv/bin/activate # On Windows, use: venv\Scripts\activate
$ pip install archgw==0.2.1
$ pip install archgw==0.2.6
```
### Build AI Agent with Arch Gateway
@ -143,7 +151,7 @@ $ archgw up arch_config.yaml
2024-12-05 16:56:27,979 - cli.main - INFO - Starting archgw cli version: 0.1.5
...
2024-12-05 16:56:28,485 - cli.utils - INFO - Schema validation successful!
2024-12-05 16:56:28,485 - cli.main - INFO - Starging arch model server and arch gateway
2024-12-05 16:56:28,485 - cli.main - INFO - Starting arch model server and arch gateway
...
2024-12-05 16:56:51,647 - cli.core - INFO - Container is healthy!
@ -241,7 +249,7 @@ client = OpenAI(
response = client.chat.completions.create(
# we select model from arch_config file
model="--",
model="None",
messages=[{"role": "user", "content": "What is the capital of France?"}],
)
@ -301,6 +309,33 @@ Arch is designed to support best-in class observability by supporting open stand
![alt text](docs/source/_static/img/tracing.png)
## Debugging
When debugging issues or errors, application logs and access logs provide key information that gives you more context on what's going on with the system. Arch gateway runs at the info log level, and the following is typical output you could see in an interaction between a developer and Arch gateway:
```
$ archgw up --service archgw --foreground
...
[2025-03-26 18:32:01.350][26][info] prompt_gateway: on_http_request_body: sending request to model server
[2025-03-26 18:32:01.851][26][info] prompt_gateway: on_http_call_response: model server response received
[2025-03-26 18:32:01.852][26][info] prompt_gateway: on_http_call_response: dispatching api call to developer endpoint: weather_forecast_service, path: /weather, method: POST
[2025-03-26 18:32:01.882][26][info] prompt_gateway: on_http_call_response: developer api call response received: status code: 200
[2025-03-26 18:32:01.882][26][info] prompt_gateway: on_http_call_response: sending request to upstream llm
[2025-03-26 18:32:01.883][26][info] llm_gateway: on_http_request_body: provider: gpt-4o-mini, model requested: None, model selected: gpt-4o-mini
[2025-03-26 18:32:02.818][26][info] llm_gateway: on_http_response_body: time to first token: 1468ms
[2025-03-26 18:32:04.532][26][info] llm_gateway: on_http_response_body: request latency: 3183ms
...
```
The log level can be changed to debug to get more details. To enable debug logs, edit the [Dockerfile](arch/Dockerfile) and change the log level from `--component-log-level wasm:info` to `--component-log-level wasm:debug`. After that, rebuild the docker image and restart the arch gateway using the following set of commands:
```
# make sure you are at the root of the repo
$ archgw build
# go to your service that has arch_config.yaml file and issue following command,
$ archgw up --service archgw --foreground
```
## Contribution
We would love feedback on our [Roadmap](https://github.com/orgs/katanemo/projects/1) and we welcome contributions to **Arch**!
Whether you're fixing bugs, adding new features, improving documentation, or creating tutorials, your help is much appreciated.

View file

@ -28,4 +28,5 @@ COPY arch/arch_config_schema.yaml .
RUN pip install requests
RUN touch /var/log/envoy.log
ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"]
# ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --log-level trace 2>&1 | tee /var/log/envoy.log"]
ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info 2>&1 | tee /var/log/envoy.log"]

View file

@ -5,8 +5,9 @@ properties:
type: string
listeners:
type: object
additionalProperties: false
properties:
prompt_gateway:
ingress_traffic:
type: object
properties:
address:
@ -20,7 +21,7 @@ properties:
timeout:
type: string
additionalProperties: false
llm_gateway:
egress_traffic:
type: object
properties:
address:
@ -31,7 +32,6 @@ properties:
type: string
enum:
- openai
- huggingface
timeout:
type: string
additionalProperties: false
@ -62,7 +62,7 @@ properties:
properties:
name:
type: string
# this field is deprecated, use provider_interface instead
# provider field is deprecated, use provider_interface instead
provider:
type: string
enum:
@ -78,8 +78,11 @@ properties:
type: string
default:
type: boolean
# endpoint field is deprecated, use base_url instead
endpoint:
type: string
base_url:
type: string
protocol:
type: string
enum:
@ -90,7 +93,6 @@ properties:
additionalProperties: false
required:
- name
- model
overrides:
type: object
properties:
@ -98,6 +100,8 @@ properties:
type: number
optimize_context_window:
type: boolean
use_agent_orchestrator:
type: boolean
system_prompt:
type: string
prompt_targets:
@ -124,7 +128,10 @@ properties:
required:
type: boolean
default:
type: string
anyOf:
- type: string
- type: integer
- type: boolean
description:
type: string
type:
@ -132,7 +139,10 @@ properties:
enum:
type: array
items:
type: string
anyOf:
- type: string
- type: integer
- type: boolean
in_path:
type: boolean
format:
@ -241,5 +251,4 @@ properties:
additionalProperties: false
required:
- version
- listeners
- llm_providers

View file

@ -29,7 +29,7 @@ stats_config:
- 180000
static_resources:
listeners:
- name: arch_listener_http
- name: ingress_traffic
address:
socket_address:
address: {{ prompt_gateway_listener.address }}
@ -55,7 +55,7 @@ static_resources:
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_listener_http
stat_prefix: ingress_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
@ -82,7 +82,7 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_prompt_gateway_listener
- name: ingress_traffic_prompt
address:
socket_address:
address: 0.0.0.0
@ -104,11 +104,11 @@ static_resources:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: prompt_processor
service_name: ingress_traffic
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_prompt_gateway_listener
stat_prefix: ingress_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
@ -142,6 +142,19 @@ static_resources:
cluster: {{ llm_cluster_name }}
timeout: 60s
{% endfor %}
{% if agent_orchestrator %}
- match:
prefix: "/"
headers:
- name: "x-arch-llm-provider"
string_match:
exact: {{ agent_orchestrator }}
route:
auto_host_rewrite: true
cluster: {{ agent_orchestrator }}
timeout: 60s
{% endif %}
http_filters:
- name: envoy.filters.http.compressor
typed_config:
@ -201,7 +214,7 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_internal
- name: egress_api_traffic
address:
socket_address:
address: 0.0.0.0
@ -223,11 +236,11 @@ static_resources:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: prompt_processor
service_name: egress_api_traffic
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_internal
stat_prefix: egress_api_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
@ -273,12 +286,12 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_listener_http_llm
- name: egress_traffic
address:
socket_address:
address: {{ llm_gateway_listener.address }}
port_value: {{ llm_gateway_listener.port }}
traffic_direction: INBOUND
traffic_direction: OUTBOUND
filter_chains:
- filters:
- name: envoy.filters.network.http_connection_manager
@ -299,7 +312,7 @@ static_resources:
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_listener_http
stat_prefix: egress_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
@ -326,7 +339,7 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_listener_llm
- name: egress_traffic_llm
address:
socket_address:
address: 0.0.0.0
@ -347,11 +360,11 @@ static_resources:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: llm_gateway
service_name: egress_traffic_llm
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_listener_http
stat_prefix: egress_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https

View file

@ -19,7 +19,7 @@ source venv/bin/activate
### Step 3: Run the build script
```bash
pip install archgw==0.2.1
pip install archgw==0.2.6
```
## Uninstall Instructions: archgw CLI

View file

@ -3,6 +3,7 @@ import os
from jinja2 import Environment, FileSystemLoader
import yaml
from jsonschema import validate
from urllib.parse import urlparse
ENVOY_CONFIG_TEMPLATE_FILE = os.getenv(
"ENVOY_CONFIG_TEMPLATE_FILE", "envoy.template.yaml"
@ -47,7 +48,7 @@ def validate_and_render_schema():
arch_config_schema = file.read()
config_yaml = yaml.safe_load(arch_config)
config_schema_yaml = yaml.safe_load(arch_config_schema)
_ = yaml.safe_load(arch_config_schema)
inferred_clusters = {}
endpoints = config_yaml.get("endpoints", {})
@ -91,6 +92,9 @@ def validate_and_render_schema():
del llm_provider["provider"]
updated_llm_providers.append(llm_provider)
if llm_provider.get("endpoint") and llm_provider.get("base_url"):
raise Exception("Please provide either endpoint or base_url, not both")
if llm_provider.get("endpoint", None):
endpoint = llm_provider["endpoint"]
protocol = llm_provider.get("protocol", "http")
@ -98,13 +102,39 @@ def validate_and_render_schema():
endpoint, protocol
)
llms_with_endpoint.append(llm_provider)
elif llm_provider.get("base_url", None):
base_url = llm_provider["base_url"]
urlparse_result = urlparse(base_url)
if llm_provider.get("port"):
raise Exception("Please provider port in base_url")
if urlparse_result.scheme == "" or urlparse_result.scheme not in [
"http",
"https",
]:
raise Exception(
"Please provide a valid URL with scheme (http/https) in base_url"
)
protocol = urlparse_result.scheme
port = urlparse_result.port
if port is None:
if protocol == "http":
port = 80
else:
port = 443
endpoint = urlparse_result.hostname
llm_provider["endpoint"] = endpoint
llm_provider["port"] = port
llm_provider["protocol"] = protocol
llms_with_endpoint.append(llm_provider)
config_yaml["llm_providers"] = updated_llm_providers
arch_config_string = yaml.dump(config_yaml)
arch_llm_config_string = yaml.dump(config_yaml)
prompt_gateway_listener = config_yaml.get("listeners", {}).get("prompt_gateway", {})
prompt_gateway_listener = config_yaml.get("listeners", {}).get(
"ingress_traffic", {}
)
if prompt_gateway_listener.get("port") == None:
prompt_gateway_listener["port"] = 10000 # default port for prompt gateway
if prompt_gateway_listener.get("address") == None:
@ -112,7 +142,7 @@ def validate_and_render_schema():
if prompt_gateway_listener.get("timeout") == None:
prompt_gateway_listener["timeout"] = "10s"
llm_gateway_listener = config_yaml.get("listeners", {}).get("llm_gateway", {})
llm_gateway_listener = config_yaml.get("listeners", {}).get("egress_traffic", {})
if llm_gateway_listener.get("port") == None:
llm_gateway_listener["port"] = 12000 # default port for llm gateway
if llm_gateway_listener.get("address") == None:
@ -120,6 +150,26 @@ def validate_and_render_schema():
if llm_gateway_listener.get("timeout") == None:
llm_gateway_listener["timeout"] = "10s"
use_agent_orchestrator = config_yaml.get("overrides", {}).get(
"use_agent_orchestrator", False
)
agent_orchestrator = None
if use_agent_orchestrator:
print("Using agent orchestrator")
if len(endpoints) == 0:
raise Exception(
"Please provide agent orchestrator in the endpoints section in your arch_config.yaml file"
)
elif len(endpoints) > 1:
raise Exception(
"Please provide single agent orchestrator in the endpoints section in your arch_config.yaml file"
)
else:
agent_orchestrator = list(endpoints.keys())[0]
print("agent_orchestrator: ", agent_orchestrator)
data = {
"prompt_gateway_listener": prompt_gateway_listener,
"llm_gateway_listener": llm_gateway_listener,
@ -129,6 +179,7 @@ def validate_and_render_schema():
"arch_llm_providers": config_yaml["llm_providers"],
"arch_tracing": arch_tracing,
"local_llms": llms_with_endpoint,
"agent_orchestrator": agent_orchestrator,
}
rendered = template.render(data)

View file

@ -2,110 +2,49 @@ import subprocess
import os
import time
import sys
import glob
import docker
from docker.errors import DockerException
from cli.utils import getLogger, update_docker_host_env
import yaml
from cli.utils import getLogger
from cli.consts import (
ARCHGW_DOCKER_IMAGE,
ARCHGW_DOCKER_NAME,
KATANEMO_LOCAL_MODEL_LIST,
MODEL_SERVER_LOG_FILE,
ACCESS_LOG_FILES,
)
from huggingface_hub import snapshot_download
from dotenv import dotenv_values
import yaml
import subprocess
from cli.docker_cli import (
docker_container_status,
docker_remove_container,
docker_start_archgw_detached,
docker_stop_container,
health_check_endpoint,
stream_gateway_logs,
)
log = getLogger(__name__)
def start_archgw_docker(
client, arch_config_file, env, prompt_gateway_port, llm_gateway_port
):
logs_path = "~/archgw_logs"
logs_path_abs = os.path.expanduser(logs_path)
def _get_gateway_ports(arch_config_file: str) -> tuple:
PROMPT_GATEWAY_DEFAULT_PORT = 10000
LLM_GATEWAY_DEFAULT_PORT = 12000
return client.containers.run(
name=ARCHGW_DOCKER_NAME,
image=ARCHGW_DOCKER_IMAGE,
detach=True, # Run in detached mode
ports={
f"{prompt_gateway_port}/tcp": prompt_gateway_port,
"10001/tcp": 10001,
"11000/tcp": 11000,
f"{llm_gateway_port}/tcp": llm_gateway_port,
"9901/tcp": 19901,
},
volumes={
f"{arch_config_file}": {
"bind": "/app/arch_config.yaml",
"mode": "ro",
},
"/etc/ssl/cert.pem": {"bind": "/etc/ssl/cert.pem", "mode": "ro"},
logs_path_abs: {"bind": "/var/log"},
},
environment={
"OTEL_TRACING_HTTP_ENDPOINT": "http://host.docker.internal:4318/v1/traces",
"MODEL_SERVER_PORT": os.getenv("MODEL_SERVER_PORT", "51000"),
**env,
},
extra_hosts={"host.docker.internal": "host-gateway"},
healthcheck={
"test": [
"CMD",
"curl",
"-f",
f"http://localhost:{prompt_gateway_port}/healthz",
],
"interval": 5000000000, # 5 seconds
"timeout": 1000000000, # 1 seconds
"retries": 3,
},
# parse arch_config_file yaml file and get prompt_gateway_port
arch_config_dict = {}
with open(arch_config_file) as f:
arch_config_dict = yaml.safe_load(f)
prompt_gateway_port = (
arch_config_dict.get("listeners", {})
.get("ingress_traffic", {})
.get("port", PROMPT_GATEWAY_DEFAULT_PORT)
)
llm_gateway_port = (
arch_config_dict.get("listeners", {})
.get("egress_traffic", {})
.get("port", LLM_GATEWAY_DEFAULT_PORT)
)
def stream_gateway_logs(follow):
"""
Stream logs from the arch gateway service.
"""
log.info("Logs from arch gateway service.")
options = ["docker", "logs", "archgw"]
if follow:
options.append("-f")
try:
# Run `docker-compose logs` to stream logs from the gateway service
subprocess.run(
options,
check=True,
stdout=sys.stdout,
stderr=sys.stderr,
)
except subprocess.CalledProcessError as e:
log.info(f"Failed to stream logs: {str(e)}")
def stream_access_logs(follow):
"""
Get the archgw access logs
"""
log_file_pattern_expanded = os.path.expanduser(ACCESS_LOG_FILES)
log_files = glob.glob(log_file_pattern_expanded)
stream_command = ["tail"]
if follow:
stream_command.append("-f")
stream_command.extend(log_files)
subprocess.run(
stream_command,
check=True,
stdout=sys.stdout,
stderr=sys.stderr,
)
return prompt_gateway_port, llm_gateway_port
def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
@ -119,73 +58,58 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
log.info("Starting arch gateway")
try:
try:
client = docker.from_env()
except DockerException as e:
# try setting up the docker host environment variable and retry
update_docker_host_env()
client = docker.from_env()
archgw_container_status = docker_container_status(ARCHGW_DOCKER_NAME)
if archgw_container_status != "not found":
log.info("archgw found in docker, stopping and removing it")
docker_stop_container(ARCHGW_DOCKER_NAME)
docker_remove_container(ARCHGW_DOCKER_NAME)
try:
container = client.containers.get("archgw")
log.info("archgw container found in docker, stopping and removing it")
# ensure that previous docker container is stopped and removed
container.stop()
container.remove()
log.info("Stopped and removed archgw container")
except docker.errors.NotFound as e:
pass
prompt_gateway_port, llm_gateway_port = _get_gateway_ports(arch_config_file)
# parse arch_config_file yaml file and get prompt_gateway_port
arch_config_dict = {}
with open(arch_config_file) as f:
arch_config_dict = yaml.safe_load(f)
prompt_gateway_port = (
arch_config_dict.get("listeners", {})
.get("prompt_gateway", {})
.get("port", 10000)
)
llm_gateway_port = (
arch_config_dict.get("listeners", {})
.get("llm_gateway", {})
.get("port", 12000)
)
container = start_archgw_docker(
client, arch_config_file, env, prompt_gateway_port, llm_gateway_port
return_code, _, archgw_stderr = docker_start_archgw_detached(
arch_config_file,
os.path.expanduser("~/archgw_logs"),
env,
prompt_gateway_port,
llm_gateway_port,
)
if return_code != 0:
log.info("Failed to start arch gateway: " + str(return_code))
log.info("stderr: " + archgw_stderr)
sys.exit(1)
start_time = time.time()
while True:
container = client.containers.get(container.id)
prompt_gateway_health_check_status = health_check_endpoint(
f"http://localhost:{prompt_gateway_port}/healthz"
)
llm_gateway_health_check_status = health_check_endpoint(
f"http://localhost:{llm_gateway_port}/healthz"
)
archgw_status = docker_container_status(ARCHGW_DOCKER_NAME)
current_time = time.time()
elapsed_time = current_time - start_time
# Check if timeout is reached
if elapsed_time > log_timeout:
log.info(f"Stopping log monitoring after {log_timeout} seconds.")
log.info(f"stopping log monitoring after {log_timeout} seconds.")
break
container_status = container.attrs["State"]["Health"]["Status"]
if container_status == "healthy":
log.info("Container is healthy!")
if prompt_gateway_health_check_status or llm_gateway_health_check_status:
log.info("archgw is running and is healthy!")
break
else:
log.info(f"Container health status: {container_status}")
log.info(f"archgw status: {archgw_status}, health status: starting")
time.sleep(1)
if foreground:
for line in container.logs(stream=True):
print(line.decode("utf-8").strip("\n"))
stream_gateway_logs(follow=True)
except KeyboardInterrupt:
log.info("Keyboard interrupt received, stopping arch gateway service.")
stop_arch()
except docker.errors.APIError as e:
log.info(f"Failed to start Arch: {str(e)}")
def stop_arch():
@ -199,10 +123,10 @@ def stop_arch():
try:
subprocess.run(
["docker", "stop", "archgw"],
["docker", "stop", ARCHGW_DOCKER_NAME],
)
subprocess.run(
["docker", "remove", "archgw"],
["docker", "rm", ARCHGW_DOCKER_NAME],
)
log.info("Successfully shut down arch gateway service.")

View file

@ -0,0 +1,133 @@
import subprocess
import json
import sys
import requests
from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME
from cli.utils import getLogger
log = getLogger(__name__)
def docker_container_status(container: str) -> str:
    """Report the Docker state of ``container``.

    Runs ``docker inspect --type=container`` and returns the ``State.Status``
    value of the first inspected object, an empty string when that field is
    absent, or the literal ``"not found"`` when the inspect command exits
    with a non-zero code.
    """
    inspect = subprocess.run(
        ["docker", "inspect", "--type=container", container],
        capture_output=True,
        text=True,
        check=False,
    )
    if inspect.returncode == 0:
        # `docker inspect` prints a JSON array; only the first entry is read.
        details = json.loads(inspect.stdout)[0]
        state = details.get("State", {})
        return state.get("Status", "")
    return "not found"
def docker_stop_container(container: str) -> int:
    """Stop ``container`` with ``docker stop``.

    The command's output is captured and discarded; no exception is raised
    when docker fails.

    Returns:
        The exit code of the ``docker stop`` command (the previous ``-> str``
        annotation did not match the integer actually returned).
    """
    result = subprocess.run(
        ["docker", "stop", container], capture_output=True, text=True, check=False
    )
    return result.returncode
def docker_remove_container(container: str) -> int:
    """Remove ``container`` with ``docker rm``.

    The command's output is captured and discarded; no exception is raised
    when docker fails.

    Returns:
        The exit code of the ``docker rm`` command (the previous ``-> str``
        annotation did not match the integer actually returned).
    """
    result = subprocess.run(
        ["docker", "rm", container], capture_output=True, text=True, check=False
    )
    return result.returncode
def docker_start_archgw_detached(
    arch_config_file: str,
    logs_path_abs: str,
    env: dict,
    prompt_gateway_port,
    llm_gateway_port,
) -> tuple[int, str, str]:
    """Start the arch gateway container in detached mode via ``docker run -d``.

    Args:
        arch_config_file: Host path mounted read-only at
            ``/app/arch_config.yaml`` inside the container.
        logs_path_abs: Host path mounted read-write at ``/var/log``.
        env: Environment variables passed to the container, one ``-e KEY=VALUE``
            pair per item.
        prompt_gateway_port: Port published on the host and mapped to the same
            port in the container.
        llm_gateway_port: Port published on the host and mapped to the same
            port in the container.

    Returns:
        ``(returncode, stdout, stderr)`` of the ``docker run`` process. A
        non-zero return code is not raised as an exception (``check=False``).
    """
    env_args = [item for key, value in env.items() for item in ["-e", f"{key}={value}"]]

    port_mappings = [
        f"{prompt_gateway_port}:{prompt_gateway_port}",
        f"{llm_gateway_port}:{llm_gateway_port}",
        # Host port 9901 is mapped to container port 19901.
        "9901:19901",
    ]
    port_mappings_args = [item for port in port_mappings for item in ("-p", port)]

    volume_mappings = [
        f"{logs_path_abs}:/var/log:rw",
        f"{arch_config_file}:/app/arch_config.yaml:ro",
    ]
    volume_mappings_args = [
        item for volume in volume_mappings for item in ("-v", volume)
    ]

    options = [
        "docker",
        "run",
        "-d",
        "--name",
        ARCHGW_DOCKER_NAME,
        *port_mappings_args,
        *volume_mappings_args,
        *env_args,
        # Lets the container reach services on the host through
        # host.docker.internal.
        "--add-host",
        "host.docker.internal:host-gateway",
        ARCHGW_DOCKER_IMAGE,
    ]

    result = subprocess.run(options, capture_output=True, text=True, check=False)
    return result.returncode, result.stdout, result.stderr
def health_check_endpoint(endpoint: str, timeout: float = 5.0) -> bool:
    """Return True when a GET request to ``endpoint`` answers with HTTP 200.

    Args:
        endpoint: URL to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an endpoint
            that accepts the connection but never responds, which would stall
            any caller that polls this function.

    Returns:
        True on a 200 response; False on any other status code or on any
        ``requests.RequestException`` (connection refused, timeout, ...).
    """
    try:
        response = requests.get(endpoint, timeout=timeout)
    except requests.RequestException:
        # An unreachable or slow endpoint simply counts as "not healthy".
        return False
    return response.status_code == 200
def stream_gateway_logs(follow):
    """
    Print the arch gateway container's logs to this process's stdout/stderr.

    When ``follow`` is truthy, ``-f`` is passed to ``docker logs``. A non-zero
    exit from ``docker logs`` is logged instead of being raised.
    """
    log.info("Logs from arch gateway service.")

    follow_flag = ["-f"] if follow else []
    command = ["docker", "logs", *follow_flag, ARCHGW_DOCKER_NAME]

    try:
        # Run `docker logs`, wiring its output straight to our own streams.
        subprocess.run(command, check=True, stdout=sys.stdout, stderr=sys.stderr)
    except subprocess.CalledProcessError as err:
        log.info(f"Failed to stream logs: {str(err)}")
def docker_validate_archgw_schema(arch_config_file):
    """Run ``config_generator.py`` from the gateway image against a config file.

    The config file is mounted read-only at ``/app/arch_config.yaml`` in a
    throwaway (``--rm``) container whose entrypoint is overridden to ``python``.

    Returns:
        ``(returncode, stdout, stderr)`` of the ``docker run`` process; a
        non-zero return code is returned rather than raised.
    """
    config_mount = f"{arch_config_file}:/app/arch_config.yaml:ro"
    command = ["docker", "run", "--rm", "-v", config_mount]
    command += ["--entrypoint", "python", ARCHGW_DOCKER_IMAGE, "config_generator.py"]

    completed = subprocess.run(command, capture_output=True, text=True, check=False)
    return completed.returncode, completed.stdout, completed.stderr

View file

@ -5,11 +5,12 @@ import subprocess
import multiprocessing
import importlib.metadata
from cli import targets
from cli.docker_cli import docker_validate_archgw_schema, stream_gateway_logs
from cli.utils import (
getLogger,
get_llm_provider_access_keys,
load_env_file_to_dict,
validate_schema,
stream_access_logs,
)
from cli.core import (
start_arch_modelserver,
@ -17,12 +18,9 @@ from cli.core import (
start_arch,
stop_arch,
download_models_from_hf,
stream_access_logs,
stream_gateway_logs,
)
from cli.consts import (
KATANEMO_DOCKERHUB_REPO,
KATANEMO_LOCAL_MODEL_LIST,
SERVICE_NAME_ARCHGW,
SERVICE_NAME_MODEL_SERVER,
SERVICE_ALL,
@ -174,17 +172,24 @@ def up(file, path, service, foreground):
log.info(f"Validating {arch_config_file}")
try:
validate_schema(arch_config_file)
except Exception as e:
log.info(f"Exiting archgw up: validation failed")
log.info(f"Error: {str(e)}")
(
validation_return_code,
validation_stdout,
validation_stderr,
) = docker_validate_archgw_schema(arch_config_file)
if validation_return_code != 0:
log.info(f"Error: Validation failed. Exiting")
log.info(f"Validation stdout: {validation_stdout}")
log.info(f"Validation stderr: {validation_stderr}")
sys.exit(1)
log.info("Starting arch model server and arch gateway")
# Set the ARCH_CONFIG_FILE environment variable
env_stage = {}
env_stage = {
"OTEL_TRACING_HTTP_ENDPOINT": "http://host.docker.internal:4318/v1/traces",
"MODEL_SERVER_PORT": os.getenv("MODEL_SERVER_PORT", "51000"),
}
env = os.environ.copy()
# check if access_keys are present in the config file
access_keys = get_llm_provider_access_keys(arch_config_file=arch_config_file)

View file

@ -2,7 +2,6 @@ import ast
import sys
import yaml
from typing import Any
from pydantic import BaseModel
FLASK_ROUTE_DECORATORS = ["route", "get", "post", "put", "delete", "patch"]
FASTAPI_ROUTE_DECORATORS = ["get", "post", "put", "delete", "patch"]

View file

@ -1,10 +1,11 @@
import glob
import os
import subprocess
import sys
import yaml
import logging
import docker
from docker.errors import DockerException
from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME
from cli.consts import ACCESS_LOG_FILES
logging.basicConfig(
level=logging.INFO,
@ -21,63 +22,6 @@ def getLogger(name="cli"):
log = getLogger(__name__)
def update_docker_host_env():
    """
    Update DOCKER_HOST environment variable to use the local Docker socket

    Does nothing when DOCKER_HOST is already set, or when the default socket
    (DEFAULT_DOCKER_SOCKET, falling back to /var/run/docker.sock) exists.
    Otherwise DOCKER_HOST is pointed at ``~/.docker/run/docker.sock``.
    """
    if os.getenv("DOCKER_HOST"):
        return

    default_docker_socket = os.getenv("DEFAULT_DOCKER_SOCKET", "/var/run/docker.sock")
    if os.path.exists(default_docker_socket):
        return

    # expanduser("~") returns $HOME when it is set and otherwise falls back to
    # the platform's user database; os.getenv("HOME") would yield None and
    # produce the bogus socket URL "unix://None/.docker/run/docker.sock".
    home_dir = os.path.expanduser("~")
    docker_host = f"unix://{home_dir}/.docker/run/docker.sock"
    log.info(
        f"Default docker socket {default_docker_socket} not found, using {docker_host}"
    )
    os.environ["DOCKER_HOST"] = docker_host
def validate_schema(arch_config_file: str) -> None:
    """Validate ``arch_config_file`` by running config_generator.py in a container.

    The file is mounted read-only at ``/app/arch_config.yaml`` in a container
    started from ARCHGW_DOCKER_IMAGE. The container is removed once its result
    has been collected, so repeated validations do not pile up exited
    containers.

    Raises:
        ValueError: when the container exits with a non-zero status (the
            container logs are included in the message), or when the Docker
            API reports an error.
    """
    try:
        try:
            client = docker.from_env()
        except DockerException:
            # try setting up the docker host environment variable and retry
            update_docker_host_env()
            client = docker.from_env()

        container = client.containers.run(
            image=ARCHGW_DOCKER_IMAGE,
            volumes={
                f"{arch_config_file}": {
                    "bind": "/app/arch_config.yaml",
                    "mode": "ro",
                },
            },
            entrypoint=["python", "config_generator.py"],
            detach=True,
        )

        try:
            # Wait for the container to finish and get the exit code
            exit_code = container.wait()

            # Check exit code for validation success
            if exit_code["StatusCode"] != 0:
                # Validation failed (non-zero exit code)
                logs = container.logs().decode()  # Get container logs for debugging
                raise ValueError(
                    f"Validation failed. Container exited with code {exit_code}.\nLogs:\n{logs}"
                )
        finally:
            # A detached container is not cleaned up automatically; remove it
            # on every path. Cleanup is best-effort so that a removal failure
            # never hides the validation result.
            try:
                container.remove(force=True)
            except docker.errors.APIError as cleanup_error:
                log.info(f"Failed to remove validation container: {cleanup_error}")

        # Successful validation (exit code 0)
        log.info("Schema validation successful!")

    except docker.errors.APIError as e:
        # Handle container creation error
        raise ValueError(f"Failed to create container: {e}") from e
def get_llm_provider_access_keys(arch_config_file):
with open(arch_config_file, "r") as file:
arch_config = file.read()
@ -127,3 +71,23 @@ def load_env_file_to_dict(file_path):
env_dict[key] = value
return env_dict
def stream_access_logs(follow):
    """
    Get the archgw access logs

    Expands ACCESS_LOG_FILES (``~`` and glob wildcards) and prints the matching
    files with ``tail``; ``-f`` is added when ``follow`` is truthy. If no file
    matches, a message is logged and nothing is run.

    Raises:
        subprocess.CalledProcessError: when ``tail`` exits non-zero.
    """
    log_file_pattern_expanded = os.path.expanduser(ACCESS_LOG_FILES)
    log_files = glob.glob(log_file_pattern_expanded)

    if not log_files:
        # With no file operands `tail` reads standard input, which would make
        # the CLI appear to hang instead of reporting that there are no logs.
        log.info(f"No access log files found matching {log_file_pattern_expanded}")
        return

    stream_command = ["tail"]
    if follow:
        stream_command.append("-f")
    stream_command.extend(log_files)

    subprocess.run(
        stream_command,
        check=True,
        stdout=sys.stdout,
        stderr=sys.stderr,
    )

2224
arch/tools/poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "archgw"
version = "0.2.1"
version = "0.2.6"
description = "Python-based CLI tool to manage Arch Gateway."
authors = ["Katanemo Labs, Inc."]
packages = [
@ -10,13 +10,11 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.10"
archgw_modelserver = "^0.2.1"
archgw_modelserver = "^0.2.6"
click = "^8.1.7"
jinja2 = "^3.1.4"
jsonschema = "^4.23.0"
setuptools = "75.5.0"
docker = "^7.1.0"
python-dotenv = "^1.0.1"
pyyaml = "^6.0.2"
[tool.poetry.scripts]

View file

@ -0,0 +1,20 @@
#!/bin/bash
# Validate every arch_config.yaml and arch_config_full_reference.yaml found
# under the current directory by running config_generator.py from the
# katanemo/archgw:latest image against it.
# Exit status: 1 if at least one file fails validation, 0 otherwise.

failed_files=()

# Read NUL-delimited paths from find so that file names containing spaces or
# glob characters are neither word-split nor expanded (which
# `for file in $(find ...)` would do). The -name tests are grouped so that
# -print0 applies to both of them.
while IFS= read -r -d '' file; do
  echo "Validating $file..."
  # Redirection order is kept as in the original: the container's stderr is
  # sent to this script's stdout, and the container's stdout is discarded.
  if ! docker run --rm -v "$(pwd)/$file:/app/arch_config.yaml:ro" --entrypoint /bin/sh katanemo/archgw:latest -c "python config_generator.py" 2>&1 > /dev/null ; then
    echo "Validation failed for $file"
    failed_files+=("$file")
  fi
done < <(find . \( -name arch_config.yaml -o -name arch_config_full_reference.yaml \) -print0)

# Print summary of failed files
if [ ${#failed_files[@]} -ne 0 ]; then
  echo -e "\nValidation failed for the following files:"
  printf '%s\n' "${failed_files[@]}"
  exit 1
else
  echo -e "\nAll files validated successfully!"
fi

View file

@ -31,6 +31,10 @@
{
"name": "chatbot_ui",
"path": "demos/shared/chatbot_ui"
},
{
"name": "java_demo",
"path": "demos/samples_java/weather_forcecast_service"
}
],
"settings": {

View file

@ -135,7 +135,10 @@ impl From<String> for ParameterType {
"array" => ParameterType::List,
"dict" => ParameterType::Dict,
"dictionary" => ParameterType::Dict,
_ => ParameterType::String,
_ => {
log::warn!("Unknown parameter type: {}, assuming type str", s);
ParameterType::String
}
}
}
}
@ -186,7 +189,7 @@ pub struct ToolCall {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunctionCallDetail {
pub name: String,
pub arguments: HashMap<String, Value>,
pub arguments: Option<HashMap<String, Value>>,
}
#[derive(Debug, Deserialize, Serialize)]
@ -202,13 +205,6 @@ pub struct ToolCallState {
pub enum ArchState {
ToolCall(Vec<ToolCallState>),
}
#[derive(Deserialize, Serialize)]
#[serde(untagged)]
pub enum ModelServerResponse {
ChatCompletionsResponse(ChatCompletionsResponse),
ModelServerErrorResponse(ModelServerErrorResponse),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelServerErrorResponse {
pub result: String,

View file

@ -25,6 +25,7 @@ pub struct Configuration {
pub struct Overrides {
pub prompt_target_intent_matching_threshold: Option<f64>,
pub optimize_context_window: Option<bool>,
pub use_agent_orchestrator: Option<bool>,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -159,7 +160,7 @@ pub struct LlmProvider {
pub name: String,
pub provider_interface: LlmProviderType,
pub access_key: Option<String>,
pub model: String,
pub model: Option<String>,
pub default: Option<bool>,
pub stream: Option<bool>,
pub endpoint: Option<String>,
@ -326,16 +327,6 @@ mod test {
Some("/agent/summary".to_string())
);
let error_target = config.error_target.as_ref().unwrap();
assert_eq!(
error_target.endpoint.as_ref().unwrap().name,
"error_target_1".to_string()
);
assert_eq!(
error_target.endpoint.as_ref().unwrap().path,
Some("/error".to_string())
);
let tracing = config.tracing.as_ref().unwrap();
assert_eq!(tracing.sampling_rate.unwrap(), 0.1);

View file

@ -11,10 +11,13 @@ pub const MODEL_SERVER_NAME: &str = "model_server";
pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
pub const MESSAGES_KEY: &str = "messages";
pub const ARCH_PROVIDER_HINT_HEADER: &str = "x-arch-llm-provider-hint";
pub const CHAT_COMPLETIONS_PATH: &str = "/v1/chat/completions";
pub const CHAT_COMPLETIONS_PATH: [&str; 2] = ["/v1/chat/completions", "/openai/v1/chat/completions"];
pub const HEALTHZ_PATH: &str = "/healthz";
pub const ARCH_STATE_HEADER: &str = "x-arch-state";
pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function-1.5B";
pub const X_ARCH_STATE_HEADER: &str = "x-arch-state";
pub const X_ARCH_API_RESPONSE: &str = "x-arch-api-response-message";
pub const X_ARCH_TOOL_CALL: &str = "x-arch-tool-call-message";
pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response";
pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function";
pub const REQUEST_ID_HEADER: &str = "x-request-id";
pub const TRACE_PARENT_HEADER: &str = "traceparent";
pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";

View file

@ -3,7 +3,7 @@ use crate::{
stats::{Gauge, IncrementingMetric},
};
use derivative::Derivative;
use log::trace;
use log::debug;
use proxy_wasm::traits::Context;
use serde::Serialize;
use std::{cell::RefCell, collections::HashMap, fmt::Debug, time::Duration};
@ -48,10 +48,9 @@ pub trait Client: Context {
call_args: CallArgs,
call_context: Self::CallContext,
) -> Result<u32, ClientError> {
trace!(
debug!(
"dispatching http call with args={:?} context={:?}",
call_args,
call_context
call_args, call_context
);
match self.dispatch_http_call(

View file

@ -1,19 +1,25 @@
use log::trace;
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
#[allow(dead_code)]
pub enum Error {
#[error("Unknown model: {model_name}")]
UnknownModel { model_name: String },
}
use log::debug;
#[allow(dead_code)]
pub fn token_count(model_name: &str, text: &str) -> Result<usize, Error> {
trace!("getting token count model={}", model_name);
pub fn token_count(model_name: &str, text: &str) -> Result<usize, String> {
debug!("getting token count model={}", model_name);
//HACK: add support for tokenizing mistral and other models
//filed issue https://github.com/katanemo/arch/issues/222
let updated_model = match model_name.starts_with("gpt") {
false => {
debug!(
"tiktoken_rs: unsupported model: {}, using gpt-4 to compute token count",
model_name
);
"gpt-4"
}
true => model_name,
};
// Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
let bpe = tiktoken_rs::get_bpe_from_model(model_name).map_err(|_| Error::UnknownModel {
model_name: model_name.to_string(),
})?;
let bpe = tiktoken_rs::get_bpe_from_model(updated_model).map_err(|e| e.to_string())?;
Ok(bpe.encode_ordinary(text).len())
}
@ -30,14 +36,4 @@ mod test {
token_count(model_name, text).expect("correct tokenization")
);
}
#[test]
fn unrecognized_model() {
assert_eq!(
Error::UnknownModel {
model_name: "unknown".to_string()
},
token_count("unknown", "").expect_err("unknown model")
)
}
}

View file

@ -166,7 +166,7 @@ impl TraceData {
attributes: vec![Attribute {
key: "service.name".to_string(),
value: AttributeValue {
string_value: Some("upstream-llm".to_string()),
string_value: Some("egress_llm_traffic".to_string()),
},
}],
};

View file

@ -1,6 +1,7 @@
use crate::metrics::Metrics;
use crate::stream_context::StreamContext;
use common::configuration::Configuration;
use common::configuration::Overrides;
use common::consts::OTEL_COLLECTOR_HTTP;
use common::consts::OTEL_POST_PATH;
use common::http::CallArgs;
@ -31,6 +32,7 @@ pub struct FilterContext {
callouts: RefCell<HashMap<u32, CallContext>>,
llm_providers: Option<Rc<LlmProviders>>,
traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
overrides: Rc<Option<Overrides>>,
}
impl FilterContext {
@ -40,6 +42,7 @@ impl FilterContext {
metrics: Rc::new(Metrics::new()),
llm_providers: None,
traces_queue: Arc::new(Mutex::new(VecDeque::new())),
overrides: Rc::new(None),
}
}
}
@ -69,6 +72,7 @@ impl RootContext for FilterContext {
};
ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));
self.overrides = Rc::new(config.overrides);
match config.llm_providers.try_into() {
Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
@ -93,6 +97,7 @@ impl RootContext for FilterContext {
.expect("LLM Providers must exist when Streams are being created"),
),
Arc::clone(&self.traces_queue),
Rc::clone(&self.overrides),
)))
}

View file

@ -3,9 +3,9 @@ use common::api::open_ai::{
ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
Message, StreamOptions,
};
use common::configuration::LlmProvider;
use common::configuration::{LlmProvider, LlmProviderType, Overrides};
use common::consts::{
ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH,
ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, TRACE_PARENT_HEADER,
};
use common::errors::ServerError;
@ -15,7 +15,7 @@ use common::stats::{IncrementingMetric, RecordingMetric};
use common::tracing::{Event, Span, TraceData, Traceparent};
use common::{ratelimit, routing, tokenizer};
use http::StatusCode;
use log::{debug, trace, warn};
use log::{debug, info, warn};
use proxy_wasm::hostcalls::get_current_time;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
@ -42,6 +42,7 @@ pub struct StreamContext {
request_body_sent_time: Option<u128>,
user_message: Option<Message>,
traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
overrides: Rc<Option<Overrides>>,
}
impl StreamContext {
@ -50,10 +51,12 @@ impl StreamContext {
metrics: Rc<Metrics>,
llm_providers: Rc<LlmProviders>,
traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
overrides: Rc<Option<Overrides>>,
) -> Self {
StreamContext {
context_id,
metrics,
overrides,
ratelimit_selector: None,
streaming_response: false,
response_tokens: 0,
@ -86,10 +89,34 @@ impl StreamContext {
provider_hint,
));
// Check if we need to modify the path based on the provider's base_url
let needs_openai_prefix = self
.llm_provider
.as_ref()
.and_then(|provider| provider.endpoint.as_ref())
.map(|url| url.contains("api.groq.com"))
.unwrap_or(false);
if needs_openai_prefix {
if let Some(path) = self.get_http_request_header(":path") {
if path.starts_with("/v1/") {
let new_path = format!("/openai{}", path);
self.set_http_request_header(":path", Some(new_path.as_str()));
}
}
}
debug!(
"request received: llm provider hint: {:?}, selected llm: {}",
self.get_http_request_header(ARCH_PROVIDER_HINT_HEADER),
self.llm_provider.as_ref().unwrap().name
"request received: llm provider hint: {}, selected llm: {}, model: {}",
self.get_http_request_header(ARCH_PROVIDER_HINT_HEADER)
.unwrap_or_default(),
self.llm_provider.as_ref().unwrap().name,
self.llm_provider
.as_ref()
.unwrap()
.model
.as_ref()
.unwrap_or(&String::new())
);
}
@ -130,7 +157,7 @@ impl StreamContext {
}
fn send_server_error(&self, error: ServerError, override_status_code: Option<StatusCode>) {
debug!("server error occurred: {}", error);
warn!("server error occurred: {}", error);
self.send_http_response(
override_status_code
.unwrap_or(StatusCode::INTERNAL_SERVER_ERROR)
@ -149,11 +176,11 @@ impl StreamContext {
// Tokenize and record token count.
let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);
debug!("Recorded input token count: {}", token_count);
// Record the token count to metrics.
self.metrics
.input_sequence_length
.record(token_count as u64);
trace!("Recorded input token count: {}", token_count);
// Check if rate limiting needs to be applied.
if let Some(selector) = self.ratelimit_selector.take() {
@ -164,7 +191,7 @@ impl StreamContext {
NonZero::new(token_count as u32).unwrap(),
)?;
} else {
trace!("No rate limit applied for model: {}", model);
debug!("No rate limit applied for model: {}", model);
}
Ok(())
@ -176,29 +203,59 @@ impl HttpContext for StreamContext {
// Envoy's HTTP model is event driven. The WASM ABI has given implementors events to hook onto
// the lifecycle of the http request and response.
fn on_http_request_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
self.select_llm_provider();
// if endpoint is not set then use provider name as routing header so envoy can resolve the cluster name
if self.llm_provider().endpoint.is_none() {
self.add_http_request_header(
ARCH_ROUTING_HEADER,
&self.llm_provider().provider_interface.to_string(),
);
} else {
self.add_http_request_header(ARCH_ROUTING_HEADER, &self.llm_provider().name);
let request_path = self.get_http_request_header(":path").unwrap_or_default();
if request_path == HEALTHZ_PATH {
self.send_http_response(200, vec![], None);
return Action::Continue;
}
if let Err(error) = self.modify_auth_headers() {
// ensure that the provider has an endpoint if the access key is missing else return a bad request
if self.llm_provider.as_ref().unwrap().endpoint.is_none() {
self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
let routing_header_value = self.get_http_request_header(ARCH_ROUTING_HEADER);
let use_agent_orchestrator = match self.overrides.as_ref() {
Some(overrides) => overrides.use_agent_orchestrator.unwrap_or_default(),
None => false,
};
if let Some(routing_header_value) = routing_header_value.as_ref() {
info!("routing header already set: {}", routing_header_value);
self.llm_provider = Some(Rc::new(LlmProvider {
name: routing_header_value.to_string(),
provider_interface: LlmProviderType::OpenAI,
access_key: None,
endpoint: None,
model: None,
default: None,
stream: None,
port: None,
rate_limits: None,
}));
} else {
self.select_llm_provider();
if self.llm_provider().endpoint.is_some() {
self.add_http_request_header(
ARCH_ROUTING_HEADER,
&self.llm_provider().name.to_string(),
);
} else {
self.add_http_request_header(
ARCH_ROUTING_HEADER,
&self.llm_provider().provider_interface.to_string(),
);
}
if let Err(error) = self.modify_auth_headers() {
// ensure that the provider has an endpoint if the access key is missing else return a bad request
if self.llm_provider.as_ref().unwrap().endpoint.is_none() && !use_agent_orchestrator
{
self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
}
}
}
self.delete_content_length_header();
self.save_ratelimit_header();
self.is_chat_completions_request =
self.get_http_request_header(":path").unwrap_or_default() == CHAT_COMPLETIONS_PATH;
let request_path = self.get_http_request_header(":path").unwrap_or_default();
self.is_chat_completions_request = CHAT_COMPLETIONS_PATH.contains(&request_path.as_str());
self.request_id = self.get_http_request_header(REQUEST_ID_HEADER);
self.traceparent = self.get_http_request_header(TRACE_PARENT_HEADER);
@ -207,6 +264,11 @@ impl HttpContext for StreamContext {
}
fn on_http_request_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
debug!(
"on_http_request_body [S={}] bytes={} end_stream={}",
self.context_id, body_size, end_of_stream
);
// Let the client send the gateway all the data before sending to the LLM_provider.
// TODO: consider a streaming API.
@ -222,34 +284,41 @@ impl HttpContext for StreamContext {
return Action::Continue;
}
let body_bytes = match self.get_http_request_body(0, body_size) {
Some(body_bytes) => body_bytes,
None => {
self.send_server_error(
ServerError::LogicError(format!(
"Failed to obtain body bytes even though body_size is {}",
body_size
)),
None,
);
return Action::Pause;
}
};
// Deserialize body into spec.
// Currently OpenAI API.
let mut deserialized_body: ChatCompletionsRequest =
match self.get_http_request_body(0, body_size) {
Some(body_bytes) => match serde_json::from_slice(&body_bytes) {
Ok(deserialized) => deserialized,
Err(e) => {
self.send_server_error(
ServerError::Deserialization(e),
Some(StatusCode::BAD_REQUEST),
);
return Action::Pause;
}
},
None => {
match serde_json::from_slice(&body_bytes) {
Ok(deserialized) => deserialized,
Err(e) => {
debug!(
"on_http_request_body: request body: {}",
String::from_utf8_lossy(&body_bytes)
);
self.send_server_error(
ServerError::LogicError(format!(
"Failed to obtain body bytes even though body_size is {}",
body_size
)),
None,
ServerError::Deserialization(e),
Some(StatusCode::BAD_REQUEST),
);
return Action::Pause;
}
};
// remove metadata from the request body
deserialized_body.metadata = None;
//TODO: move this to prompt gateway
// deserialized_body.metadata = None;
// delete model key from message array
for message in deserialized_body.messages.iter_mut() {
message.model = None;
@ -262,15 +331,47 @@ impl HttpContext for StreamContext {
.last()
.cloned();
// override model name from the llm provider
deserialized_body
.model
.clone_from(&self.llm_provider.as_ref().unwrap().model);
let model_name = match self.llm_provider.as_ref() {
Some(llm_provider) => llm_provider.model.as_ref(),
None => None,
};
let use_agent_orchestrator = match self.overrides.as_ref() {
Some(overrides) => overrides.use_agent_orchestrator.unwrap_or_default(),
None => false,
};
let model_requested = deserialized_body.model.clone();
if deserialized_body.model.is_empty() || deserialized_body.model.to_lowercase() == "none" {
deserialized_body.model = match model_name {
Some(model_name) => model_name.clone(),
None => {
if use_agent_orchestrator {
"agent_orchestrator".to_string()
} else {
self.send_server_error(
ServerError::BadRequest {
why: format!("No model specified in request and couldn't determine model name from arch_config. Model name in req: {}, arch_config, provider: {}, model: {:?}", deserialized_body.model, self.llm_provider().name, self.llm_provider().model).to_string(),
},
Some(StatusCode::BAD_REQUEST),
);
return Action::Continue;
}
}
}
}
info!(
"on_http_request_body: provider: {}, model requested: {}, model selected: {}",
self.llm_provider().name,
model_requested,
model_name.unwrap_or(&"None".to_string()),
);
let chat_completion_request_str = serde_json::to_string(&deserialized_body).unwrap();
trace!(
"arch => {:?}, body: {}",
deserialized_body.model,
debug!(
"on_http_request_body: request body: {}",
chat_completion_request_str
);
@ -307,10 +408,9 @@ impl HttpContext for StreamContext {
}
fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
trace!(
debug!(
"on_http_response_headers [S={}] end_stream={}",
self.context_id,
_end_of_stream
self.context_id, _end_of_stream
);
self.set_property(
@ -322,15 +422,18 @@ impl HttpContext for StreamContext {
}
fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
trace!(
debug!(
"on_http_response_body [S={}] bytes={} end_stream={}",
self.context_id,
body_size,
end_of_stream
self.context_id, body_size, end_of_stream
);
if self.request_body_sent_time.is_none() {
debug!("on_http_response_body: request body not sent, not doing any processing in llm filter");
return Action::Continue;
}
if !self.is_chat_completions_request {
debug!("non-chatcompletion request");
info!("on_http_response_body: non-chatcompletion request");
return Action::Continue;
}
@ -342,7 +445,7 @@ impl HttpContext for StreamContext {
Ok(duration) => {
// Convert the duration to milliseconds
let duration_ms = duration.as_millis();
debug!("request latency: {}ms", duration_ms);
info!("on_http_response_body: request latency: {}ms", duration_ms);
// Record the latency to the latency histogram
self.metrics.request_latency.record(duration_ms as u64);
@ -353,7 +456,7 @@ impl HttpContext for StreamContext {
// Record the time per output token
self.metrics.time_per_output_token.record(tpot);
trace!(
debug!(
"time per token: {}ms, tokens per second: {}",
tpot,
1000 / tpot
@ -381,7 +484,7 @@ impl HttpContext for StreamContext {
Ok(traceparent) => {
let mut trace_data = common::tracing::TraceData::new();
let mut llm_span = Span::new(
"upstream_llm_time".to_string(),
"egress_traffic".to_string(),
Some(traceparent.trace_id),
Some(traceparent.parent_id),
self.request_body_sent_time.unwrap(),
@ -417,10 +520,9 @@ impl HttpContext for StreamContext {
let body = if self.streaming_response {
let chunk_start = 0;
let chunk_size = body_size;
trace!(
"streaming response reading, {}..{}",
chunk_start,
chunk_size
debug!(
"on_http_response_body: streaming response reading, {}..{}",
chunk_start, chunk_size
);
let streaming_chunk = match self.get_http_response_body(0, chunk_size) {
Some(chunk) => chunk,
@ -442,7 +544,7 @@ impl HttpContext for StreamContext {
}
streaming_chunk
} else {
trace!("non streaming response bytes read: 0:{}", body_size);
debug!("non streaming response bytes read: 0:{}", body_size);
match self.get_http_response_body(0, body_size) {
Some(body) => body,
None => {
@ -455,17 +557,21 @@ impl HttpContext for StreamContext {
let body_utf8 = match String::from_utf8(body) {
Ok(body_utf8) => body_utf8,
Err(e) => {
debug!("could not convert to utf8: {}", e);
warn!("could not convert to utf8: {}", e);
return Action::Continue;
}
};
if self.streaming_response {
if body_utf8 == "data: [DONE]\n" {
return Action::Continue;
}
let chat_completions_chunk_response_events =
match ChatCompletionStreamResponseServerEvents::try_from(body_utf8.as_str()) {
Ok(response) => response,
Err(e) => {
debug!(
warn!(
"invalid streaming response: body str: {}, {:?}",
body_utf8, e
);
@ -474,33 +580,27 @@ impl HttpContext for StreamContext {
};
if chat_completions_chunk_response_events.events.is_empty() {
debug!("empty streaming response");
warn!(
"couldn't parse any streaming events: body str: {}",
body_utf8
);
return Action::Continue;
}
let mut model = chat_completions_chunk_response_events
let model = chat_completions_chunk_response_events
.events
.first()
.unwrap()
.model
.clone();
let tokens_str = chat_completions_chunk_response_events.to_string();
//HACK: add support for tokenizing mistral and other models
//filed issue https://github.com/katanemo/arch/issues/222
if !model.as_ref().unwrap().starts_with("gpt") {
warn!(
"tiktoken_rs: unsupported model: {}, using gpt-4 to compute token count",
model.as_ref().unwrap()
);
}
model = Some("gpt-4".to_string());
let token_count =
match tokenizer::token_count(model.as_ref().unwrap().as_str(), tokens_str.as_str())
{
Ok(token_count) => token_count,
Err(e) => {
debug!("could not get token count: {:?}", e);
warn!("could not get token count: {:?}", e);
return Action::Continue;
}
};
@ -514,7 +614,10 @@ impl HttpContext for StreamContext {
match current_time.duration_since(self.start_time) {
Ok(duration) => {
let duration_ms = duration.as_millis();
debug!("time to first token: {}ms", duration_ms);
info!(
"on_http_response_body: time to first token: {}ms",
duration_ms
);
self.ttft_duration = Some(duration);
self.metrics.time_to_first_token.record(duration_ms as u64);
}
@ -524,12 +627,12 @@ impl HttpContext for StreamContext {
}
}
} else {
trace!("non streaming response");
debug!("non streaming response");
let chat_completions_response: ChatCompletionsResponse =
match serde_json::from_str(body_utf8.as_str()) {
Ok(de) => de,
Err(err) => {
debug!(
info!(
"non chat-completion compliant response received err: {}, body: {}",
err, body_utf8
);
@ -546,11 +649,9 @@ impl HttpContext for StreamContext {
}
}
trace!(
debug!(
"recv [S={}] total_tokens={} end_stream={}",
self.context_id,
self.response_tokens,
end_of_stream
self.context_id, self.response_tokens, end_of_stream
);
Action::Continue

View file

@ -18,12 +18,19 @@ fn wasm_module() -> String {
fn request_headers_expectations(module: &mut Tester, http_context: i32) {
module
.call_proxy_on_request_headers(http_context, 0, false)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
.returning(Some("/v1/chat/completions"))
.expect_get_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("x-arch-llm-provider"),
)
.returning(None)
.expect_get_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("x-arch-llm-provider-hint"),
)
.returning(None)
.expect_log(Some(LogLevel::Debug), Some("request received: llm provider hint: Some(\"default\"), selected llm: open-ai-gpt-4"))
.expect_log(Some(LogLevel::Debug), Some("request received: llm provider hint: default, selected llm: open-ai-gpt-4, model: gpt-4"))
.expect_add_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("x-arch-llm-provider"),
@ -34,6 +41,7 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) {
Some("Authorization"),
Some("Bearer secret_key"),
)
.expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("content-length"))
.expect_get_header_map_value(
Some(MapType::HttpRequestHeaders),
Some("x-arch-llm-provider-hint"),
@ -46,8 +54,6 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) {
.returning(Some("selector-key"))
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("selector-key"))
.returning(Some("selector-value"))
.expect_get_header_map_pairs(Some(MapType::HttpRequestHeaders))
.returning(None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
.returning(Some("/v1/chat/completions"))
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
@ -217,12 +223,14 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() {
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 21)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
@ -264,7 +272,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
{\
\"messages\": [\
{\
\"role\": \"system\",\
\"role\": \"system\"\
},\
{\
\"role\": \"user\",\
@ -282,13 +290,20 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(incomplete_chat_completions_request_body))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"))
.expect_send_local_response(
Some(StatusCode::BAD_REQUEST.as_u16().into()),
None,
None,
None,
)
.execute_and_expect(ReturnType::Action(Action::Pause))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 14)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
@ -337,16 +352,18 @@ fn llm_gateway_request_ratelimited() {
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 107)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Warn), Some("server error occurred: exceeded limit provider=gpt-4, selector=Header { key: \"selector-key\", value: \"selector-value\" }, tokens_used=107"))
.expect_send_local_response(
Some(StatusCode::TOO_MANY_REQUESTS.as_u16().into()),
None,
@ -403,13 +420,201 @@ fn llm_gateway_request_not_ratelimited() {
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
// Request-body test for a client that explicitly asks for a model
// ("o1-mini") in the chat completions payload.
//
// The expectations below require the filter to:
//   * emit an Info log reporting
//     "model requested: o1-mini, model selected: gpt-4",
//   * record the `input_sequence_length` metric with value 29,
//   * rewrite the HTTP request body, and
//   * return `Action::Continue`.
#[test]
#[serial]
fn llm_gateway_override_model_name() {
// `allow_unexpected: false` — presumably makes any host call that is not
// listed in the expectations fail the test; confirm against the tester crate.
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
// Start the wasm module before any context is created.
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
// `normal_flow` is defined elsewhere in this file; presumably it creates the
// HTTP context and drives the request-headers phase — TODO confirm.
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
// The payload carries an explicit "model": "o1-mini" field; that value is what
// the Info log expectation below reports as "model requested".
let chat_completions_request_body = "\
{\
\"model\": \"o1-mini\",\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
]
}";
// Deliver the whole body in a single call (third argument `true`; presumably
// the end-of-stream flag — confirm against the tester API).
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
// The filter reads the request body; the mock hands back the payload above.
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// Log expectations with `None` only pin the log level, not the message text.
.expect_log(Some(LogLevel::Debug), None)
// Core assertion of this test: requested model is "o1-mini", selected model is "gpt-4".
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: o1-mini, model selected: gpt-4"))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
// The metric is expected to be recorded with the value 29 for this payload.
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
// The filter is expected to write a request body back; `None` leaves the
// written bytes unchecked.
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
// Request-body test for a client that sends NO "model" field in the chat
// completions payload.
//
// The expectations below require the filter to:
//   * emit an Info log reporting an empty requested model and "gpt-4" as the
//     selected model ("model requested: , model selected: gpt-4"),
//   * record the `input_sequence_length` metric with value 29,
//   * rewrite the HTTP request body, and
//   * return `Action::Continue`.
#[test]
#[serial]
fn llm_gateway_override_use_default_model() {
// `allow_unexpected: false` — presumably makes any host call that is not
// listed in the expectations fail the test; confirm against the tester crate.
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
// Start the wasm module before any context is created.
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
// `normal_flow` is defined elsewhere in this file; presumably it creates the
// HTTP context and drives the request-headers phase — TODO confirm.
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
// The payload deliberately omits the "model" field; only "messages" is sent.
let chat_completions_request_body = "\
{\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
]
}";
// Deliver the whole body in a single call (third argument `true`; presumably
// the end-of-stream flag — confirm against the tester API).
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
// Log expectations with `None` only pin the log level, not the message text.
.expect_log(Some(LogLevel::Debug), None)
// The filter reads the request body; the mock hands back the payload above.
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// Core assertion of this test: the requested model is empty and "gpt-4" is selected.
.expect_log(
Some(LogLevel::Info),
Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"),
)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
// The metric is expected to be recorded with the value 29 for this payload.
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
// The filter is expected to write a request body back; `None` leaves the
// written bytes unchecked.
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
#[test]
#[serial]
fn llm_gateway_override_use_model_name_none() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let filter_context = setup_filter(&mut module, default_config());
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
let chat_completions_request_body = "\
{\
\"model\": \"none\",\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
]
}";
module
.call_proxy_on_request_body(
http_context,
chat_completions_request_body.len() as i32,
true,
)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: none, model selected: gpt-4"))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)

View file

@ -1,6 +1,8 @@
use crate::metrics::Metrics;
use crate::stream_context::StreamContext;
use common::configuration::{Configuration, Overrides, PromptGuards, PromptTarget, Tracing};
use common::configuration::{
Configuration, Endpoint, Overrides, PromptGuards, PromptTarget, Tracing,
};
use common::http::Client;
use common::stats::Gauge;
use log::trace;
@ -21,6 +23,7 @@ pub struct FilterContext {
overrides: Rc<Option<Overrides>>,
system_prompt: Rc<Option<String>>,
prompt_targets: Rc<HashMap<String, PromptTarget>>,
endpoints: Rc<Option<HashMap<String, Endpoint>>>,
prompt_guards: Rc<PromptGuards>,
tracing: Rc<Option<Tracing>>,
}
@ -34,6 +37,7 @@ impl FilterContext {
prompt_targets: Rc::new(HashMap::new()),
overrides: Rc::new(None),
prompt_guards: Rc::new(PromptGuards::default()),
endpoints: Rc::new(None),
tracing: Rc::new(None),
}
}
@ -73,6 +77,7 @@ impl RootContext for FilterContext {
}
self.system_prompt = Rc::new(config.system_prompt);
self.prompt_targets = Rc::new(prompt_targets);
self.endpoints = Rc::new(config.endpoints);
if let Some(prompt_guards) = config.prompt_guards {
self.prompt_guards = Rc::new(prompt_guards)
@ -94,6 +99,7 @@ impl RootContext for FilterContext {
Rc::clone(&self.metrics),
Rc::clone(&self.system_prompt),
Rc::clone(&self.prompt_targets),
Rc::clone(&self.endpoints),
Rc::clone(&self.overrides),
Rc::clone(&self.tracing),
)))

View file

@ -4,17 +4,18 @@ use common::{
self, ArchState, ChatCompletionStreamResponse, ChatCompletionTool, ChatCompletionsRequest,
},
consts::{
ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_STATE_HEADER,
ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_ROUTING_HEADER,
ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
MODEL_SERVER_NAME, MODEL_SERVER_REQUEST_TIMEOUT_MS, REQUEST_ID_HEADER, TOOL_ROLE,
TRACE_PARENT_HEADER, USER_ROLE,
TRACE_PARENT_HEADER, USER_ROLE, X_ARCH_API_RESPONSE, X_ARCH_FC_MODEL_RESPONSE,
X_ARCH_STATE_HEADER, X_ARCH_TOOL_CALL,
},
errors::ServerError,
http::{CallArgs, Client},
pii::obfuscate_auth_header,
};
use http::StatusCode;
use log::{debug, trace, warn};
use log::{debug, info, warn};
use proxy_wasm::{traits::HttpContext, types::Action};
use serde_json::Value;
use std::{
@ -33,15 +34,37 @@ impl HttpContext for StreamContext {
// manipulate the body in benign ways e.g., compression.
self.set_http_request_header("content-length", None);
if let Some(overrides) = self.overrides.as_ref() {
if overrides.use_agent_orchestrator.unwrap_or_default() {
// get endpoint that has agent_orchestrator set to true
if let Some(endpoints) = self.endpoints.as_ref() {
if endpoints.len() == 1 {
let (name, _) = endpoints.iter().next().unwrap();
info!("Setting ARCH_PROVIDER_HINT_HEADER to {}", name);
self.set_http_request_header(ARCH_ROUTING_HEADER, Some(name));
} else {
warn!("Need single endpoint when use_agent_orchestrator is set");
self.send_server_error(
ServerError::LogicError(
"Need single endpoint when use_agent_orchestrator is set"
.to_string(),
),
None,
);
}
}
}
}
let request_path = self.get_http_request_header(":path").unwrap_or_default();
if request_path == HEALTHZ_PATH {
self.send_http_response(200, vec![], None);
return Action::Continue;
}
self.is_chat_completions_request = request_path == CHAT_COMPLETIONS_PATH;
self.is_chat_completions_request = CHAT_COMPLETIONS_PATH.contains(&request_path.as_str());
trace!(
debug!(
"on_http_request_headers S[{}] req_headers={:?}",
self.context_id,
obfuscate_auth_header(&mut self.get_http_request_headers())
@ -49,6 +72,7 @@ impl HttpContext for StreamContext {
self.request_id = self.get_http_request_header(REQUEST_ID_HEADER);
self.traceparent = self.get_http_request_header(TRACE_PARENT_HEADER);
Action::Continue
}
@ -66,10 +90,9 @@ impl HttpContext for StreamContext {
self.request_body_size = body_size;
trace!(
debug!(
"on_http_request_body S[{}] body_size={}",
self.context_id,
body_size
self.context_id, body_size
);
let body_bytes = match self.get_http_request_body(0, body_size) {
@ -86,7 +109,7 @@ impl HttpContext for StreamContext {
}
};
trace!("request body: {}", String::from_utf8_lossy(&body_bytes));
debug!("request body: {}", String::from_utf8_lossy(&body_bytes));
// Deserialize body into spec.
// Currently OpenAI API.
@ -103,8 +126,8 @@ impl HttpContext for StreamContext {
self.arch_state = match deserialized_body.metadata {
Some(ref metadata) => {
if metadata.contains_key(ARCH_STATE_HEADER) {
let arch_state_str = metadata[ARCH_STATE_HEADER].clone();
if metadata.contains_key(X_ARCH_STATE_HEADER) {
let arch_state_str = metadata[X_ARCH_STATE_HEADER].clone();
let arch_state: Vec<ArchState> = serde_json::from_str(&arch_state_str).unwrap();
Some(arch_state)
} else {
@ -152,11 +175,23 @@ impl HttpContext for StreamContext {
}
}
if let Some(overrides) = self.overrides.as_ref() {
if overrides.use_agent_orchestrator.unwrap_or_default() {
if metadata.is_none() {
metadata = Some(HashMap::new());
}
metadata
.as_mut()
.unwrap()
.insert("use_agent_orchestrator".to_string(), "true".to_string());
}
}
let arch_fc_chat_completion_request = ChatCompletionsRequest {
messages: deserialized_body.messages.clone(),
metadata,
stream: deserialized_body.stream,
model: "--".to_string(),
model: deserialized_body.model.clone(),
stream_options: deserialized_body.stream_options.clone(),
tools: Some(tool_calls),
};
@ -171,8 +206,10 @@ impl HttpContext for StreamContext {
}
};
debug!("sending request to model server");
trace!("request body: {}", json_data);
info!("on_http_request_body: sending request to model server");
debug!("request body: {}", json_data);
let timeout_str = MODEL_SERVER_REQUEST_TIMEOUT_MS.to_string();
let timeout_str = MODEL_SERVER_REQUEST_TIMEOUT_MS.to_string();
@ -213,7 +250,7 @@ impl HttpContext for StreamContext {
};
if let Err(e) = self.http_call(call_args, call_context) {
debug!("http_call failed: {:?}", e);
warn!("http_call failed: {:?}", e);
self.send_server_error(ServerError::HttpDispatch(e), None);
}
@ -221,7 +258,7 @@ impl HttpContext for StreamContext {
}
fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
trace!(
debug!(
"on_http_response_headers recv [S={}] headers={:?}",
self.context_id,
self.get_http_response_headers()
@ -233,15 +270,13 @@ impl HttpContext for StreamContext {
}
fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
trace!(
debug!(
"on_http_response_body: recv [S={}] bytes={} end_stream={}",
self.context_id,
body_size,
end_of_stream
self.context_id, body_size, end_of_stream
);
if !self.is_chat_completions_request {
debug!("non-gpt request");
info!("non-gpt request");
return Action::Continue;
}
@ -280,7 +315,7 @@ impl HttpContext for StreamContext {
streaming_chunk
} else {
debug!("non streaming response bytes read: 0:{}", body_size);
info!("non streaming response bytes read: 0:{}", body_size);
match self.get_http_response_body(0, body_size) {
Some(body) => body,
None => {
@ -293,21 +328,21 @@ impl HttpContext for StreamContext {
let body_utf8 = match String::from_utf8(body) {
Ok(body_utf8) => body_utf8,
Err(e) => {
debug!("could not convert to utf8: {}", e);
info!("could not convert to utf8: {}", e);
return Action::Continue;
}
};
if self.streaming_response {
trace!("streaming response");
debug!("streaming response");
if self.tool_calls.is_some() && !self.tool_calls.as_ref().unwrap().is_empty() {
let chunks = vec![
ChatCompletionStreamResponse::new(
None,
self.arch_fc_response.clone(),
Some(ASSISTANT_ROLE.to_string()),
Some(ARCH_FC_MODEL_NAME.to_string()),
self.tool_calls.to_owned(),
None,
),
ChatCompletionStreamResponse::new(
self.tool_call_response.clone(),
@ -349,25 +384,47 @@ impl HttpContext for StreamContext {
*metadata = Value::Object(serde_json::Map::new());
}
let fc_messages = vec![
self.generate_toll_call_message(),
self.generate_api_response_message(),
];
let tool_call_message = self.generate_tool_call_message();
let tool_call_message_str = serde_json::to_string(&tool_call_message).unwrap();
metadata.as_object_mut().unwrap().insert(
X_ARCH_TOOL_CALL.to_string(),
serde_json::Value::String(tool_call_message_str),
);
let api_response_message = self.generate_api_response_message();
let api_response_message_str =
serde_json::to_string(&api_response_message).unwrap();
metadata.as_object_mut().unwrap().insert(
X_ARCH_API_RESPONSE.to_string(),
serde_json::Value::String(api_response_message_str),
);
let fc_messages = vec![tool_call_message, api_response_message];
let fc_messages_str = serde_json::to_string(&fc_messages).unwrap();
let arch_state = HashMap::from([("messages".to_string(), fc_messages_str)]);
let arch_state_str = serde_json::to_string(&arch_state).unwrap();
metadata.as_object_mut().unwrap().insert(
ARCH_STATE_HEADER.to_string(),
X_ARCH_STATE_HEADER.to_string(),
serde_json::Value::String(arch_state_str),
);
if let Some(arch_fc_response) = self.arch_fc_response.as_ref() {
metadata.as_object_mut().unwrap().insert(
X_ARCH_FC_MODEL_RESPONSE.to_string(),
serde_json::Value::String(
serde_json::to_string(arch_fc_response).unwrap(),
),
);
}
let data_serialized = serde_json::to_string(&data).unwrap();
debug!("archgw <= developer: {}", data_serialized);
info!("archgw <= developer: {}", data_serialized);
self.set_http_response_body(0, body_size, data_serialized.as_bytes());
};
}
}
trace!("recv [S={}] end_stream={}", self.context_id, end_of_stream);
debug!("recv [S={}] end_stream={}", self.context_id, end_of_stream);
Action::Continue
}

View file

@ -2,20 +2,21 @@ use crate::metrics::Metrics;
use crate::tools::compute_request_path_body;
use common::api::open_ai::{
to_server_events, ArchState, ChatCompletionStreamResponse, ChatCompletionsRequest,
ChatCompletionsResponse, Message, ModelServerResponse, ToolCall,
ChatCompletionsResponse, Message, ToolCall,
};
use common::configuration::{Overrides, PromptTarget, Tracing};
use common::configuration::{Endpoint, Overrides, PromptTarget, Tracing};
use common::consts::{
API_REQUEST_TIMEOUT_MS, ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME,
ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, DEFAULT_TARGET_REQUEST_TIMEOUT_MS, MESSAGES_KEY,
REQUEST_ID_HEADER, SYSTEM_ROLE, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
X_ARCH_FC_MODEL_RESPONSE,
};
use common::errors::ServerError;
use common::http::{CallArgs, Client};
use common::stats::Gauge;
use derivative::Derivative;
use http::StatusCode;
use log::{debug, trace, warn};
use log::{debug, info, warn};
use proxy_wasm::traits::*;
use std::cell::RefCell;
use std::collections::HashMap;
@ -46,6 +47,7 @@ pub struct StreamCallContext {
pub struct StreamContext {
system_prompt: Rc<Option<String>>,
pub prompt_targets: Rc<HashMap<String, PromptTarget>>,
pub endpoints: Rc<Option<HashMap<String, Endpoint>>>,
pub overrides: Rc<Option<Overrides>>,
pub metrics: Rc<Metrics>,
pub callouts: RefCell<HashMap<u32, StreamCallContext>>,
@ -63,15 +65,16 @@ pub struct StreamContext {
pub time_to_first_token: Option<u128>,
pub traceparent: Option<String>,
pub _tracing: Rc<Option<Tracing>>,
pub arch_fc_response: Option<String>,
}
impl StreamContext {
#[allow(clippy::too_many_arguments)]
pub fn new(
context_id: u32,
metrics: Rc<Metrics>,
system_prompt: Rc<Option<String>>,
prompt_targets: Rc<HashMap<String, PromptTarget>>,
endpoints: Rc<Option<HashMap<String, Endpoint>>>,
overrides: Rc<Option<Overrides>>,
tracing: Rc<Option<Tracing>>,
) -> Self {
@ -80,6 +83,7 @@ impl StreamContext {
metrics,
system_prompt,
prompt_targets,
endpoints,
callouts: RefCell::new(HashMap::new()),
chat_completions_request: None,
tool_calls: None,
@ -95,6 +99,7 @@ impl StreamContext {
_tracing: tracing,
start_upstream_llm_request_time: 0,
time_to_first_token: None,
arch_fc_response: None,
}
}
@ -125,10 +130,10 @@ impl StreamContext {
mut callout_context: StreamCallContext,
) {
let body_str = String::from_utf8(body).unwrap();
debug!("model server response received");
trace!("response body: {}", body_str);
info!("on_http_call_response: model server response received");
debug!("response body: {}", body_str);
let model_server_response: ModelServerResponse = match serde_json::from_str(&body_str) {
let model_server_response: ChatCompletionsResponse = match serde_json::from_str(&body_str) {
Ok(arch_fc_response) => arch_fc_response,
Err(e) => {
warn!(
@ -139,77 +144,122 @@ impl StreamContext {
}
};
let arch_fc_response = match model_server_response {
ModelServerResponse::ChatCompletionsResponse(response) => response,
ModelServerResponse::ModelServerErrorResponse(response) => {
debug!("archgw <= modelserver error response: {}", response.result);
if response.result == "No intent matched" {
if let Some(default_prompt_target) = self
.prompt_targets
.values()
.find(|pt| pt.default.unwrap_or(false))
{
debug!("default prompt target found, forwarding request to default prompt target");
let endpoint = default_prompt_target.endpoint.clone().unwrap();
let upstream_path: String = endpoint.path.unwrap_or(String::from("/"));
let intent_matched = check_intent_matched(&model_server_response);
info!("intent matched: {}", intent_matched);
let upstream_endpoint = endpoint.name;
let mut params = HashMap::new();
params.insert(
MESSAGES_KEY.to_string(),
callout_context.request_body.messages.clone(),
);
let arch_messages_json = serde_json::to_string(&params).unwrap();
let timeout_str = DEFAULT_TARGET_REQUEST_TIMEOUT_MS.to_string();
self.arch_fc_response = model_server_response
.metadata
.as_ref()
.and_then(|metadata| metadata.get(X_ARCH_FC_MODEL_RESPONSE))
.cloned();
let mut headers = vec![
(":method", "POST"),
(ARCH_UPSTREAM_HOST_HEADER, &upstream_endpoint),
(":path", &upstream_path),
(":authority", &upstream_endpoint),
("content-type", "application/json"),
("x-envoy-max-retries", "3"),
("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()),
];
if !intent_matched {
// check if we have a default prompt target
if let Some(default_prompt_target) = self
.prompt_targets
.values()
.find(|pt| pt.default.unwrap_or(false))
{
info!("default prompt target found, forwarding request to default prompt target");
let endpoint = default_prompt_target.endpoint.clone().unwrap();
let upstream_path: String = endpoint.path.unwrap_or(String::from("/"));
if self.request_id.is_some() {
headers.push((REQUEST_ID_HEADER, self.request_id.as_ref().unwrap()));
}
let upstream_endpoint = endpoint.name;
let mut params = HashMap::new();
params.insert(
MESSAGES_KEY.to_string(),
callout_context.request_body.messages.clone(),
);
let arch_messages_json = serde_json::to_string(&params).unwrap();
let timeout_str = DEFAULT_TARGET_REQUEST_TIMEOUT_MS.to_string();
// if self.trace_arch_internal() && self.traceparent.is_some() {
// headers.push((TRACE_PARENT_HEADER, self.traceparent.as_ref().unwrap()));
// }
let mut headers = vec![
(":method", "POST"),
(ARCH_UPSTREAM_HOST_HEADER, &upstream_endpoint),
(":path", &upstream_path),
(":authority", &upstream_endpoint),
("content-type", "application/json"),
("x-envoy-max-retries", "3"),
("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()),
];
let call_args = CallArgs::new(
ARCH_INTERNAL_CLUSTER_NAME,
&upstream_path,
headers,
Some(arch_messages_json.as_bytes()),
vec![],
Duration::from_secs(5),
);
callout_context.response_handler_type = ResponseHandlerType::DefaultTarget;
callout_context.prompt_target_name =
Some(default_prompt_target.name.clone());
if self.request_id.is_some() {
headers.push((REQUEST_ID_HEADER, self.request_id.as_ref().unwrap()));
}
if let Err(e) = self.http_call(call_args, callout_context) {
warn!("error dispatching default prompt target request: {}", e);
return self.send_server_error(
ServerError::HttpDispatch(e),
Some(StatusCode::BAD_REQUEST),
);
}
return;
let call_args = CallArgs::new(
ARCH_INTERNAL_CLUSTER_NAME,
&upstream_path,
headers,
Some(arch_messages_json.as_bytes()),
vec![],
Duration::from_secs(5),
);
callout_context.response_handler_type = ResponseHandlerType::DefaultTarget;
callout_context.prompt_target_name = Some(default_prompt_target.name.clone());
if let Err(e) = self.http_call(call_args, callout_context) {
warn!("error dispatching default prompt target request: {}", e);
return self.send_server_error(
ServerError::HttpDispatch(e),
Some(StatusCode::BAD_REQUEST),
);
}
return;
} else {
info!("no default prompt target found, forwarding request to upstream llm");
let mut messages = Vec::new();
// add system prompt
match self.system_prompt.as_ref() {
None => {}
Some(system_prompt) => {
let system_prompt_message = Message {
role: SYSTEM_ROLE.to_string(),
content: Some(system_prompt.clone()),
model: None,
tool_calls: None,
tool_call_id: None,
};
messages.push(system_prompt_message);
}
}
return self.send_server_error(
ServerError::LogicError(response.result),
Some(StatusCode::BAD_REQUEST),
);
}
};
arch_fc_response.choices[0]
messages.append(
&mut self
.filter_out_arch_messages(callout_context.request_body.messages.as_ref()),
);
let chat_completion_request = ChatCompletionsRequest {
model: self
.chat_completions_request
.as_ref()
.unwrap()
.model
.clone(),
messages,
tools: None,
stream: callout_context.request_body.stream,
stream_options: callout_context.request_body.stream_options,
metadata: None,
};
let chat_completion_request_json =
serde_json::to_string(&chat_completion_request).unwrap();
info!(
"archgw => upstream llm request: {}",
chat_completion_request_json
);
self.set_http_request_body(
0,
self.request_body_size,
chat_completion_request_json.as_bytes(),
);
self.resume_http_request();
return;
}
}
model_server_response.choices[0]
.message
.tool_calls
.clone_into(&mut self.tool_calls);
@ -231,14 +281,14 @@ impl StreamContext {
let direct_response_str = if self.streaming_response {
let chunks = vec![
ChatCompletionStreamResponse::new(
None,
self.arch_fc_response.clone(),
Some(ASSISTANT_ROLE.to_string()),
Some(ARCH_FC_MODEL_NAME.to_owned()),
Some(ARCH_FC_MODEL_NAME.to_string()),
None,
),
ChatCompletionStreamResponse::new(
Some(
arch_fc_response.choices[0]
model_server_response.choices[0]
.message
.content
.as_ref()
@ -246,7 +296,7 @@ impl StreamContext {
.clone(),
),
None,
Some(ARCH_FC_MODEL_NAME.to_owned()),
Some(format!("{}-Chat", ARCH_FC_MODEL_NAME.to_owned())),
None,
),
];
@ -268,12 +318,59 @@ impl StreamContext {
callout_context.prompt_target_name =
Some(self.tool_calls.as_ref().unwrap()[0].function.name.clone());
if let Some(overrides) = self.overrides.as_ref() {
if overrides.use_agent_orchestrator.unwrap_or_default() {
let mut metadata = HashMap::new();
metadata.insert("use_agent_orchestrator".to_string(), "true".to_string());
metadata.insert(
"agent-name".to_string(),
callout_context
.prompt_target_name
.as_ref()
.unwrap()
.to_string(),
);
if let Some(overrides) = self.overrides.as_ref() {
if overrides.optimize_context_window.unwrap_or_default() {
metadata.insert("optimize_context_window".to_string(), "true".to_string());
}
}
if let Some(overrides) = self.overrides.as_ref() {
if overrides.use_agent_orchestrator.unwrap_or_default() {
metadata.insert("use_agent_orchestrator".to_string(), "true".to_string());
}
}
let messages = self.construct_llm_messages(&callout_context);
let chat_completion_request = ChatCompletionsRequest {
model: callout_context.request_body.model.clone(),
messages,
tools: None,
stream: callout_context.request_body.stream,
stream_options: callout_context.request_body.stream_options.clone(),
metadata: Some(metadata),
};
let body_str = serde_json::to_string(&chat_completion_request).unwrap();
info!("sending request to llm agent: {}", body_str);
self.set_http_request_body(0, self.request_body_size, body_str.as_bytes());
self.resume_http_request();
return;
}
}
self.schedule_api_call_request(callout_context);
}
fn schedule_api_call_request(&mut self, mut callout_context: StreamCallContext) {
// Construct messages early to avoid mutable borrow conflicts
let tools_call_name = self.tool_calls.as_ref().unwrap()[0].function.name.clone();
let prompt_target = self.prompt_targets.get(&tools_call_name).unwrap();
let prompt_target = self.prompt_targets.get(&tools_call_name).unwrap().clone();
let tool_params = &self.tool_calls.as_ref().unwrap()[0].function.arguments;
let endpoint_details = prompt_target.endpoint.as_ref().unwrap();
let endpoint_path: String = endpoint_details
@ -285,7 +382,7 @@ impl StreamContext {
let http_method = endpoint_details.method.clone().unwrap_or_default();
let prompt_target_params = prompt_target.parameters.clone().unwrap_or_default();
let (path, body) = match compute_request_path_body(
let (path, api_call_body) = match compute_request_path_body(
&endpoint_path,
tool_params,
&prompt_target_params,
@ -302,6 +399,8 @@ impl StreamContext {
}
};
debug!("on_http_call_response: api call body {:?}", api_call_body);
let timeout_str = API_REQUEST_TIMEOUT_MS.to_string();
let http_method_str = http_method.to_string();
@ -335,13 +434,13 @@ impl StreamContext {
ARCH_INTERNAL_CLUSTER_NAME,
&path,
headers.into_iter().collect(),
body.as_deref().map(|s| s.as_bytes()),
api_call_body.as_deref().map(|s| s.as_bytes()),
vec![],
Duration::from_secs(5),
);
debug!(
"dispatching api call to developer endpoint: {}, path: {}, method: {}",
info!(
"on_http_call_response: dispatching api call to developer endpoint: {}, path: {}, method: {}",
endpoint_details.name, path, http_method_str
);
@ -358,10 +457,15 @@ impl StreamContext {
let http_status = self
.get_http_call_response_header(":status")
.unwrap_or(StatusCode::OK.as_str().to_string());
debug!(
"developer api call response received: status code: {}",
info!(
"on_http_call_response: developer api call response received: status code: {}",
http_status
);
let prompt_target = self
.prompt_targets
.get(callout_context.prompt_target_name.as_ref().unwrap())
.unwrap()
.clone();
if http_status != StatusCode::OK.as_str() {
warn!(
"api server responded with non 2xx status code: {}",
@ -378,7 +482,7 @@ impl StreamContext {
);
}
self.tool_call_response = Some(String::from_utf8(body).unwrap());
trace!(
debug!(
"response body: {}",
self.tool_call_response.as_ref().unwrap()
);
@ -397,6 +501,37 @@ impl StreamContext {
}
};
if !prompt_target.auto_llm_dispatch_on_response.unwrap_or(true) {
let tool_call_response = self.tool_call_response.as_ref().unwrap().clone();
let direct_response_str = if self.streaming_response {
let chunks = vec![
ChatCompletionStreamResponse::new(
None,
Some(ASSISTANT_ROLE.to_string()),
Some(ARCH_FC_MODEL_NAME.to_owned()),
None,
),
ChatCompletionStreamResponse::new(
Some(tool_call_response.clone()),
None,
Some(ARCH_FC_MODEL_NAME.to_owned()),
None,
),
];
to_server_events(chunks)
} else {
tool_call_response
};
return self.send_http_response(
StatusCode::OK.as_u16().into(),
vec![],
Some(direct_response_str.as_bytes()),
);
}
let final_prompt = format!(
"{}\ncontext: {}",
user_message.content.unwrap(),
@ -429,8 +564,8 @@ impl StreamContext {
return self.send_server_error(ServerError::Serialization(e), None);
}
};
debug!("sending request to upstream llm");
trace!("request body: {}", llm_request_str);
info!("on_http_call_response: sending request to upstream llm");
debug!("request body: {}", llm_request_str);
self.start_upstream_llm_request_time = SystemTime::now()
.duration_since(UNIX_EPOCH)
@ -491,13 +626,24 @@ impl StreamContext {
messages
}
pub fn generate_toll_call_message(&mut self) -> Message {
Message {
role: ASSISTANT_ROLE.to_string(),
content: None,
model: Some(ARCH_FC_MODEL_NAME.to_string()),
tool_calls: self.tool_calls.clone(),
tool_call_id: None,
/// Builds the assistant-role message that represents the function-calling
/// model's turn.
///
/// * When `arch_fc_response` is set, it becomes the message `content` and no
///   `tool_calls` are attached.
/// * When it is not set, an info line is logged and the message carries a
///   clone of `self.tool_calls` with no `content`.
///
/// In both cases the role is `ASSISTANT_ROLE`, the model is
/// `ARCH_FC_MODEL_NAME`, and `tool_call_id` is `None`.
pub fn generate_tool_call_message(&mut self) -> Message {
    // Only `content` and `tool_calls` differ between the two cases, so pick
    // them first and assemble the message once.
    let (content, tool_calls) = match self.arch_fc_response.as_ref() {
        Some(fc_response) => (Some(fc_response.clone()), None),
        None => {
            info!("arch_fc_response is none, generating tool call message");
            (None, self.tool_calls.clone())
        }
    };

    Message {
        role: ASSISTANT_ROLE.to_string(),
        content,
        model: Some(ARCH_FC_MODEL_NAME.to_string()),
        tool_calls,
        tool_call_id: None,
    }
}
@ -519,10 +665,7 @@ impl StreamContext {
.clone();
// check if the default target should be dispatched to the LLM provider
if !prompt_target
.auto_llm_dispatch_on_response
.unwrap_or_default()
{
if !prompt_target.auto_llm_dispatch_on_response.unwrap_or(true) {
let default_target_response_str = if self.streaming_response {
let chat_completion_response =
match serde_json::from_slice::<ChatCompletionsResponse>(&body) {
@ -626,12 +769,29 @@ impl StreamContext {
};
let json_resp = serde_json::to_string(&chat_completion_request).unwrap();
debug!("archgw => (default target) llm request: {}", json_resp);
info!("archgw => (default target) llm request: {}", json_resp);
self.set_http_request_body(0, self.request_body_size, json_resp.as_bytes());
self.resume_http_request();
}
}
/// Reports whether the model server response indicates that an intent was matched.
///
/// Only the first choice is inspected. The intent counts as matched when that
/// choice's message has non-empty `content` or a non-empty `tool_calls` list.
/// A response without any choices, or whose first message has neither, is
/// reported as not matched.
fn check_intent_matched(model_server_response: &ChatCompletionsResponse) -> bool {
    let first_message = model_server_response
        .choices
        .first()
        .map(|choice| &choice.message);

    // `Some("")` is treated the same as `None`: empty content is not a value.
    let content_has_value = first_message
        .and_then(|message| message.content.as_ref())
        .map_or(false, |content| !content.is_empty());

    // `Some(vec![])` is treated the same as `None`: an empty list is not a call.
    let has_tool_calls = first_message
        .and_then(|message| message.tool_calls.as_ref())
        .map_or(false, |tool_calls| !tool_calls.is_empty());

    // intent was matched if content has some value or tool_calls is not empty
    content_has_value || has_tool_calls
}
impl Client for StreamContext {
type CallContext = StreamCallContext;
@ -643,3 +803,77 @@ impl Client for StreamContext {
&self.metrics.active_http_calls
}
}
#[cfg(test)]
mod test {
    use common::api::open_ai::{ChatCompletionsResponse, Choice, Message, ToolCall};

    use crate::stream_context::check_intent_matched;

    /// Builds an "arch-fc" response with a single assistant choice carrying the
    /// given content and tool calls; every other optional field is left unset.
    fn single_choice_response(content: &str, tool_calls: Vec<ToolCall>) -> ChatCompletionsResponse {
        ChatCompletionsResponse {
            choices: vec![Choice {
                message: Message {
                    content: Some(content.to_string()),
                    tool_calls: Some(tool_calls),
                    role: "assistant".to_string(),
                    model: None,
                    tool_call_id: None,
                },
                finish_reason: None,
                index: None,
            }],
            usage: None,
            model: "arch-fc".to_string(),
            metadata: None,
        }
    }

    #[test]
    fn test_intent_matched() {
        // Empty content and an empty tool call list: nothing matched.
        let nothing_matched = single_choice_response("", vec![]);
        assert!(!check_intent_matched(&nothing_matched));

        // Non-empty content alone counts as a match.
        let content_only = single_choice_response("hello", vec![]);
        assert!(check_intent_matched(&content_only));

        // A tool call alone counts as a match, even with empty content.
        let tool_call_only = single_choice_response(
            "",
            vec![ToolCall {
                id: "1".to_string(),
                function: common::api::open_ai::FunctionCallDetail {
                    name: "test".to_string(),
                    arguments: None,
                },
                tool_type: common::api::open_ai::ToolType::Function,
            }],
        );
        assert!(check_intent_matched(&tool_call_only));
    }
}

View file

@ -4,8 +4,13 @@ use std::collections::HashMap;
use serde_yaml::Value;
// only add params that are of string, number and bool type
pub fn filter_tool_params(tool_params: &HashMap<String, Value>) -> HashMap<String, String> {
pub fn filter_tool_params(tool_params: &Option<HashMap<String, Value>>) -> HashMap<String, String> {
if tool_params.is_none() {
return HashMap::new();
}
tool_params
.as_ref()
.unwrap()
.iter()
.filter(|(_, value)| value.is_number() || value.is_string() || value.is_bool())
.map(|(key, value)| match value {
@ -22,7 +27,7 @@ pub fn filter_tool_params(tool_params: &HashMap<String, Value>) -> HashMap<Strin
pub fn compute_request_path_body(
endpoint_path: &str,
tool_params: &HashMap<String, Value>,
tool_params: &Option<HashMap<String, Value>>,
prompt_target_params: &[Parameter],
http_method: &HttpMethod,
) -> Result<(String, Option<String>), String> {

View file

@ -24,12 +24,12 @@ fn wasm_module() -> String {
fn request_headers_expectations(module: &mut Tester, http_context: i32) {
module
.call_proxy_on_request_headers(http_context, 0, false)
.expect_log(Some(LogLevel::Debug), None)
.expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("content-length"))
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
.returning(Some("/v1/chat/completions"))
.expect_get_header_map_pairs(Some(MapType::HttpRequestHeaders))
.returning(None)
.expect_log(Some(LogLevel::Trace), None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
.returning(None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("traceparent"))
@ -69,10 +69,14 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_http_call(
Some("arch_internal"),
Some(vec![
@ -81,16 +85,13 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
(":path", "/function_calling"),
("content-type", "application/json"),
(":authority", "model_server"),
("x-envoy-upstream-rq-timeout-ms", "30000"),
]),
None,
None,
None,
Some(5000),
)
.returning(Some(1))
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_metric_increment("active_http_calls", 1)
.execute_and_expect(ReturnType::Action(Action::Pause))
.unwrap();
@ -232,13 +233,13 @@ fn prompt_gateway_successful_request_to_open_ai_chat_completions() {
chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(chat_completions_request_body))
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_http_call(Some("arch_internal"), None, None, None, None)
.returning(Some(4))
.expect_metric_increment("active_http_calls", 1)
@ -295,16 +296,16 @@ fn prompt_gateway_bad_request_to_open_ai_chat_completions() {
incomplete_chat_completions_request_body.len() as i32,
true,
)
.expect_log(Some(LogLevel::Debug), None)
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(incomplete_chat_completions_request_body))
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_send_local_response(
Some(StatusCode::BAD_REQUEST.as_u16().into()),
None,
None,
None,
)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::Action(Action::Pause))
.unwrap();
}
@ -351,10 +352,10 @@ fn prompt_gateway_request_to_llm_gateway() {
tool_type: ToolType::Function,
function: FunctionCallDetail {
name: String::from("weather_forecast"),
arguments: HashMap::from([(
arguments: Some(HashMap::from([(
String::from("city"),
Value::String(String::from("seattle")),
)]),
)])),
},
}]),
model: None,
@ -362,7 +363,11 @@ fn prompt_gateway_request_to_llm_gateway() {
},
}],
model: String::from("test"),
metadata: None,
metadata: {
let mut map: HashMap<String, String> = HashMap::new();
map.insert("function_latency".to_string(), "0.0".to_string());
Some(map)
},
};
let expected_body = "{\"city\":\"seattle\"}";
@ -373,27 +378,30 @@ fn prompt_gateway_request_to_llm_gateway() {
.expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
.returning(Some(&arch_fc_resp_str))
.expect_log(Some(LogLevel::Warn), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_http_call(
Some("arch_internal"),
Some(vec![
(":method", "POST"),
("content-type", "application/json"),
("x-arch-upstream", "api_server"),
(":authority", "api_server"),
("x-envoy-max-retries", "3"),
("x-arch-upstream", "api_server"),
("content-type", "application/json"),
("x-envoy-upstream-rq-timeout-ms", "30000"),
(":path", "/weather"),
(":method", "POST"),
(":authority", "api_server"),
]),
Some(expected_body),
None,
None,
Some(5000),
)
.returning(Some(2))
.expect_metric_increment("active_http_calls", 1)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::None)
.unwrap();
@ -403,14 +411,14 @@ fn prompt_gateway_request_to_llm_gateway() {
.expect_metric_increment("active_http_calls", -1)
.expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
.returning(Some(&body_text))
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Trace), None)
.expect_get_header_map_value(Some(MapType::HttpCallResponseHeaders), Some(":status"))
.returning(Some("200"))
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.expect_log(Some(LogLevel::Debug), None)
.execute_and_expect(ReturnType::None)
.unwrap();
@ -442,11 +450,241 @@ fn prompt_gateway_request_to_llm_gateway() {
)
.expect_get_buffer_bytes(Some(BufferType::HttpResponseBody))
.returning(Some(chat_completion_response_str.as_str()))
.expect_log(Some(LogLevel::Trace), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
.expect_set_buffer_bytes(Some(BufferType::HttpResponseBody), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Trace), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
}
// Drives the filter with a function-calling response that has neither content nor tool calls,
// using the configuration returned by default_config(). The expectations below require the
// filter to log "intent matched: false", then log that no default prompt target was found and
// the request is being forwarded to the upstream llm, and finally to rewrite the HTTP request body.
// NOTE(review): default_config() is defined outside this view; the expected log message implies
// it declares no default prompt target - confirm.
#[test]
#[serial]
fn prompt_gateway_request_no_intent_match() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
// presumably makes any host call not listed in the expectations fail the test - confirm
// against the tester crate
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let mut config: Configuration = serde_yaml::from_str(default_config()).unwrap();
// Raise the token limit of the first ratelimit entry by 1000 before serializing the config.
config.ratelimits.as_mut().unwrap()[0].limit.tokens += 1000;
let config_str = serde_json::to_string(&config).unwrap();
let filter_context = setup_filter(&mut module, &config_str);
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
// Function-calling response whose only choice has no content and no tool calls.
let arch_fc_resp = ChatCompletionsResponse {
usage: Some(Usage {
completion_tokens: 0,
}),
choices: vec![Choice {
finish_reason: Some("test".to_string()),
index: Some(0),
message: Message {
role: "assistant".to_string(),
content: None,
tool_calls: None,
model: None,
tool_call_id: None,
},
}],
model: String::from("test"),
metadata: None,
};
let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
// Deliver the serialized response as the body of the pending http call.
// NOTE(review): the token id 1 looks like the value returned by the http call mocked in
// normal_flow - confirm.
module
.call_proxy_on_http_call_response(http_context, 1, 0, arch_fc_resp_str.len() as i32, 0)
.expect_metric_increment("active_http_calls", -1)
.expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
.returning(Some(&arch_fc_resp_str))
.expect_log(Some(LogLevel::Warn), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), Some("intent matched: false"))
.expect_log(
Some(LogLevel::Info),
Some("no default prompt target found, forwarding request to upstream llm"),
)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Info), None)
// No further http call is expected here; the request body is replaced instead.
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::None)
.unwrap();
}
// Returns the YAML configuration used by the default-target test. Besides the `weather_forecast`
// prompt target it declares a prompt target named `default_target` that is marked `default: true`,
// points at the `weather_forecast_service` endpoint with path `/default_target`, and sets
// `auto_llm_dispatch_on_response: false`.
fn arch_config_default_target() -> &'static str {
r#"
version: "0.1-beta"
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
endpoints:
api_server:
endpoint: api_server:80
connect_timeout: 0.005s
llm_providers:
- name: open-ai-gpt-4
provider_interface: openai
access_key: secret_key
model: gpt-4
default: true
overrides:
# confidence threshold for prompt target intent matching
prompt_target_intent_matching_threshold: 0.0
system_prompt: |
You are a helpful assistant.
prompt_guards:
input_guards:
jailbreak:
on_exception:
message: "Looks like you're curious about my abilities, but I can only provide assistance within my programmed parameters."
prompt_targets:
- name: weather_forecast
description: This function provides realtime weather forecast information for a given city.
parameters:
- name: city
required: true
description: The city for which the weather forecast is requested.
- name: days
description: The number of days for which the weather forecast is requested.
- name: units
description: The units in which the weather forecast is requested.
endpoint:
name: api_server
path: /weather
http_method: POST
system_prompt: |
You are a helpful weather forecaster. Use weater data that is provided to you. Please following following guidelines when responding to user queries:
- Use farenheight for temperature
- Use miles per hour for wind speed
- name: default_target
default: true
description: This is the default target for all unmatched prompts.
endpoint:
name: weather_forecast_service
path: /default_target
http_method: POST
system_prompt: |
You are a helpful assistant! Summarize the user's request and provide a helpful response.
# if it is set to false arch will send response that it received from this prompt target to the user
# if true arch will forward the response to the default LLM
auto_llm_dispatch_on_response: false
ratelimits:
- model: gpt-4
selector:
key: selector-key
value: selector-value
limit:
tokens: 1
unit: minute
"#
}
// Drives the filter with a function-calling response that has neither content nor tool calls,
// using the configuration from arch_config_default_target(), which declares a default prompt
// target. The expectations below require the filter to log "intent matched: false", log that a
// default prompt target was found, and then issue an http call through "arch_internal" with
// path "/default_target" and "weather_forecast_service" as authority and x-arch-upstream.
#[test]
#[serial]
fn prompt_gateway_request_no_intent_match_default_target() {
let args = tester::MockSettings {
wasm_path: wasm_module(),
quiet: false,
// presumably makes any host call not listed in the expectations fail the test - confirm
// against the tester crate
allow_unexpected: false,
};
let mut module = tester::mock(args).unwrap();
module
.call_start()
.execute_and_expect(ReturnType::None)
.unwrap();
// Setup Filter
let mut config: Configuration = serde_yaml::from_str(arch_config_default_target()).unwrap();
// Raise the token limit of the first ratelimit entry by 1000 before serializing the config.
config.ratelimits.as_mut().unwrap()[0].limit.tokens += 1000;
let config_str = serde_json::to_string(&config).unwrap();
let filter_context = setup_filter(&mut module, &config_str);
// Setup HTTP Stream
let http_context = 2;
normal_flow(&mut module, filter_context, http_context);
// Function-calling response whose only choice has no content and no tool calls.
let arch_fc_resp = ChatCompletionsResponse {
usage: Some(Usage {
completion_tokens: 0,
}),
choices: vec![Choice {
finish_reason: Some("test".to_string()),
index: Some(0),
message: Message {
role: "system".to_string(),
content: None,
tool_calls: None,
model: None,
tool_call_id: None,
},
}],
model: String::from("test"),
metadata: None,
};
let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
// Deliver the serialized response as the body of the pending http call.
// NOTE(review): the token id 1 looks like the value returned by the http call mocked in
// normal_flow - confirm.
module
.call_proxy_on_http_call_response(http_context, 1, 0, arch_fc_resp_str.len() as i32, 0)
.expect_metric_increment("active_http_calls", -1)
.expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
.returning(Some(&arch_fc_resp_str))
.expect_log(Some(LogLevel::Warn), None)
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), Some("intent matched: false"))
.expect_log(
Some(LogLevel::Info),
Some("default prompt target found, forwarding request to default prompt target"),
)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), None)
// The request is dispatched to the endpoint configured on the default prompt target.
.expect_http_call(
Some("arch_internal"),
Some(vec![
(":method", "POST"),
("x-arch-upstream", "weather_forecast_service"),
(":path", "/default_target"),
(":authority", "weather_forecast_service"),
("content-type", "application/json"),
("x-envoy-max-retries", "3"),
("x-envoy-upstream-rq-timeout-ms", "30000"),
]),
None,
None,
Some(5000),
)
// Token id handed back for the mocked http call.
.returning(Some(2))
.expect_metric_increment("active_http_calls", 1)
.execute_and_expect(ReturnType::None)
.unwrap();
}

View file

@ -0,0 +1,49 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
<attribute name="test" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" path="target/generated-sources/annotations">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
<attribute name="ignore_optional_problems" value="true"/>
<attribute name="m2e-apt" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/test-classes" path="target/generated-test-sources/test-annotations">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
<attribute name="ignore_optional_problems" value="true"/>
<attribute name="m2e-apt" value="true"/>
<attribute name="test" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/classes"/>
</classpath>

View file

@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>weather-forecast-service</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>
<filteredResources>
<filter>
<id>1742579142020</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.core.resources.regexFilterMatcher</id>
<arguments>node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
</matcher>
</filter>
</filteredResources>
</projectDescription>

View file

@ -0,0 +1,4 @@
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding/<project>=UTF-8

View file

@ -0,0 +1,2 @@
eclipse.preferences.version=1
org.eclipse.jdt.apt.aptEnabled=false

View file

@ -0,0 +1,10 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.methodParameters=generate
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
org.eclipse.jdt.core.compiler.processAnnotations=disabled
org.eclipse.jdt.core.compiler.release=disabled
org.eclipse.jdt.core.compiler.source=1.8

View file

@ -0,0 +1,4 @@
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1

View file

@ -14,5 +14,10 @@ WORKDIR /app
# Copy the built jar from the previous stage
COPY --from=build /app/target/weather-forecast-service-0.0.1-SNAPSHOT.jar app.jar
# Expose the port on which the app runs (default Spring Boot is 8080)
# Expose the application port and the debug port
EXPOSE 8081
ENTRYPOINT ["java", "-jar", "app.jar"]
EXPOSE 5005
# Start the application with remote debugging enabled
ENTRYPOINT ["java", "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005", "-jar", "app.jar"]

View file

@ -1,8 +1,10 @@
version: v0.1
listener:
address: 127.0.0.1
port: 10000 #If you configure port 443, you'll need to update the listener with tls_certificates
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
llm_providers:
@ -43,3 +45,7 @@ prompt_targets:
name: weather_forecast_service
path: /weather
http_method: POST
tracing:
random_sampling: 100
trace_arch_internal: true

View file

@ -5,6 +5,7 @@ services:
dockerfile: Dockerfile
ports:
- "18081:8081"
- "5005:5005"
chatbot_ui:
build:
@ -18,3 +19,11 @@ services:
- "host.docker.internal:host-gateway"
volumes:
- ./arch_config.yaml:/app/arch_config.yaml
jaeger:
build:
context: ../../shared/jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"

View file

@ -35,6 +35,15 @@
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<debug>true</debug>
<debuglevel>lines,vars,source</debuglevel>
</configuration>
</plugin>
</plugins>
</build>
</project>

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o
@ -18,7 +19,7 @@ endpoints:
protocol: https
system_prompt: |
You are a helpful assistant.
You are a helpful assistant. Only respond to queries related to currency exchange. If there are any other questions, I can't help you.
prompt_guards:
input_guards:

View file

@ -0,0 +1,19 @@
POST http://localhost:10000/v1/chat/completions
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "convert 100 eur"
}
]
}
HTTP 200
[Asserts]
header "content-type" == "application/json"
jsonpath "$.model" matches /^gpt-4o/
jsonpath "$.metadata.x-arch-state" != null
jsonpath "$.usage" != null
jsonpath "$.choices[0].message.content" != null
jsonpath "$.choices[0].message.role" == "assistant"

View file

@ -0,0 +1,17 @@
POST http://localhost:10000/v1/chat/completions
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "convert 100 eur"
}
],
"stream": true
}
HTTP 200
[Asserts]
header "content-type" matches /text\/event-stream/
body matches /^data: .*?currency_exchange.*?\n/
body matches /^data: .*?EUR.*?\n/

View file

@ -1,8 +1,10 @@
version: v0.1
listener:
address: 127.0.0.1
port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
llm_providers:

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 127.0.0.1
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
endpoints:
rag_energy_source_agent:

View file

@ -28,7 +28,7 @@ The assistant can perform several key operations, including rebooting devices, a
4. Tell me what can you do for me?"
# Observability
Arch gateway publishes stats endpoint at http://localhost:19901/stats. In this demo we are using prometheus to pull stats from arch and we are using grafana to visalize the stats in dashboard. To see grafana dashboard follow instructions below,
Arch gateway publishes stats endpoint at http://localhost:19901/stats. In this demo we are using prometheus to pull stats from arch and we are using grafana to visualize the stats in dashboard. To see grafana dashboard follow instructions below,
1. Start grafana and prometheus using following command
```yaml

View file

@ -1,15 +1,17 @@
version: v0.1
listener:
address: 127.0.0.1
port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
llm_providers:
- name: OpenAI
provider_interface: openai
access_key: $OPENAI_API_KEY
model: gpt-3.5-turbo
model: gpt-4o
default: true
# default system prompt used by all prompt targets
@ -24,25 +26,26 @@ prompt_targets:
path: /agent/device_summary
http_method: POST
parameters:
- name: device_ids
type: list
description: A list of device identifiers (IDs) to retrieve statistics for.
- name: device_id
type: str
description: A device identifier to retrieve statistics for.
required: true # device_id is required to get device statistics
- name: days
type: int
description: The number of days for which to gather device statistics.
default: "7"
- name: reboot_devices
description: Reboot a list of devices
default: 7
- name: reboot_device
description: Reboot a device
endpoint:
name: app_server
path: /agent/device_reboot
http_method: POST
parameters:
- name: device_ids
type: list
description: A list of device identifiers (IDs).
- name: device_id
type: str
description: the device identifier
required: true
system_prompt: You will get a status JSON object. Simply summarize it
# Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
endpoints:
@ -53,3 +56,8 @@ endpoints:
endpoint: host.docker.internal:18083
# max time to wait for a connection to be established
connect_timeout: 0.005s
tracing:
random_sampling: 100
trace_arch_internal: true

View file

@ -18,3 +18,11 @@ services:
- "host.docker.internal:host-gateway"
volumes:
- ./arch_config.yaml:/app/arch_config.yaml
jaeger:
build:
context: ../../shared/jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"

View file

@ -13,7 +13,7 @@ DEMO_DESCRIPTION = """This demo illustrates how **Arch** can be used to perform
# Define the request model
class DeviceSummaryRequest(BaseModel):
device_ids: List[int]
device_id: str
time_range: Optional[int] = Field(
default=7, description="Time range in days, defaults to 7"
)
@ -21,7 +21,7 @@ class DeviceSummaryRequest(BaseModel):
# Define the response model
class DeviceStatistics(BaseModel):
device_id: int
device_id: str
time_range: str
data: str
@ -33,7 +33,7 @@ class DeviceSummaryResponse(BaseModel):
class DeviceRebootRequest(BaseModel):
device_ids: List[int]
device_id: str
# Response model for the device reboot
@ -49,24 +49,21 @@ def reboot_network_device(request_data: DeviceRebootRequest):
"""
# Access data from the Pydantic model
device_ids = request_data.device_ids
device_id = request_data.device_id
# Validate 'device_ids'
# Validate 'device_id'
# (This is already validated by Pydantic, but additional logic can be added if needed)
if not device_ids:
raise HTTPException(
status_code=400, detail="'device_ids' parameter is required"
)
if not device_id:
raise HTTPException(status_code=400, detail="'device_id' parameter is required")
# Simulate reboot operation and return the response
statistics = []
for device_id in device_ids:
# Placeholder for actual data retrieval or device reboot logic
stats = {"data": f"Device {device_id} has been successfully rebooted."}
statistics.append(stats)
# Placeholder for actual data retrieval or device reboot logic
stats = {"data": f"Device {device_id} has been successfully rebooted."}
statistics.append(stats)
# Return the response with a summary
return CoverageResponse(status="success", summary={"device_ids": device_ids})
return CoverageResponse(status="success", summary={"device_id": device_id})
# Post method for device summary
@ -76,28 +73,20 @@ def get_device_summary(request: DeviceSummaryRequest):
Endpoint to retrieve device statistics based on device IDs and an optional time range.
"""
# Extract 'device_ids' and 'time_range' from the request
device_ids = request.device_ids
# Extract 'device_id' and 'time_range' from the request
device_id = request.device_id
time_range = request.time_range
# Simulate retrieving statistics for the given device IDs and time range
statistics = []
minutes = 1
for device_id in device_ids:
stats = {
"device_id": device_id,
"time_range": f"Last {time_range} days",
"data": f"""Device {device_id} over the last {time_range} days experienced {minutes}
minutes of downtime.""",
}
minutes += 1
statistics.append(DeviceStatistics(**stats))
minutes = 4
stats = {
"device_id": device_id,
"time_range": f"Last {time_range} days",
"data": f"""Device {device_id} over the last {time_range} days experienced {minutes}
minutes of downtime.""",
}
statistics.append(DeviceStatistics(**stats))
return DeviceSummaryResponse(statistics=statistics)
CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT")
client = OpenAI(
api_key="--",
base_url=CHAT_COMPLETION_ENDPOINT,
)

View file

@ -22,9 +22,8 @@ start_demo() {
echo "Starting Arch with arch_config.yaml..."
archgw up arch_config.yaml
# Step 4: Start Network Agent
# Step 4: Start developer services
echo "Starting Network Agent using Docker Compose..."
cd build
docker compose up -d # Run in detached mode
}

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o

View file

@ -1,6 +1,6 @@
# Function calling
This demo shows how you can use Arch's core function calling capabilites.
This demo shows how you can use Arch's core function calling capabilities.
# Starting the demo

View file

@ -1,10 +1,11 @@
version: "0.1-beta"
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
endpoints:
weather_forecast_service:
@ -16,21 +17,17 @@ overrides:
prompt_target_intent_matching_threshold: 0.6
llm_providers:
- name: gpt-4o-mini
access_key: $OPENAI_API_KEY
- name: groq
access_key: $GROQ_API_KEY
provider_interface: openai
model: gpt-4o-mini
default: true
- name: gpt-3.5-turbo-0125
access_key: $OPENAI_API_KEY
provider_interface: openai
model: gpt-3.5-turbo-0125
model: llama-3.2-3b-preview
base_url: https://api.groq.com
- name: gpt-4o
access_key: $OPENAI_API_KEY
provider_interface: openai
model: gpt-4o
default: true
system_prompt: |
You are a helpful assistant.

View file

@ -19,3 +19,5 @@ services:
- CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- ./arch_config.yaml:/app/arch_config.yaml

View file

@ -0,0 +1,19 @@
POST http://localhost:10000/v1/chat/completions
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "how is the weather in seattle for next 5 days"
}
]
}
HTTP 200
[Asserts]
header "content-type" == "application/json"
jsonpath "$.model" matches /^gpt-4o/
jsonpath "$.metadata.x-arch-state" != null
jsonpath "$.usage" != null
jsonpath "$.choices[0].message.content" matches /Seattle/
jsonpath "$.choices[0].message.role" == "assistant"

View file

@ -0,0 +1,17 @@
POST http://localhost:10000/v1/chat/completions
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "how is the weather in seattle for next 5 days"
}
],
"stream": true
}
HTTP 200
[Asserts]
header "content-type" matches /text\/event-stream/
body matches "(?s).*\"name\":\"get_current_weather\".*"
body matches "(?s).*\"model\":\"gpt-4o-mini.*"

View file

@ -73,7 +73,7 @@ async def weather(req: WeatherRequest, res: Response):
class DefaultTargetRequest(BaseModel):
messages: list
messages: list = []
@app.post("/default_target")
@ -86,12 +86,9 @@ async def default_target(req: DefaultTargetRequest, res: Response):
"role": "assistant",
"content": "I can help you with weather forecast",
},
"finish_reason": "completed",
"index": 0,
}
],
"model": "api_server",
"usage": {"completion_tokens": 0},
}
logger.info(f"sending response: {json.dumps(resp)}")
return resp

View file

@ -15,7 +15,7 @@
"LLM": "1",
"CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1",
"STREAMING": "True",
"ARCH_CONFIG": "../../weather_forecast/arch_config.yaml"
"ARCH_CONFIG": "../../samples_python/weather_forecast/arch_config.yaml"
}
},
{
@ -29,7 +29,7 @@
"LLM": "1",
"CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1",
"STREAMING": "True",
"ARCH_CONFIG": "../../llm_routing/arch_config.yaml"
"ARCH_CONFIG": "../../samples_python/weather_forecast/arch_config.yaml"
}
},
]

View file

@ -38,7 +38,7 @@ def chat(
try:
response = client.chat.completions.create(
# we select model from arch_config file
model="--",
model="None",
messages=history,
temperature=1.0,
stream=True,
@ -120,8 +120,11 @@ def process_stream_chunk(chunk, history):
if delta.content:
# append content to the last history item
history[-1]["content"] = history[-1].get("content", "") + delta.content
if history[-1]["model"] != "Arch-Function-Chat":
history[-1]["content"] = history[-1].get("content", "") + delta.content
# yield content if it is from assistant
if history[-1]["model"] == "Arch-Function":
return None
if history[-1]["role"] == "assistant":
return delta.content

View file

@ -54,13 +54,13 @@ def chat(
if model_selector and model_selector != "":
headers["x-arch-llm-provider-hint"] = model_selector
client = OpenAI(
api_key="--",
api_key="None",
base_url=CHAT_COMPLETION_ENDPOINT,
default_headers=headers,
)
response = client.chat.completions.create(
# we select model from arch_config file
model="--",
model="None",
messages=history,
temperature=1.0,
stream=True,
@ -88,6 +88,22 @@ def chat(
yield "", conversation, history, debug_output, model_selector
# update assistant response to have correct format
# arch-fc 1.1 expects following format:
# {
# "response": "<assistant response>",
# }
# and this entire block needs to be encoded in ```json\n{json_encoded_content}\n```
if not history[-1]["model"].startswith("Arch"):
assistant_response = {
"response": history[-1]["content"],
}
history[-1]["content"] = "```json\n{}\n```".format(
json.dumps(assistant_response)
)
log.info("history: {}".format(json.dumps(history)))
def main():
with gr.Blocks(

View file

@ -8,11 +8,13 @@ do
echo "Running tests for $demo ..."
echo "****************************************"
cd ../../samples_python/$demo
echo "starting archgw"
archgw up arch_config.yaml
docker compose up -d
cd ../../shared/test_runner
TEST_DATA=../../samples_python/$demo/test_data.yaml poetry run pytest
cd ../../samples_python/$demo
echo "starting docker containers"
# Silence both streams: stdout must be redirected first so that 2>&1 follows it to /dev/null
# (the previous "2>&1 > /dev/null" order left stderr attached to the terminal).
docker compose up -d > /dev/null 2>&1
echo "starting hurl tests"
hurl --test hurl_tests/*.hurl
echo "stopping docker containers and archgw"
archgw down
docker compose down -v
cd ../../shared/test_runner

View file

@ -1,10 +1,11 @@
version: "0.1-beta"
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
egress_traffic:
address: 0.0.0.0
port: 12000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o-mini
@ -13,11 +14,6 @@ llm_providers:
model: gpt-4o-mini
default: true
- name: gpt-3.5-turbo-0125
access_key: $OPENAI_API_KEY
provider_interface: openai
model: gpt-3.5-turbo-0125
- name: gpt-4o
access_key: $OPENAI_API_KEY
provider_interface: openai
@ -28,5 +24,17 @@ llm_providers:
provider_interface: mistral
model: ministral-3b-latest
- name: deepseek
access_key: $DEEPSEEK_API_KEY
provider_interface: openai
model: deepseek-reasoner
base_url: https://api.deepseek.com/
- name: groq
access_key: $GROQ_API_KEY
provider_interface: openai
model: llama-3.1-8b-instant
base_url: https://api.groq.com
tracing:
random_sampling: 100

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
egress_traffic:
address: 0.0.0.0
port: 12000
message_format: openai
timeout: 30s
llm_providers:

View file

@ -6,7 +6,7 @@ services:
- "18080:8080"
environment:
# this is only because we are running the sample app in the same docker container environment as archgw
- CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1
- CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:12000/v1
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:

View file

@ -0,0 +1,41 @@
# took inspiration from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0

# The builder image, used to build the virtual environment
FROM python:3.10 AS builder

RUN pip install poetry==1.8.3

# Keep the virtualenv inside the project directory (/code/.venv) so the runtime
# stage can copy it, and point poetry's cache at a directory we delete after install.
ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /code

COPY pyproject.toml poetry.lock ./
RUN touch README.md

# Install dependencies only (--no-root) and drop the cache in the same layer
RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR

# The runtime image, used to just run the code provided its virtual environment
FROM python:3.10-slim AS runtime

# curl is only needed for the HEALTHCHECK below; remove the apt package lists
# in the same layer so they do not end up in the final image.
RUN apt-get update \
    && apt-get install -y curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /code

ENV VIRTUAL_ENV=/code/.venv \
    PATH="/code/.venv/bin:$PATH"

COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

COPY main.py ./

# -f makes curl exit non-zero on HTTP error responses (>= 400); without it an
# error response from /healthz would still be reported as healthy.
HEALTHCHECK \
    --interval=5s \
    --timeout=1s \
    --start-period=1s \
    --retries=3 \
    CMD curl -f http://localhost:80/healthz || exit 1

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--log-level", "debug"]

View file

@ -0,0 +1,46 @@
version: "0.1-beta"

listeners:
  # Listener the chatbot UI and the hurl tests send requests to (port 10000).
  ingress_traffic:
    address: 0.0.0.0
    port: 10000
    message_format: openai
    timeout: 30s

  # Listener the agent service (main.py) uses as its OpenAI base_url (port 12000).
  egress_traffic:
    address: 0.0.0.0
    port: 12000
    message_format: openai
    timeout: 30s

overrides:
  use_agent_orchestrator: true

endpoints:
  # Host port 18083 is published by the triage_service container in docker-compose.
  agent_gateway:
    endpoint: host.docker.internal:18083
    connect_timeout: 0.005s

llm_providers:
  - name: gpt-4o-mini
    access_key: $OPENAI_API_KEY
    provider_interface: openai
    model: gpt-4o-mini
    default: true

system_prompt: |
  You are a helpful assistant.

# Target names match the agent keys defined in main.py (AGENTS).
prompt_targets:
  - name: sales_agent
    description: handles queries related to sales and purchases

  - name: issues_and_repairs
    description: handles issues, repairs, or refunds

  - name: escalate_to_human
    description: escalates to human agent

tracing:
  random_sampling: 100
  trace_arch_internal: true

View file

@ -0,0 +1,29 @@
services:
  # Agent service built from this directory; container port 80 is published on host port 18083.
  triage_service:
    build:
      context: ./
    environment:
      - OLTP_HOST=http://jaeger:4317
    extra_hosts:
      - "host.docker.internal:host-gateway"
    ports:
      - "18083:80"

  chatbot_ui:
    build:
      context: ../../shared/chatbot_ui
    ports:
      - "18080:8080"
    environment:
      # this is only because we are running the sample app in the same docker container environment as archgw
      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1
    extra_hosts:
      - "host.docker.internal:host-gateway"

  jaeger:
    build:
      context: ../../shared/jaeger
    ports:
      - "16686:16686"
      - "4317:4317"
      - "4318:4318"

View file

@ -0,0 +1,19 @@
# Non-streaming request describing a faulty purchase, sent to the listener on port 10000.
# The model assertion /^gpt-4o-2/ accepts names such as "gpt-4o-2..." and rejects
# "gpt-4o-mini"; presumably this checks the request was served by the agent configured
# with model "gpt-4o" (issues_and_repairs in main.py) — confirm against the routing config.
POST http://localhost:10000/v1/chat/completions
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "I bought a package recently and it not working properly"
}
]
}
HTTP 200
[Asserts]
header "content-type" == "application/json"
jsonpath "$.model" matches /^gpt-4o-2/
jsonpath "$.metadata.x-arch-state" != null
jsonpath "$.usage" != null
jsonpath "$.choices[0].message.content" != null
jsonpath "$.choices[0].message.role" == "assistant"

View file

@ -0,0 +1,19 @@
# Non-streaming sales request sent to the listener on port 10000.
# Passes when the reply is JSON, the reported model name starts with "gpt-4o-mini",
# metadata carries a non-null "x-arch-state", usage is present, and the first choice
# is a non-null assistant message.
POST http://localhost:10000/v1/chat/completions
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "I want to sell red shoes"
}
]
}
HTTP 200
[Asserts]
header "content-type" == "application/json"
jsonpath "$.model" matches /^gpt-4o-mini/
jsonpath "$.metadata.x-arch-state" != null
jsonpath "$.usage" != null
jsonpath "$.choices[0].message.content" != null
jsonpath "$.choices[0].message.role" == "assistant"

View file

@ -0,0 +1,16 @@
# Streaming ("stream": true) variant of the sales request on port 10000.
# Passes when the response is served as text/event-stream and the body begins with a
# "data: " line that contains the text "sales_agent".
POST http://localhost:10000/v1/chat/completions
Content-Type: application/json
{
"messages": [
{
"role": "user",
"content": "I want to sell red shoes"
}
],
"stream": true
}
HTTP 200
[Asserts]
header "content-type" matches /text\/event-stream/
body matches /^data: .*?sales_agent.*?\n/

View file

@ -0,0 +1,115 @@
import logging
import json
from typing import List, Dict, Any
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import openai
# Setup logging: root logger at INFO, and reuse the "uvicorn.error" logger for
# this module's own messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("uvicorn.error")
# FastAPI application object; the route decorators in this module register on it.
app = FastAPI()
class Message(BaseModel):
    """One entry of the `messages` array in a chat-completions request."""

    # Speaker of the message (the hurl tests send "user"; Agent.handle prepends "system").
    role: str
    # Text of the message.
    content: str
class ChatCompletionsRequest(BaseModel):
    """Request body accepted by POST /v1/chat/completions."""

    # Conversation so far, oldest first.
    messages: List[Message]
    # Model requested by the caller; an Agent with its own model overrides it.
    model: str
    # Free-form metadata; completion_api reads the "agent-name" key from it.
    metadata: Dict[str, Any] = {}
    # When true, the response is sent as a text/event-stream.
    stream: bool = False
# Shared OpenAI client for all agents. Requests go to port 12000 on the docker
# host rather than to OpenAI directly, so the API key here is only a placeholder.
openai_client = openai.OpenAI(
    api_key="None",  # archgw picks the API key from the config file
    base_url="http://host.docker.internal:12000/v1",
)
def call_openai(messages: List[Dict[str, str]], stream: bool, model: str):
    """Send a chat completion through the shared client.

    Args:
        messages: Chat messages as plain dicts (role/content).
        stream: Whether to request and relay a streamed completion.
        model: Model name forwarded in the request.

    Returns:
        The completion object as returned by the client when ``stream`` is
        false; otherwise a ``StreamingResponse`` emitting server-sent events.
    """
    logger.info(f"llm agent model: {model}")
    completion = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        stream=stream,
    )

    if not stream:
        return completion

    def sse_events():
        # Relay only chunks that carry a delta; each is one "data:" event.
        for chunk in completion:
            if chunk.choices and chunk.choices[0].delta:
                payload = json.dumps(chunk.model_dump())
                yield "data: " + payload + "\n\n"
        # Terminator event once the upstream stream is exhausted.
        yield "data: [DONE]" + "\n\n"

    return StreamingResponse(sse_events(), media_type="text/event-stream")
class Agent:
    """A role-specific assistant defined by a system prompt and an optional model."""

    def __init__(self, role: str, instructions: str, model: str = ""):
        """Store the model override and build the system prompt from role and instructions."""
        self.model = model
        self.system_prompt = f"You are a {role}.\n{instructions}"

    def get_system_prompt(self) -> str:
        """Return the system prompt built in ``__init__``."""
        return self.system_prompt

    def handle(self, req: ChatCompletionsRequest):
        """Answer ``req`` with this agent's system prompt prepended to the conversation.

        The agent's own model is used when set; otherwise the model named in
        the request is forwarded unchanged.
        """
        conversation = [{"role": "system", "content": self.get_system_prompt()}]
        conversation.extend(message.model_dump() for message in req.messages)
        chosen_model = self.model or req.model
        return call_openai(conversation, req.stream, chosen_model)
# Define your agents
# Keys are the names looked up from the request's "agent-name" metadata;
# "unknown_agent" is the default used when that metadata key is absent.
AGENTS = {
    "sales_agent": Agent(
        role="sales agent",
        instructions=(
            "Always answer in a sentence or less.\n"
            "Follow the following routine with the user:\n"
            "1. Engage\n"
            "2. Quote ridiculous price\n"
            "3. Reveal caveat if user agrees."
        ),
        model="gpt-4o-mini",
    ),
    "issues_and_repairs": Agent(
        role="issues and repairs agent",
        instructions="Propose a solution, offer refund if necessary.",
        model="gpt-4o",
    ),
    "escalate_to_human": Agent(
        role="human escalation agent",
        instructions="Escalate issues to a human.",
        # skipping model name here as arch gateway will pick the default model from the config file
    ),
    "unknown_agent": Agent(
        role="general assistant",
        instructions="Assist the user in general queries.",
    ),
}
@app.post("/v1/chat/completions")
def completion_api(req: ChatCompletionsRequest, request: Request):
    """Dispatch a chat-completions request to the agent named in its metadata.

    The agent name is read from ``req.metadata["agent-name"]``. A missing,
    null, or unrecognised name falls back to ``"unknown_agent"`` instead of
    failing with an AttributeError on a ``None`` agent.

    Args:
        req: Parsed request body.
        request: Raw FastAPI request (not used by this handler).

    Returns:
        Whatever the selected agent's ``handle`` returns.
    """
    agent_name = req.metadata.get("agent-name", "unknown_agent")
    agent = AGENTS.get(agent_name)
    if agent is None:
        # AGENTS.get() yields None for names that are not registered (or a null value).
        logger.warning(f"Unknown agent requested: {agent_name}, using unknown_agent")
        agent_name = "unknown_agent"
        agent = AGENTS[agent_name]
    logger.info(f"Routing to agent: {agent_name}")
    return agent.handle(req)
@app.get("/healthz")
async def healthz():
    """Health endpoint polled by the container HEALTHCHECK; always reports ok."""
    return {"status": "ok"}

View file

@ -0,0 +1,573 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "annotated-types"
version = "0.7.0"
description = "Reusable constraint types to use with typing.Annotated"
optional = false
python-versions = ">=3.8"
files = [
{file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"},
{file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"},
]
[[package]]
name = "anyio"
version = "4.9.0"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
optional = false
python-versions = ">=3.9"
files = [
{file = "anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c"},
{file = "anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028"},
]
[package.dependencies]
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
idna = ">=2.8"
sniffio = ">=1.1"
typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""}
[package.extras]
doc = ["Sphinx (>=8.2,<9.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx_rtd_theme"]
test = ["anyio[trio]", "blockbuster (>=1.5.23)", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21)"]
trio = ["trio (>=0.26.1)"]
[[package]]
name = "certifi"
version = "2025.1.31"
description = "Python package for providing Mozilla's CA Bundle."
optional = false
python-versions = ">=3.6"
files = [
{file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"},
{file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"},
]
[[package]]
name = "click"
version = "8.1.8"
description = "Composable command line interface toolkit"
optional = false
python-versions = ">=3.7"
files = [
{file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"},
{file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
files = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
[[package]]
name = "distro"
version = "1.9.0"
description = "Distro - an OS platform information API"
optional = false
python-versions = ">=3.6"
files = [
{file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"},
{file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
]
[[package]]
name = "exceptiongroup"
version = "1.2.2"
description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
files = [
{file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},
{file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"},
]
[package.extras]
test = ["pytest (>=6)"]
[[package]]
name = "fastapi"
version = "0.115.11"
description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
optional = false
python-versions = ">=3.8"
files = [
{file = "fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64"},
{file = "fastapi-0.115.11.tar.gz", hash = "sha256:cc81f03f688678b92600a65a5e618b93592c65005db37157147204d8924bf94f"},
]
[package.dependencies]
pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0"
starlette = ">=0.40.0,<0.47.0"
typing-extensions = ">=4.8.0"
[package.extras]
all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=3.1.5)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.18)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "jinja2 (>=3.1.5)", "python-multipart (>=0.0.18)", "uvicorn[standard] (>=0.12.0)"]
[[package]]
name = "h11"
version = "0.14.0"
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
optional = false
python-versions = ">=3.7"
files = [
{file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
]
[[package]]
name = "httpcore"
version = "1.0.7"
description = "A minimal low-level HTTP client."
optional = false
python-versions = ">=3.8"
files = [
{file = "httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd"},
{file = "httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c"},
]
[package.dependencies]
certifi = "*"
h11 = ">=0.13,<0.15"
[package.extras]
asyncio = ["anyio (>=4.0,<5.0)"]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"]
trio = ["trio (>=0.22.0,<1.0)"]
[[package]]
name = "httpx"
version = "0.28.1"
description = "The next generation HTTP client."
optional = false
python-versions = ">=3.8"
files = [
{file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"},
{file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"},
]
[package.dependencies]
anyio = "*"
certifi = "*"
httpcore = "==1.*"
idna = "*"
[package.extras]
brotli = ["brotli", "brotlicffi"]
cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"]
zstd = ["zstandard (>=0.18.0)"]
[[package]]
name = "idna"
version = "3.10"
description = "Internationalized Domain Names in Applications (IDNA)"
optional = false
python-versions = ">=3.6"
files = [
{file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"},
{file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"},
]
[package.extras]
all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
[[package]]
name = "jiter"
version = "0.9.0"
description = "Fast iterable JSON parser."
optional = false
python-versions = ">=3.8"
files = [
{file = "jiter-0.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:816ec9b60fdfd1fec87da1d7ed46c66c44ffec37ab2ef7de5b147b2fce3fd5ad"},
{file = "jiter-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9b1d3086f8a3ee0194ecf2008cf81286a5c3e540d977fa038ff23576c023c0ea"},
{file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1339f839b91ae30b37c409bf16ccd3dc453e8b8c3ed4bd1d6a567193651a4a51"},
{file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ffba79584b3b670fefae66ceb3a28822365d25b7bf811e030609a3d5b876f538"},
{file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cfc7d0a8e899089d11f065e289cb5b2daf3d82fbe028f49b20d7b809193958d"},
{file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e00a1a2bbfaaf237e13c3d1592356eab3e9015d7efd59359ac8b51eb56390a12"},
{file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1d9870561eb26b11448854dce0ff27a9a27cb616b632468cafc938de25e9e51"},
{file = "jiter-0.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9872aeff3f21e437651df378cb75aeb7043e5297261222b6441a620218b58708"},
{file = "jiter-0.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1fd19112d1049bdd47f17bfbb44a2c0001061312dcf0e72765bfa8abd4aa30e5"},
{file = "jiter-0.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6ef5da104664e526836070e4a23b5f68dec1cc673b60bf1edb1bfbe8a55d0678"},
{file = "jiter-0.9.0-cp310-cp310-win32.whl", hash = "sha256:cb12e6d65ebbefe5518de819f3eda53b73187b7089040b2d17f5b39001ff31c4"},
{file = "jiter-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:c43ca669493626d8672be3b645dbb406ef25af3f4b6384cfd306da7eb2e70322"},
{file = "jiter-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6c4d99c71508912a7e556d631768dcdef43648a93660670986916b297f1c54af"},
{file = "jiter-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8f60fb8ce7df529812bf6c625635a19d27f30806885139e367af93f6e734ef58"},
{file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51c4e1a4f8ea84d98b7b98912aa4290ac3d1eabfde8e3c34541fae30e9d1f08b"},
{file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f4c677c424dc76684fea3e7285a7a2a7493424bea89ac441045e6a1fb1d7b3b"},
{file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2221176dfec87f3470b21e6abca056e6b04ce9bff72315cb0b243ca9e835a4b5"},
{file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c7adb66f899ffa25e3c92bfcb593391ee1947dbdd6a9a970e0d7e713237d572"},
{file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c98d27330fdfb77913c1097a7aab07f38ff2259048949f499c9901700789ac15"},
{file = "jiter-0.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eda3f8cc74df66892b1d06b5d41a71670c22d95a1ca2cbab73654745ce9d0419"},
{file = "jiter-0.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:dd5ab5ddc11418dce28343123644a100f487eaccf1de27a459ab36d6cca31043"},
{file = "jiter-0.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42f8a68a69f047b310319ef8e2f52fdb2e7976fb3313ef27df495cf77bcad965"},
{file = "jiter-0.9.0-cp311-cp311-win32.whl", hash = "sha256:a25519efb78a42254d59326ee417d6f5161b06f5da827d94cf521fed961b1ff2"},
{file = "jiter-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:923b54afdd697dfd00d368b7ccad008cccfeb1efb4e621f32860c75e9f25edbd"},
{file = "jiter-0.9.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7b46249cfd6c48da28f89eb0be3f52d6fdb40ab88e2c66804f546674e539ec11"},
{file = "jiter-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:609cf3c78852f1189894383cf0b0b977665f54cb38788e3e6b941fa6d982c00e"},
{file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d726a3890a54561e55a9c5faea1f7655eda7f105bd165067575ace6e65f80bb2"},
{file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e89dc075c1fef8fa9be219e249f14040270dbc507df4215c324a1839522ea75"},
{file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04e8ffa3c353b1bc4134f96f167a2082494351e42888dfcf06e944f2729cbe1d"},
{file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:203f28a72a05ae0e129b3ed1f75f56bc419d5f91dfacd057519a8bd137b00c42"},
{file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fca1a02ad60ec30bb230f65bc01f611c8608b02d269f998bc29cca8619a919dc"},
{file = "jiter-0.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:237e5cee4d5d2659aaf91bbf8ec45052cc217d9446070699441a91b386ae27dc"},
{file = "jiter-0.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:528b6b71745e7326eed73c53d4aa57e2a522242320b6f7d65b9c5af83cf49b6e"},
{file = "jiter-0.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9f48e86b57bc711eb5acdfd12b6cb580a59cc9a993f6e7dcb6d8b50522dcd50d"},
{file = "jiter-0.9.0-cp312-cp312-win32.whl", hash = "sha256:699edfde481e191d81f9cf6d2211debbfe4bd92f06410e7637dffb8dd5dfde06"},
{file = "jiter-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:099500d07b43f61d8bd780466d429c45a7b25411b334c60ca875fa775f68ccb0"},
{file = "jiter-0.9.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:2764891d3f3e8b18dce2cff24949153ee30c9239da7c00f032511091ba688ff7"},
{file = "jiter-0.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:387b22fbfd7a62418d5212b4638026d01723761c75c1c8232a8b8c37c2f1003b"},
{file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d8da8629ccae3606c61d9184970423655fb4e33d03330bcdfe52d234d32f69"},
{file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1be73d8982bdc278b7b9377426a4b44ceb5c7952073dd7488e4ae96b88e1103"},
{file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2228eaaaa111ec54b9e89f7481bffb3972e9059301a878d085b2b449fbbde635"},
{file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:11509bfecbc319459647d4ac3fd391d26fdf530dad00c13c4dadabf5b81f01a4"},
{file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f22238da568be8bbd8e0650e12feeb2cfea15eda4f9fc271d3b362a4fa0604d"},
{file = "jiter-0.9.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17f5d55eb856597607562257c8e36c42bc87f16bef52ef7129b7da11afc779f3"},
{file = "jiter-0.9.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:6a99bed9fbb02f5bed416d137944419a69aa4c423e44189bc49718859ea83bc5"},
{file = "jiter-0.9.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e057adb0cd1bd39606100be0eafe742de2de88c79df632955b9ab53a086b3c8d"},
{file = "jiter-0.9.0-cp313-cp313-win32.whl", hash = "sha256:f7e6850991f3940f62d387ccfa54d1a92bd4bb9f89690b53aea36b4364bcab53"},
{file = "jiter-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:c8ae3bf27cd1ac5e6e8b7a27487bf3ab5f82318211ec2e1346a5b058756361f7"},
{file = "jiter-0.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f0b2827fb88dda2cbecbbc3e596ef08d69bda06c6f57930aec8e79505dc17001"},
{file = "jiter-0.9.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:062b756ceb1d40b0b28f326cba26cfd575a4918415b036464a52f08632731e5a"},
{file = "jiter-0.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6f7838bc467ab7e8ef9f387bd6de195c43bad82a569c1699cb822f6609dd4cdf"},
{file = "jiter-0.9.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4a2d16360d0642cd68236f931b85fe50288834c383492e4279d9f1792e309571"},
{file = "jiter-0.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e84ed1c9c9ec10bbb8c37f450077cbe3c0d4e8c2b19f0a49a60ac7ace73c7452"},
{file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f3c848209ccd1bfa344a1240763975ca917de753c7875c77ec3034f4151d06c"},
{file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7825f46e50646bee937e0f849d14ef3a417910966136f59cd1eb848b8b5bb3e4"},
{file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d82a811928b26d1a6311a886b2566f68ccf2b23cf3bfed042e18686f1f22c2d7"},
{file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c058ecb51763a67f019ae423b1cbe3fa90f7ee6280c31a1baa6ccc0c0e2d06e"},
{file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9897115ad716c48f0120c1f0c4efae348ec47037319a6c63b2d7838bb53aaef4"},
{file = "jiter-0.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:351f4c90a24c4fb8c87c6a73af2944c440494ed2bea2094feecacb75c50398ae"},
{file = "jiter-0.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:d45807b0f236c485e1e525e2ce3a854807dfe28ccf0d013dd4a563395e28008a"},
{file = "jiter-0.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1537a890724ba00fdba21787010ac6f24dad47f763410e9e1093277913592784"},
{file = "jiter-0.9.0-cp38-cp38-win32.whl", hash = "sha256:e3630ec20cbeaddd4b65513fa3857e1b7c4190d4481ef07fb63d0fad59033321"},
{file = "jiter-0.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:2685f44bf80e95f8910553bf2d33b9c87bf25fceae6e9f0c1355f75d2922b0ee"},
{file = "jiter-0.9.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:9ef340fae98065071ccd5805fe81c99c8f80484e820e40043689cf97fb66b3e2"},
{file = "jiter-0.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:efb767d92c63b2cd9ec9f24feeb48f49574a713870ec87e9ba0c2c6e9329c3e2"},
{file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:113f30f87fb1f412510c6d7ed13e91422cfd329436364a690c34c8b8bd880c42"},
{file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8793b6df019b988526f5a633fdc7456ea75e4a79bd8396a3373c371fc59f5c9b"},
{file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7a9aaa5102dba4e079bb728076fadd5a2dca94c05c04ce68004cfd96f128ea34"},
{file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d838650f6ebaf4ccadfb04522463e74a4c378d7e667e0eb1865cfe3990bfac49"},
{file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0194f813efdf4b8865ad5f5c5f50f8566df7d770a82c51ef593d09e0b347020"},
{file = "jiter-0.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a7954a401d0a8a0b8bc669199db78af435aae1e3569187c2939c477c53cb6a0a"},
{file = "jiter-0.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4feafe787eb8a8d98168ab15637ca2577f6ddf77ac6c8c66242c2d028aa5420e"},
{file = "jiter-0.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:27cd1f2e8bb377f31d3190b34e4328d280325ad7ef55c6ac9abde72f79e84d2e"},
{file = "jiter-0.9.0-cp39-cp39-win32.whl", hash = "sha256:161d461dcbe658cf0bd0aa375b30a968b087cdddc624fc585f3867c63c6eca95"},
{file = "jiter-0.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:e8b36d8a16a61993be33e75126ad3d8aa29cf450b09576f3c427d27647fcb4aa"},
{file = "jiter-0.9.0.tar.gz", hash = "sha256:aadba0964deb424daa24492abc3d229c60c4a31bfee205aedbf1acc7639d7893"},
]
[[package]]
name = "openai"
version = "1.66.5"
description = "The official Python library for the openai API"
optional = false
python-versions = ">=3.8"
files = [
{file = "openai-1.66.5-py3-none-any.whl", hash = "sha256:74be528175f8389f67675830c51a15bd51e874425c86d3de6153bf70ed6c2884"},
{file = "openai-1.66.5.tar.gz", hash = "sha256:f61b8fac29490ca8fdc6d996aa6926c18dbe5639536f8c40219c40db05511b11"},
]
[package.dependencies]
anyio = ">=3.5.0,<5"
distro = ">=1.7.0,<2"
httpx = ">=0.23.0,<1"
jiter = ">=0.4.0,<1"
pydantic = ">=1.9.0,<3"
sniffio = "*"
tqdm = ">4"
typing-extensions = ">=4.11,<5"
[package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
realtime = ["websockets (>=13,<15)"]
[[package]]
name = "pydantic"
version = "2.10.6"
description = "Data validation using Python type hints"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584"},
{file = "pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236"},
]
[package.dependencies]
annotated-types = ">=0.6.0"
pydantic-core = "2.27.2"
typing-extensions = ">=4.12.2"
[package.extras]
email = ["email-validator (>=2.0.0)"]
timezone = ["tzdata"]
[[package]]
name = "pydantic-core"
version = "2.27.2"
description = "Core functionality for Pydantic validation and serialization"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"},
{file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"},
{file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7969e133a6f183be60e9f6f56bfae753585680f3b7307a8e555a948d443cc05a"},
{file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3de9961f2a346257caf0aa508a4da705467f53778e9ef6fe744c038119737ef5"},
{file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2bb4d3e5873c37bb3dd58714d4cd0b0e6238cebc4177ac8fe878f8b3aa8e74c"},
{file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:280d219beebb0752699480fe8f1dc61ab6615c2046d76b7ab7ee38858de0a4e7"},
{file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47956ae78b6422cbd46f772f1746799cbb862de838fd8d1fbd34a82e05b0983a"},
{file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:14d4a5c49d2f009d62a2a7140d3064f686d17a5d1a268bc641954ba181880236"},
{file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:337b443af21d488716f8d0b6164de833e788aa6bd7e3a39c005febc1284f4962"},
{file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:03d0f86ea3184a12f41a2d23f7ccb79cdb5a18e06993f8a45baa8dfec746f0e9"},
{file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7041c36f5680c6e0f08d922aed302e98b3745d97fe1589db0a3eebf6624523af"},
{file = "pydantic_core-2.27.2-cp310-cp310-win32.whl", hash = "sha256:50a68f3e3819077be2c98110c1f9dcb3817e93f267ba80a2c05bb4f8799e2ff4"},
{file = "pydantic_core-2.27.2-cp310-cp310-win_amd64.whl", hash = "sha256:e0fd26b16394ead34a424eecf8a31a1f5137094cabe84a1bcb10fa6ba39d3d31"},
{file = "pydantic_core-2.27.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8e10c99ef58cfdf2a66fc15d66b16c4a04f62bca39db589ae8cba08bc55331bc"},
{file = "pydantic_core-2.27.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:26f32e0adf166a84d0cb63be85c562ca8a6fa8de28e5f0d92250c6b7e9e2aff7"},
{file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c19d1ea0673cd13cc2f872f6c9ab42acc4e4f492a7ca9d3795ce2b112dd7e15"},
{file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e68c4446fe0810e959cdff46ab0a41ce2f2c86d227d96dc3847af0ba7def306"},
{file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9640b0059ff4f14d1f37321b94061c6db164fbe49b334b31643e0528d100d99"},
{file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40d02e7d45c9f8af700f3452f329ead92da4c5f4317ca9b896de7ce7199ea459"},
{file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c1fd185014191700554795c99b347d64f2bb637966c4cfc16998a0ca700d048"},
{file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d81d2068e1c1228a565af076598f9e7451712700b673de8f502f0334f281387d"},
{file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a4207639fb02ec2dbb76227d7c751a20b1a6b4bc52850568e52260cae64ca3b"},
{file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:3de3ce3c9ddc8bbd88f6e0e304dea0e66d843ec9de1b0042b0911c1663ffd474"},
{file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:30c5f68ded0c36466acede341551106821043e9afaad516adfb6e8fa80a4e6a6"},
{file = "pydantic_core-2.27.2-cp311-cp311-win32.whl", hash = "sha256:c70c26d2c99f78b125a3459f8afe1aed4d9687c24fd677c6a4436bc042e50d6c"},
{file = "pydantic_core-2.27.2-cp311-cp311-win_amd64.whl", hash = "sha256:08e125dbdc505fa69ca7d9c499639ab6407cfa909214d500897d02afb816e7cc"},
{file = "pydantic_core-2.27.2-cp311-cp311-win_arm64.whl", hash = "sha256:26f0d68d4b235a2bae0c3fc585c585b4ecc51382db0e3ba402a22cbc440915e4"},
{file = "pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0"},
{file = "pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef"},
{file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7"},
{file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934"},
{file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6"},
{file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c"},
{file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2"},
{file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4"},
{file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3"},
{file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4"},
{file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57"},
{file = "pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc"},
{file = "pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9"},
{file = "pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b"},
{file = "pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b"},
{file = "pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154"},
{file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9"},
{file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9"},
{file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1"},
{file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a"},
{file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e"},
{file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4"},
{file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27"},
{file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee"},
{file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1"},
{file = "pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130"},
{file = "pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee"},
{file = "pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b"},
{file = "pydantic_core-2.27.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d3e8d504bdd3f10835468f29008d72fc8359d95c9c415ce6e767203db6127506"},
{file = "pydantic_core-2.27.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:521eb9b7f036c9b6187f0b47318ab0d7ca14bd87f776240b90b21c1f4f149320"},
{file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85210c4d99a0114f5a9481b44560d7d1e35e32cc5634c656bc48e590b669b145"},
{file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d716e2e30c6f140d7560ef1538953a5cd1a87264c737643d481f2779fc247fe1"},
{file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f66d89ba397d92f840f8654756196d93804278457b5fbede59598a1f9f90b228"},
{file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:669e193c1c576a58f132e3158f9dfa9662969edb1a250c54d8fa52590045f046"},
{file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdbe7629b996647b99c01b37f11170a57ae675375b14b8c13b8518b8320ced5"},
{file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d262606bf386a5ba0b0af3b97f37c83d7011439e3dc1a9298f21efb292e42f1a"},
{file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:cabb9bcb7e0d97f74df8646f34fc76fbf793b7f6dc2438517d7a9e50eee4f14d"},
{file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_armv7l.whl", hash = "sha256:d2d63f1215638d28221f664596b1ccb3944f6e25dd18cd3b86b0a4c408d5ebb9"},
{file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bca101c00bff0adb45a833f8451b9105d9df18accb8743b08107d7ada14bd7da"},
{file = "pydantic_core-2.27.2-cp38-cp38-win32.whl", hash = "sha256:f6f8e111843bbb0dee4cb6594cdc73e79b3329b526037ec242a3e49012495b3b"},
{file = "pydantic_core-2.27.2-cp38-cp38-win_amd64.whl", hash = "sha256:fd1aea04935a508f62e0d0ef1f5ae968774a32afc306fb8545e06f5ff5cdf3ad"},
{file = "pydantic_core-2.27.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c10eb4f1659290b523af58fa7cffb452a61ad6ae5613404519aee4bfbf1df993"},
{file = "pydantic_core-2.27.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef592d4bad47296fb11f96cd7dc898b92e795032b4894dfb4076cfccd43a9308"},
{file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c61709a844acc6bf0b7dce7daae75195a10aac96a596ea1b776996414791ede4"},
{file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c5f762659e47fdb7b16956c71598292f60a03aa92f8b6351504359dbdba6cf"},
{file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c9775e339e42e79ec99c441d9730fccf07414af63eac2f0e48e08fd38a64d76"},
{file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57762139821c31847cfb2df63c12f725788bd9f04bc2fb392790959b8f70f118"},
{file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d1e85068e818c73e048fe28cfc769040bb1f475524f4745a5dc621f75ac7630"},
{file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:097830ed52fd9e427942ff3b9bc17fab52913b2f50f2880dc4a5611446606a54"},
{file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:044a50963a614ecfae59bb1eaf7ea7efc4bc62f49ed594e18fa1e5d953c40e9f"},
{file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:4e0b4220ba5b40d727c7f879eac379b822eee5d8fff418e9d3381ee45b3b0362"},
{file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e4f4bb20d75e9325cc9696c6802657b58bc1dbbe3022f32cc2b2b632c3fbb96"},
{file = "pydantic_core-2.27.2-cp39-cp39-win32.whl", hash = "sha256:cca63613e90d001b9f2f9a9ceb276c308bfa2a43fafb75c8031c4f66039e8c6e"},
{file = "pydantic_core-2.27.2-cp39-cp39-win_amd64.whl", hash = "sha256:77d1bca19b0f7021b3a982e6f903dcd5b2b06076def36a652e3907f596e29f67"},
{file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:2bf14caea37e91198329b828eae1618c068dfb8ef17bb33287a7ad4b61ac314e"},
{file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0cb791f5b45307caae8810c2023a184c74605ec3bcbb67d13846c28ff731ff8"},
{file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:688d3fd9fcb71f41c4c015c023d12a79d1c4c0732ec9eb35d96e3388a120dcf3"},
{file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d591580c34f4d731592f0e9fe40f9cc1b430d297eecc70b962e93c5c668f15f"},
{file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:82f986faf4e644ffc189a7f1aafc86e46ef70372bb153e7001e8afccc6e54133"},
{file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bec317a27290e2537f922639cafd54990551725fc844249e64c523301d0822fc"},
{file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:0296abcb83a797db256b773f45773da397da75a08f5fcaef41f2044adec05f50"},
{file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0d75070718e369e452075a6017fbf187f788e17ed67a3abd47fa934d001863d9"},
{file = "pydantic_core-2.27.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7e17b560be3c98a8e3aa66ce828bdebb9e9ac6ad5466fba92eb74c4c95cb1151"},
{file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c33939a82924da9ed65dab5a65d427205a73181d8098e79b6b426bdf8ad4e656"},
{file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:00bad2484fa6bda1e216e7345a798bd37c68fb2d97558edd584942aa41b7d278"},
{file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c817e2b40aba42bac6f457498dacabc568c3b7a986fc9ba7c8d9d260b71485fb"},
{file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:251136cdad0cb722e93732cb45ca5299fb56e1344a833640bf93b2803f8d1bfd"},
{file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d2088237af596f0a524d3afc39ab3b036e8adb054ee57cbb1dcf8e09da5b29cc"},
{file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d4041c0b966a84b4ae7a09832eb691a35aec90910cd2dbe7a208de59be77965b"},
{file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:8083d4e875ebe0b864ffef72a4304827015cff328a1be6e22cc850753bfb122b"},
{file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f141ee28a0ad2123b6611b6ceff018039df17f32ada8b534e6aa039545a3efb2"},
{file = "pydantic_core-2.27.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7d0c8399fcc1848491f00e0314bd59fb34a9c008761bcb422a057670c3f65e35"},
{file = "pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39"},
]
[package.dependencies]
typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pyyaml"
version = "6.0.2"
description = "YAML parser and emitter for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
{file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"},
{file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"},
{file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"},
{file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"},
{file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"},
{file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"},
{file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"},
{file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"},
{file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"},
{file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"},
{file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"},
{file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"},
{file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"},
{file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"},
{file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"},
{file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"},
{file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"},
{file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"},
{file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"},
{file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"},
{file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"},
{file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"},
{file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"},
{file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"},
{file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"},
{file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"},
{file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"},
{file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"},
{file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"},
{file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"},
{file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"},
{file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"},
{file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"},
{file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"},
{file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"},
{file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"},
{file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"},
{file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"},
{file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"},
{file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"},
{file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"},
{file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"},
{file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"},
{file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"},
{file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"},
{file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"},
{file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"},
{file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"},
{file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"},
{file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"},
{file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
]
[[package]]
name = "sniffio"
version = "1.3.1"
description = "Sniff out which async library your code is running under"
optional = false
python-versions = ">=3.7"
files = [
{file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
]
[[package]]
name = "starlette"
version = "0.46.1"
description = "The little ASGI library that shines."
optional = false
python-versions = ">=3.9"
files = [
{file = "starlette-0.46.1-py3-none-any.whl", hash = "sha256:77c74ed9d2720138b25875133f3a2dae6d854af2ec37dceb56aef370c1d8a227"},
{file = "starlette-0.46.1.tar.gz", hash = "sha256:3c88d58ee4bd1bb807c0d1acb381838afc7752f9ddaec81bbe4383611d833230"},
]
[package.dependencies]
anyio = ">=3.6.2,<5"
[package.extras]
full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"]
[[package]]
name = "tqdm"
version = "4.67.1"
description = "Fast, Extensible Progress Meter"
optional = false
python-versions = ">=3.7"
files = [
{file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"},
{file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
dev = ["nbval", "pytest (>=6)", "pytest-asyncio (>=0.24)", "pytest-cov", "pytest-timeout"]
discord = ["requests"]
notebook = ["ipywidgets (>=6)"]
slack = ["slack-sdk"]
telegram = ["requests"]
[[package]]
name = "typing-extensions"
version = "4.12.2"
description = "Backported and Experimental Type Hints for Python 3.8+"
optional = false
python-versions = ">=3.8"
files = [
{file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
]
[[package]]
name = "uvicorn"
version = "0.34.0"
description = "The lightning-fast ASGI server."
optional = false
python-versions = ">=3.9"
files = [
{file = "uvicorn-0.34.0-py3-none-any.whl", hash = "sha256:023dc038422502fa28a09c7a30bf2b6991512da7dcdb8fd35fe57cfc154126f4"},
{file = "uvicorn-0.34.0.tar.gz", hash = "sha256:404051050cd7e905de2c9a7e61790943440b3416f49cb409f965d9dcd0fa73e9"},
]
[package.dependencies]
click = ">=7.0"
h11 = ">=0.8"
typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""}
[package.extras]
standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "d005d82268b6f8c2a68b26c454bced5c34bf3c971c0cbfefde3fc0c45c675f55"

View file

@ -0,0 +1,20 @@
[tool.poetry]
name = "api-server"
version = "0.1.0"
description = ""
authors = ["Adil Hafeez <info@katanemo.com>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.10"
fastapi = "^0.115.4"
pyyaml = "^6.0.2"
uvicorn = "^0.34.0"
openai = "^1.66.5"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
# Console entry point installed with the package: running `api-server`
# imports `api_server.main` and calls the object named `app` with no arguments.
# NOTE(review): if `app` is the ASGI application object rather than a
# zero-argument launcher function, this entry point will fail at call time;
# confirm against api_server/main.py.
[tool.poetry.scripts]
api-server = "api_server.main:app"

View file

@ -0,0 +1,47 @@
#!/bin/bash
set -e
# Bring the demo up.
#
# Ensures a .env file holding OPENAI_API_KEY exists in the current directory
# (an existing file is reused untouched), starts Arch with arch_config.yaml,
# then launches the Docker Compose services in detached mode.
#
# Exits the script with status 1 when .env has to be created but
# OPENAI_API_KEY is empty or unset.
start_demo() {
  # Step 1: Check if .env file exists
  if [ -f ".env" ]; then
    echo ".env file already exists. Skipping creation."
  else
    # Step 2: Create `.env` file and set OpenAI key
    if [ -z "$OPENAI_API_KEY" ]; then
      # Diagnostics belong on stderr so they stay visible when stdout is
      # redirected or piped.
      echo "Error: OPENAI_API_KEY environment variable is not set for the demo." >&2
      exit 1
    fi

    echo "Creating .env file..."
    echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env
    echo ".env file created with OPENAI_API_KEY."
  fi

  # Step 3: Start Arch
  echo "Starting Arch with arch_config.yaml..."
  archgw up arch_config.yaml

  # Step 4: Start developer services
  echo "Starting Network Agent using Docker Compose..."
  docker compose up -d # Run in detached mode
}
# Tear the demo down.
#
# Stops the Docker Compose services and then Arch. Both steps are always
# attempted: the script runs under `set -e`, so without the `||` guards a
# failing `docker compose down` would abort before `archgw down` ran and
# leave Arch running.
#
# Returns 0 when both steps succeed, otherwise the exit status of the last
# step that failed.
stop_demo() {
  local rc=0

  # Step 1: Stop Docker Compose services
  echo "Stopping Network Agent using Docker Compose..."
  docker compose down || rc=$?

  # Step 2: Stop Arch
  echo "Stopping Arch..."
  archgw down || rc=$?

  return "$rc"
}
# Entry point: dispatch on the first command-line argument.
#   down            -> tear the demo down
#   anything else   -> bring the demo up (also the default with no argument)
case "$1" in
  down)
    stop_demo
    ;;
  *)
    start_demo
    ;;
esac

View file

@ -1,8 +1,10 @@
version: v0.1
listener:
address: 127.0.0.1
port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
overrides:
optimize_context_window: true

View file

@ -46,7 +46,7 @@ Multi-Turn RAG (Follow-up Questions)
Developers often `struggle <https://www.reddit.com/r/LocalLLaMA/comments/18mqwg6/best_practice_for_rag_with_followup_chat/>`_ to efficiently handle
``follow-up`` or ``clarification`` questions. Specifically, when users ask for changes or additions to previous responses, it requires developers to
re-write prompts using LLMs with precise prompt engineering techniques. This process is slow, manual, error-prone and adds significant latency to the
user experience. Arch
user experience.
Arch is highly capable of accurately detecting and processing prompts in multi-turn scenarios so that you can build fast and accurate RAG apps in
minutes. For additional details on how to build multi-turn RAG applications please refer to our :ref:`multi-turn <arch_multi_turn_guide>` docs.

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0 # or 127.0.0.1
port: 10000
# Defines how Arch should parse the content from application/json or text/plain Content-type in the http request
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs, keys, retry logic, failover and limits
llm_providers:
@ -51,11 +52,6 @@ prompt_targets:
default: false
enum: [true, false]
error_target:
endpoint:
name: error_target_1
path: /error
# Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
endpoints:
app_server:

View file

@ -4,7 +4,7 @@ LLM Provider
============
**LLM provider** is a top-level primitive in Arch, helping developers centrally define, secure, observe,
and manage the usage of of their LLMs. Arch builds on Envoy's reliable `cluster subsystem <https://www.envoyproxy.io/docs/envoy/v1.31.2/intro/arch_overview/upstream/cluster_manager>`_
and manage the usage of their LLMs. Arch builds on Envoy's reliable `cluster subsystem <https://www.envoyproxy.io/docs/envoy/v1.31.2/intro/arch_overview/upstream/cluster_manager>`_
to manage egress traffic to LLMs, which includes intelligent routing, retry and fail-over mechanisms,
ensuring high availability and fault tolerance. This abstraction also enables developers to seamlessly
switch between LLM providers or upgrade LLM versions, simplifying the integration and scaling of LLMs

View file

@ -3,14 +3,13 @@
Prompt Target
==============
**Prompt Targets** are a fundamental component of Arch, enabling developers to define how different types of user prompts are processed and routed within their generative AI applications.
This section provides an in-depth look at prompt targets, including their purpose, configuration, usage, and best practices to help you effectively leverage this feature in your projects.
**Prompt Targets** are a core concept in Arch, empowering developers to clearly define how user prompts are interpreted, processed, and routed within their generative AI applications. Prompts can seamlessly be routed either to specialized AI agents capable of handling sophisticated, context-driven tasks or to targeted tools provided by your application, offering users a fast, precise, and personalized experience.
This section covers the essentials of prompt targets—what they are, how to configure them, their practical uses, and recommended best practices—to help you fully utilize this feature in your applications.
What Are Prompt Targets?
------------------------
Prompt targets are predefined endpoints within Arch that handle specific types of user prompts.
They act as the bridge between user inputs and your backend services or APIs, enabling Arch to route, process, and manage prompts efficiently.
By defining prompt targets, you can separate your application's business logic from the complexities of prompt processing, ensuring a cleaner and more maintainable codebase.
Prompt targets are endpoints within Arch that handle specific types of user prompts. They act as the bridge between user inputs and your backend agents or tools (APIs), enabling Arch to route, process, and manage prompts efficiently. Defining prompt targets helps you decouple your application's core logic from processing and handling complexities, leading to clearer code organization, better scalability, and easier maintenance.
.. table::
@ -21,7 +20,7 @@ By defining prompt targets, you can separate your application's business logic f
==================== ============================================
Intent Recognition Identify the purpose of a user prompt.
Parameter Extraction Extract necessary data from the prompt.
API Invocation Call relevant backend services or functions.
Invocation Call relevant backend agents or tools (APIs).
Response Handling Process and return responses to the user.
==================== ============================================
@ -30,16 +29,15 @@ Key Features
Below are the key features of prompt targets that empower developers to build efficient, scalable, and personalized GenAI solutions:
- **Modular Design**: Define multiple prompt targets to handle diverse functionalities.
- **Parameter Management**: Specify required and optional parameters for each target.
- **Function Integration**: Seamlessly connect prompts to backend APIs or functions.
- **Design Scenarios**: Define prompt targets to effectively handle specific agentic scenarios.
- **Input Management**: Specify required and optional parameters for each target.
- **Tools Integration**: Seamlessly connect prompts to backend APIs or functions.
- **Error Handling**: Direct errors to designated handlers for streamlined troubleshooting.
- **Metadata Enrichment**: Attach additional context to prompts for enhanced processing.
Configuring Prompt Targets
--------------------------
Configuring prompt targets involves defining them in Arch's configuration file.
Each Prompt target specifies how a particular type of prompt should be handled, including the endpoint to invoke and any parameters required.
Configuring prompt targets involves defining them in Arch's configuration file. Each Prompt target specifies how a particular type of prompt should be handled, including the endpoint to invoke and any parameters required.
Basic Configuration
~~~~~~~~~~~~~~~~~~~
@ -50,37 +48,38 @@ A prompt target configuration includes the following elements:
- ``name``: A unique identifier for the prompt target.
- ``description``: A brief explanation of what the prompt target does.
- ``endpoint``: The API endpoint or function that handles the prompt.
- ``endpoint``: Required if you want to call a tool or a specific API. ``name``, ``path``, and ``http_method`` are the three attributes of the endpoint.
- ``parameters`` (Optional): A list of parameters to extract from the prompt.
.. _defining_prompt_target_parameters:
Defining Parameters
~~~~~~~~~~~~~~~~~~~
Parameters are the pieces of information that Arch needs to extract from the user's prompt to perform the desired action.
Each parameter can be marked as required or optional.
Here is a full list of parameter attributes that Arch can support:
Each parameter can be marked as required or optional. Here is a full list of parameter attributes that Arch can support:
.. table::
:width: 100%
==================== ============================================================================
======================== ============================================================================
**Attribute** **Description**
==================== ============================================================================
``name`` Specifies identifier of parameters
``type`` Specifies the data type of the parameter.
``description`` Provides a human-readable explanation of the parameter's purpose.
``required`` Indicates whether the parameter is mandatory or optional
======================== ============================================================================
``name (req.)`` Specifies name of the parameter.
``description (req.)`` Provides a human-readable explanation of the parameter's purpose.
``type (req.)`` Specifies the data type. Supported types include: **int**, **str**, **float**, **bool**, **list**, **set**, **dict**, **tuple**
``in_path`` Indicates whether the parameter is part of the path in the endpoint url. Valid values: **true** or **false**
``default`` Specifies a default value for the parameter if not provided by the user.
``items`` Used in the context of arrays to define the schema of items within an array.
``format`` Specifies a format for the parameter value, e.g., date and email
``enum`` Lists the allowable values for the parameter.
``minimum`` Defines the minimum acceptable value for numeric parameters.
``maximum`` Specifies the maximum acceptable value for numeric parameters.
==================== ============================================================================
``format`` Specifies a format for the parameter value. For example: `2019-12-31` for a date value.
``enum`` Lists the allowable values for the parameter, with data type matching the ``type`` attribute. **Usage Example**: ``enum: ["celsius", "fahrenheit"]``
``items`` Specifies the attribute of the elements when type equals **list**, **set**, **dict**, **tuple**. **Usage Example**: ``items: {"type": "str"}``
``required`` Indicates whether the parameter is mandatory or optional. Valid values: **true** or **false**
======================== ============================================================================
Example Configuration
~~~~~~~~~~~~~~~~~~~~~
Example Configuration For Tools
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: yaml
:caption: Tools and Function Calling Configuration Example
prompt_targets:
- name: get_weather
@ -99,16 +98,35 @@ Example Configuration
name: api_server
path: /weather
Example Configuration For Agents
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: yaml
:caption: Agent Orchestration Configuration Example
overrides:
use_agent_orchestrator: true
prompt_targets:
- name: sales_agent
description: handles queries related to sales and purchases
- name: issues_and_repairs
description: handles issues, repairs, or refunds
- name: escalate_to_human
description: escalates to human agent
.. note::
Today, you can use Arch to coordinate more specific agentic scenarios via tools and function calling, or use it for high-level agent routing and hand off scenarios. In the future, we plan to offer you the ability to combine these two approaches for more complex scenarios. Please see `github issues <https://github.com/katanemo/archgw/issues/442>`_ for more details.
Routing Logic
-------------
Prompt targets determine where and how user prompts are processed.
Arch uses intelligent routing logic to ensure that prompts are directed to the appropriate targets based on their intent and context.
Prompt targets determine where and how user prompts are processed. Arch uses intelligent routing logic to ensure that prompts are directed to the appropriate targets based on their intent and context.
Default Targets
~~~~~~~~~~~~~~~
For general-purpose prompts that do not match any specific prompt target, Arch routes them to a designated default target.
This is useful for handling open-ended queries like document summarization or information extraction.
For general-purpose prompts that do not match any specific prompt target, Arch routes them to a designated default target. This is useful for handling open-ended queries like document summarization or information extraction.
Intent Matching
~~~~~~~~~~~~~~~
@ -125,5 +143,5 @@ For example:
Summary
--------
Prompt targets are essential for defining how user prompts are handled within your generative AI applications using Arch.
By carefully configuring prompt targets, you can ensure that prompts are accurately routed, necessary parameters are extracted, and backend services are invoked seamlessly.
This modular approach not only simplifies your application's architecture but also enhances scalability, maintainability, and overall user experience.
By carefully configuring prompt targets, you can ensure that prompts are accurately routed, necessary parameters are extracted, and backend services are invoked seamlessly. This modular approach not only simplifies your application's architecture but also enhances scalability, maintainability, and overall user experience.

View file

@ -5,7 +5,7 @@ Listener
**Listener** is a top level primitive in Arch, which simplifies the configuration required to bind incoming
connections from downstream clients, and for egress connections to LLMs (hosted or API)
Arch builds on Envoy's Listener subsystem to streamline connection managemet for developers. Arch minimizes
Arch builds on Envoy's Listener subsystem to streamline connection management for developers. Arch minimizes
the complexity of Envoy's listener setup by using best-practices and exposing only essential settings,
making it easier for developers to bind connections without deep knowledge of Envoy's configuration model. This
simplification ensures that connections are secure, reliable, and optimized for performance.
@ -13,7 +13,7 @@ simplification ensures that connections are secure, reliable, and optimized for
Downstream (Ingress)
^^^^^^^^^^^^^^^^^^^^^^
Developers can configure Arch to accept connections from downstream clients. A downstream listener acts as the
primary entry point for incoming traffic, handling initial connection setup, including network filtering, gurdrails,
primary entry point for incoming traffic, handling initial connection setup, including network filtering, guardrails,
and additional network security checks. For more details on prompt security and safety,
see :ref:`here <arch_overview_prompt_handling>`.
@ -27,7 +27,7 @@ address like ``arch.local:12000/v1`` for outgoing traffic. For more details on L
Configure Listener
^^^^^^^^^^^^^^^^^^
To configure a Downstream (Ingress) Listner, simply add the ``listener`` directive to your configuration file:
To configure a Downstream (Ingress) Listener, simply add the ``listener`` directive to your configuration file:
.. literalinclude:: ../includes/arch_config.yaml
:language: yaml

View file

@ -5,7 +5,7 @@ Model Serving
Arch is a set of `two` self-contained processes that are designed to run alongside your application
servers (or on a separate host connected via a network). The first process is designated to manage low-level
networking and HTTP related comcerns, and the other process is for model serving, which helps Arch make
networking and HTTP related concerns, and the other process is for model serving, which helps Arch make
intelligent decisions about the incoming prompts. The model server is designed to call the purpose-built
LLMs in Arch.
@ -16,7 +16,7 @@ LLMs in Arch.
Arch is designed to be deployed in your cloud VPC, on an on-premises host, and can work on devices that don't
have a GPU. Note, GPU devices are needed for fast and cost-efficient use, so that Arch (model server, specifically)
can process prompts quickly and forward control back to the applicaton host. There are three modes in which Arch
can process prompts quickly and forward control back to the application host. There are three modes in which Arch
can be configured to run its **model server** subsystem:
Local Serving (CPU - Moderate)
@ -32,7 +32,7 @@ might not be available.
Cloud Serving (GPU - Blazing Fast)
----------------------------------
The command below instructs Arch to intelligently use GPUs locally for fast intent detection, but default to
cloud serving for function calling and guardails scenarios to dramatically improve the speed and overall performance
cloud serving for function calling and guardrails scenarios to dramatically improve the speed and overall performance
of your applications.
.. code-block:: console
@ -40,6 +40,6 @@ of your applications.
$ archgw up
.. Note::
Arch's model serving in the cloud is priced at $0.05M/token (156x cheaper than GPT-4o) with averlage latency
Arch's model serving in the cloud is priced at $0.05M/token (156x cheaper than GPT-4o) with average latency
of 200ms (10x faster than GPT-4o). Please refer to our :ref:`Get Started <quickstart>` to know
how to generate API keys for model serving

View file

@ -8,7 +8,7 @@ Arch relies on Envoy's HTTP `connection management <https://www.envoyproxy.io/do
subsystem and its **prompt handler** subsystem engineered with purpose-built LLMs to
implement critical functionality on behalf of developers so that you can stay focused on business logic.
Arch's **prompt handler** subsystem interacts with the **model subsytem** through Envoy's cluster manager system to ensure robust, resilient and fault-tolerant experience in managing incoming prompts.
Arch's **prompt handler** subsystem interacts with the **model subsystem** through Envoy's cluster manager system to ensure robust, resilient and fault-tolerant experience in managing incoming prompts.
.. seealso::
Read more about the :ref:`model subsystem <model_serving>` and how the LLMs are hosted in Arch.
@ -28,7 +28,7 @@ Prompt Guard
-----------------
Arch is engineered with `Arch-Guard <https://huggingface.co/collections/katanemo/arch-guard-6702bdc08b889e4bce8f446d>`_, an industry leading safety layer, powered by a
compact and high-performimg LLM that monitors incoming prompts to detect and reject jailbreak attempts -
compact and high-performing LLM that monitors incoming prompts to detect and reject jailbreak attempts -
ensuring that unauthorized or harmful behaviors are intercepted early in the process.
To add jailbreak guardrails, see example below:
@ -50,7 +50,7 @@ Prompt Targets
--------------
Once a prompt passes any configured guardrail checks, Arch processes the contents of the incoming conversation
and identifies where to forwad the conversation to via its ``prompt target`` primitve. Prompt targets are endpoints
and identifies where to forward the conversation to via its ``prompt target`` primitive. Prompt targets are endpoints
that receive prompts that are processed by Arch. For example, Arch enriches incoming prompts with metadata like knowing
when a user's intent has changed so that you can build faster, more accurate RAG apps.
@ -72,7 +72,7 @@ Intent Matching
Arch uses fast text embedding and intent recognition approaches to first detect the intent of each incoming prompt.
This intent matching phase analyzes the prompt's content and matches it against predefined prompt targets, ensuring that each prompt is forwarded to the most appropriate endpoint.
Archs intent matching framework considers both the name and description of each prompt target, and uses a composite matching score between embedding similarity and intent classification scores to enchance accuracy in forwarding decisions.
Arch's intent matching framework considers both the name and description of each prompt target, and uses a composite matching score between embedding similarity and intent classification scores to enhance accuracy in forwarding decisions.
- **Intent Recognition**: NLI techniques further refine the matching process by evaluating the semantic alignment between the prompt and potential targets.

View file

@ -5,7 +5,7 @@ Request Lifecycle
Below we describe the events in the lifecycle of a request passing through an Arch gateway instance. We first
describe how Arch fits into the request path and then the internal events that take place following
the arrival of a request at Arch from downtream clients. We follow the request until the corresponding
the arrival of a request at Arch from downstream clients. We follow the request until the corresponding
dispatch upstream and the response path.
.. image:: /_static/img/network-topology-ingress-egress.jpg
@ -59,7 +59,7 @@ The request processing path in Arch has three main parts:
lifecycle. The downstream and upstream HTTP/2 codec lives here.
* :ref:`Prompt handler subsystem <arch_overview_prompt_handling>` which is responsible for selecting and
forwarding prompts ``prompt_targets`` and establishes the lifecycle of any **upstream** connection to a
hosted endpoint that implements domain-specific business logic for incoming promots. This is where knowledge
hosted endpoint that implements domain-specific business logic for incoming prompts. This is where knowledge
of targets and endpoint health, load balancing and connection pooling exists.
* :ref:`Model serving subsystem <model_serving>` which helps Arch make intelligent decisions about the
incoming prompts. The model server is designed to call the purpose-built LLMs in Arch.
@ -67,7 +67,7 @@ The request processing path in Arch has three main parts:
The three subsystems are bridged with the HTTP router filter and the cluster manager subsystems of Envoy.
Also, Arch utilizes `Envoy event-based thread model <https://blog.envoyproxy.io/envoy-threading-model-a8d44b922310>`_.
A main thread is responsible forthe server lifecycle, configuration processing, stats, etc. and some number of
A main thread is responsible for the server lifecycle, configuration processing, stats, etc. and some number of
:ref:`worker threads <arch_overview_threading>` process requests. All threads operate around an event loop (`libevent <https://libevent.org/>`_)
and any given downstream TCP connection will be handled by exactly one worker thread for its lifetime. Each worker
thread maintains its own pool of TCP connections to upstream endpoints.
@ -99,7 +99,7 @@ A brief outline of the lifecycle of a request and response using the example con
that harmful or unwanted behaviors are detected early in the request processing pipeline.
3. **Intent Matching**:
The decrypted data stream is deframed by the HTTP/2 codec in Arch's HTTP connection manager. Arch performs
The decrypted data stream is de-framed by the HTTP/2 codec in Arch's HTTP connection manager. Arch performs
intent matching via its **prompt-handler** subsystem using the name and description of the defined prompt targets,
determining which endpoint should handle the prompt.
@ -162,7 +162,7 @@ Post-request processing
Once a request completes, the stream is destroyed. The following also take place:
* The post-request :ref:`monitoring <monitoring>` statistics are updated (e.g. timing, active requests, upgrades, health checks).
Some statistics are updated earlier however, during request processing. Stats are batchedand written by the main
Some statistics are updated earlier however, during request processing. Stats are batched and written by the main
thread periodically.
* :ref:`Access logs <arch_access_logging>` are written to the access log
* :ref:`Trace <arch_overview_tracing>` spans are finalized. If our example request was traced, a

View file

@ -7,12 +7,12 @@ A few definitions before we dive into the main architecture documentation. Also
to keep things consistent in logs and traces, and introduces and clarifies concepts as they relate to LLM applications.
**Agent**: An application that uses LLMs to handle wide-ranging tasks from users via prompts. This could be as simple
as retrieving or summarizing data from an API, or being able to trigger compleix actions like adjusting ad campaigns, or
as retrieving or summarizing data from an API, or being able to trigger complex actions like adjusting ad campaigns, or
changing travel plans via prompts.
**Arch Config**: Arch operates based on a configuration that controls the behavior of a single instance of the Arch gateway.
This is where you enable capabilities like LLM routing, fast function calling (via prompt_targets), applying guardrails, and enabling critical
features like metrics and tracing. For the full configuration reference of `arch_config.yaml` see :ref:`here <configuration_refernce>`.
features like metrics and tracing. For the full configuration reference of `arch_config.yaml` see :ref:`here <configuration_reference>`.
**Downstream(Ingress)**: A downstream client (web application, etc.) connects to Arch, sends prompts, and receives responses.
@ -37,11 +37,11 @@ code to LLMs.
undifferentiated work in building generative AI apps. Prompt targets are endpoints that receive prompts that are processed by Arch.
For example, Arch enriches incoming prompts with metadata like knowing when a request is a follow-up or clarifying prompt so that you
can build faster, more accurate retrieval (RAG) apps. To support agentic apps, like scheduling travel plans or sharing comments on a
document - via prompts, Arch uses its function calling abilities to extract critical information fromthe incoming prompt (or a set of
document - via prompts, Arch uses its function calling abilities to extract critical information from the incoming prompt (or a set of
prompts) needed by a downstream backend API or function call before calling it directly.
**Model Serving**: Arch is a set of `two` self-contained processes that are designed to run alongside your application servers
(or on a separate hostconnected via a network).The :ref:`model serving <model_serving>` process helps Arch make intelligent decisions
(or on a separate host connected via a network). The :ref:`model serving <model_serving>` process helps Arch make intelligent decisions
about the incoming prompts. The model server is designed to call the (fast) purpose-built LLMs in Arch.
**Error Target**: :ref:`Error targets <error_target>` are those endpoints that receive forwarded errors from Arch when issues arise,

View file

@ -15,7 +15,7 @@ from sphinxawesome_theme.postprocess import Icons
project = "Arch Docs"
copyright = "2025, Katanemo Labs, Inc"
author = "Katanemo Labs, Inc"
release = " v0.2.1"
release = " v0.2.6"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

View file

@ -3,10 +3,17 @@
Intro to Arch
=============
Arch is an intelligent `(Layer 7) <https://www.cloudflare.com/learning/ddos/what-is-layer-7/>`_ gateway designed for generative AI apps, agents, copilots that work with prompts.
Engineered with purpose-built large language models (LLMs), Arch handles all the critical but undifferentiated tasks related to the handling and processing of prompts, including
detecting and rejecting jailbreak attempts, intelligently calling “backend” APIs to fulfill the user's request represented in a prompt, routing to and offering disaster recovery
between upstream LLMs, and managing the observability of prompts and LLM interactions in a centralized way.
Arch is an intelligent proxy server designed for agentic applications. **Move faster** by letting Arch handle the **pesky heavy lifting** in building agents:
fast input clarification, agent routing, seamless integration of prompts with tools for common tasks, and unified access and observability of LLMs.
Past the thrill of an AI demo, have you found yourself hitting these walls? You know, the all too familiar ones:
- You break a prompt into specialized ones, but **get stuck writing routing** and handoff logic?
- You want to use new LLMs, but **struggle to quickly add LLMs** without writing integration logic?
- You're **trapped in tedious prompting work** to clarify inputs and user intents?
- You're **wasting cycles** choosing and integrating **code for observability** instead of it just happening transparently?
And you think to yourself, can't I move faster by focusing on higher-level objectives in a language and framework agnostic way? Well, you can!
.. figure:: /_static/img/arch_network_diagram_high_level.png
:width: 100%
@ -15,7 +22,7 @@ between upstream LLMs, and managing the observability of prompts and LLM interac
High-level network flow of where Arch Gateway sits in your agentic stack. Designed for both ingress and egress prompt traffic.
**The project was born out of the belief that:**
**Arch Gateway was built by the contributors of Envoy Proxy with the belief that:**
*Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests
including secure handling, intelligent routing, robust observability, and integration with backend (API)
@ -28,7 +35,7 @@ Arch takes a dependency on Envoy and is a self-contained process that is designe
Arch uses Envoy's HTTP connection management subsystem, HTTP L7 filtering and telemetry capabilities to extend the functionality exclusively for prompts and LLMs.
This gives Arch several advantages:
* Arch builds on Envoy's proven success. Envoy is used at masssive scale by the leading technology companies of our time including `AirBnB <https://www.airbnb.com>`_, `Dropbox <https://www.dropbox.com>`_, `Google <https://www.google.com>`_, `Reddit <https://www.reddit.com>`_, `Stripe <https://www.stripe.com>`_, etc. Its battle tested and scales linearly with usage and enables developers to focus on what really matters: application features and business logic.
* Arch builds on Envoy's proven success. Envoy is used at massive scale by the leading technology companies of our time including `AirBnB <https://www.airbnb.com>`_, `Dropbox <https://www.dropbox.com>`_, `Google <https://www.google.com>`_, `Reddit <https://www.reddit.com>`_, `Stripe <https://www.stripe.com>`_, etc. It's battle-tested and scales linearly with usage and enables developers to focus on what really matters: application features and business logic.
* Arch works with any application language. A single Arch deployment can act as gateway for AI applications written in Python, Java, C++, Go, PHP, etc.
@ -47,7 +54,7 @@ These LLMs are designed to be best-in-class for critical prompt-related tasks li
With prompt guardrails you can prevent ``jailbreak attempts`` present in user's prompts without having to write a single line of code.
To learn more about how to configure guardrails available in Arch, read :ref:`Prompt Guard <prompt_guard>`.
**Traffic Management:** Arch offers several capabilities for LLM calls originating from your applications, including smart retries on errors from upstream LLMs, and automatic cutover to other LLMs configured in Arch for continuous availability and disaster recovery scenarios.
**Traffic Management:** Arch offers several capabilities for LLM calls originating from your applications, including smart retries on errors from upstream LLMs, and automatic cut-over to other LLMs configured in Arch for continuous availability and disaster recovery scenarios.
Arch extends Envoy's `cluster subsystem <https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/upstream/cluster_manager>`_ to manage upstream connections to LLMs so that you can build resilient AI applications.
**Front/edge Gateway:** There is substantial benefit in using the same software at the edge (observability, traffic shaping algorithms, applying guardrails, etc.) as for outbound LLM inference use cases.

View file

@ -3,7 +3,11 @@
Overview
============
Welcome to Arch, the intelligent prompt gateway designed to help developers build **fast**, **secure**, and **personalized** generative AI apps at ANY scale.
Welcome to Arch, the intelligent (edge and LLM) proxy server for agentic applications.
Move **faster** by letting Arch handle the pesky heavy lifting in building agents: **fast input clarification**, **agent routing**,
seamless integration of prompts with **tools for common tasks**, and **unified access and observability of LLMs**.
In this documentation, you will learn how to quickly set up Arch to trigger API calls via prompts, apply prompt guardrails without writing any application-level logic,
simplify the interaction with upstream LLMs, and improve observability all while simplifying your application development process.

View file

@ -25,7 +25,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.
$ python -m venv venv
$ source venv/bin/activate # On Windows, use: venv\Scripts\activate
$ pip install archgw==0.2.1
$ pip install archgw==0.2.6
Build AI Agent with Arch Gateway
@ -42,11 +42,12 @@ Create ``arch_config.yaml`` file with the following content:
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o
@ -144,22 +145,23 @@ Create ``arch_config.yaml`` file with the following content:
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
egress_traffic:
address: 0.0.0.0
port: 12000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o
access_key: $OPENAI_API_KEY
provider: openai
provider_interface: openai
model: gpt-4o
default: true
- name: ministral-3b
access_key: $MISTRAL_API_KEY
provider: mistral
provider_interface: openai
model: ministral-3b-latest
Step 2. Start arch gateway

View file

@ -0,0 +1,105 @@
.. _agent_routing:
Agent Routing and Hand Off
===========================
Agent Routing and Hand Off is a key feature in Arch that enables intelligent routing of user prompts to specialized AI agents or human agents based on the nature and complexity of the user's request.
This capability significantly enhances the efficiency and personalization of interactions, ensuring each prompt receives the most appropriate and effective handling. The following section describes
the workflow, configuration, and implementation of Agent routing and hand off in Arch.
#. **Agent Selection**
When a user submits a prompt, Arch analyzes the input to determine the intent and complexity. Based on the analysis, Arch selects the most suitable agent configured within your application to handle the specific category of the user's request—such as sales inquiries, technical issues, or complex scenarios requiring human attention.
#. **Prompt Routing**
After selecting the appropriate agent, Arch routes the user's prompt to the designated agent's endpoint and waits for the agent to respond back with the processed output or further instructions.
#. **Hand Off**
Based on follow-up queries from the user, Arch repeats the process of analysis, agent selection, and routing to ensure a seamless hand off between AI agents as needed.
.. code-block:: yaml
:caption: Agent Routing and Hand Off Configuration Example
prompt_targets:
- name: sales_agent
description: Handles queries related to sales and purchases
- name: issues_and_repairs
description: handles issues, repairs, or refunds
- name: escalate_to_human
description: escalates to human agent
.. code-block:: python
:caption: Agent Routing and Hand Off Implementation Example via FastAPI
class Agent:
def __init__(self, role: str, instructions: str):
self.system_prompt = f"You are a {role}.\n{instructions}"
def handle(self, req: ChatCompletionsRequest):
messages = [{"role": "system", "content": self.get_system_prompt()}] + [
message.model_dump() for message in req.messages
]
return call_openai(messages, req.stream) #call_openai is a placeholder for the actual API call
def get_system_prompt(self) -> str:
return self.system_prompt
# Define your agents
AGENTS = {
"sales_agent": Agent(
role="sales agent",
instructions=(
"Always answer in a sentence or less.\n"
"Follow the following routine with the user:\n"
"1. Engage\n"
"2. Quote ridiculous price\n"
"3. Reveal caveat if user agrees."
),
),
"issues_and_repairs": Agent(
role="issues and repairs agent",
instructions="Propose a solution, offer refund if necessary.",
),
"escalate_to_human": Agent(
role="human escalation agent", instructions="Escalate issues to a human."
),
"unknown_agent": Agent(
role="general assistant", instructions="Assist the user in general queries."
),
}
# Handle the request from the Arch gateway.
@app.post("/v1/chat/completions")
def completion_api(req: ChatCompletionsRequest, request: Request):
    """Dispatch a chat-completions request to the agent named in its metadata.

    The agent name is read from the ``agent-name`` metadata key. A request with
    no metadata, no agent name, or a name that is not registered in ``AGENTS``
    is served by ``unknown_agent`` rather than failing on a ``None`` lookup.
    """
    # NOTE(review): assumes req.metadata is dict-like or None — confirm against
    # the ChatCompletionsRequest model.
    metadata = req.metadata or {}
    agent_name = metadata.get("agent-name", "unknown_agent")
    agent = AGENTS.get(agent_name)
    if agent is None:
        # An unregistered name would otherwise raise AttributeError on agent.handle().
        logger.warning(f"Unknown agent {agent_name!r}; falling back to unknown_agent")
        agent_name = "unknown_agent"
        agent = AGENTS[agent_name]
    logger.info(f"Routing to agent: {agent_name}")
    return agent.handle(req)
.. note::
The above example demonstrates a simple implementation of Agent Routing and Hand Off using FastAPI. For the full implementation of this example
please see our `GitHub demo <https://github.com/katanemo/archgw/tree/main/demos/use_cases/orchestrating_agents>`_.
Example Use Cases
-----------------
Agent Routing and Hand Off is particularly beneficial in scenarios such as:
- **Customer Support**: Routing common customer queries to automated support agents, while escalating complex or sensitive issues to human support staff.
- **Sales and Marketing**: Automatically directing potential leads and sales inquiries to specialized sales agents for timely and targeted follow-ups.
- **Technical Assistance**: Managing user-reported issues, repairs, or refunds by assigning them to the correct technical or support agent efficiently.
Best Practices and Tips
------------------------
When implementing Agent Routing and Hand Off in your applications, consider these best practices:
- Clearly define agent responsibilities: Ensure each agent or human endpoint has a clear, specific description of the prompts they handle, reducing mis-routing.
- Monitor and optimize routes: Regularly review how prompts are routed to adjust and optimize agent definitions and configurations.
.. note::
To observe traffic to and from agents, please read more about :ref:`observability <observability>` in Arch.
By carefully configuring and managing your Agent routing and hand off, you can significantly improve your application's responsiveness, performance, and overall user satisfaction.

View file

@ -118,6 +118,9 @@ Specify the parameters your function needs and how Arch should interpret these.
name: api_server
path: /weather
.. Note::
For a complete reference of attributes that you can configure in a prompt target, see :ref:`here <defining_prompt_target_parameters>`.
Step 3: Arch Takes Over
~~~~~~~~~~~~~~~~~~~~~~~
Once you have defined the functions and configured the prompt targets, Arch Gateway takes care of the remaining work.

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0 # or 127.0.0.1
port: 10000
# Defines how Arch should parse the content from application/json or text/plain Content-Type in the HTTP request
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs: keys, retry logic, failover, and limits
llm_providers:
@ -53,11 +54,6 @@ prompt_targets:
default: false
enum: [true, false]
error_target:
endpoint:
name: error_target_1
path: /error
# Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
endpoints:
app_server:

View file

@ -12,11 +12,11 @@ Welcome to Arch!
<p>Build <strong>fast</strong>, <strong>observable</strong>, and <strong>personalized</strong> GenAI apps</p>
</div>
<a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=light&period=daily" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
<a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=dark&period=daily&t=1742433071161" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
`Arch <https://github.com/katanemo/arch>`_ is an intelligent gateway for agents - an infrastructure primitive for GenAI (built by the contributors of `Envoy <https://www.envoyproxy.io/>`_ ). The project was born out of the belief that:
`Arch <https://github.com/katanemo/arch>`_ is an intelligent (edge and LLM) proxy, exclusively designed for prompts and agents - and built by the contributors of the widely adopted and loved `Envoy <https://www.envoyproxy.io/>`_.
*Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests including secure handling, intelligent routing, robust observability, and integration with backend (API) systems for personalization - all outside business logic.*
**Move faster** by letting Arch handle the **pesky heavy lifting** in building agents: fast input clarification, agent routing, seamless integration of prompts with tools for common tasks, and unified access and observability of LLMs - all outside business logic.
.. tab-set::
@ -50,6 +50,7 @@ Welcome to Arch!
:maxdepth: 2
guides/prompt_guard
guides/agent_routing
guides/function_calling
guides/observability/observability

View file

@ -1,9 +1,9 @@
.. _configuration_refernce:
.. _configuration_reference:
Configuration Reference
=======================
The following is a complete reference of the ``arch_conifg.yml`` that controls the behavior of a single instance of
The following is a complete reference of the ``arch_config.yml`` that controls the behavior of a single instance of
the Arch gateway. This is where you enable capabilities like routing to upstream LLM providers, defining prompt_targets
where prompts get routed to, applying guardrails, and enabling critical agent observability features.

View file

@ -1,14 +1,16 @@
version: v0.1
listeners:
prompt_gateway:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 5s
llm_gateway:
egress_traffic:
address: 0.0.0.0
port: 12000
message_format: openai
timeout: 5s
# Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
endpoints:
@ -33,14 +35,6 @@ llm_providers:
access_key: $OPENAI_API_KEY
model: gpt-4o
default: true
rate_limits:
selector: #optional headers, to add rate limiting based on http headers like JWT tokens or API keys
http_header:
name: Authorization
value: "" # Empty value means each separate value has a separate limit
limit:
tokens: 100000 # Tokens per unit
unit: minute
- name: Mistral8x7b
provider_interface: openai
@ -54,8 +48,8 @@ llm_providers:
# provides a way to override default settings for the arch system
overrides:
# By default Arch uses an NLI + embedding approach to match an incomming prompt to a prompt target.
# The intent matching threshold is kept at 0.80, you can overide this behavior if you would like
# By default Arch uses an NLI + embedding approach to match an incoming prompt to a prompt target.
# The intent matching threshold is kept at 0.80, you can override this behavior if you would like
prompt_target_intent_matching_threshold: 0.60
# default system prompt used by all prompt targets
@ -96,11 +90,6 @@ prompt_targets:
default: false
enum: [true, false]
error_target:
endpoint:
name: error_target_1
path: /error
tracing:
# Sampling rate. Note: by default, Arch uses OpenTelemetry-compatible tracing.
sampling_rate: 0.1

Some files were not shown because too many files have changed in this diff Show more