mirror of
https://github.com/katanemo/plano.git
synced 2026-06-11 15:05:14 +02:00
add precommit check (#97)
* add precommit check
* remove check
* Revert "remove check"
This reverts commit 9987b62b9b.
* fix checks
* fix whitespace errors
This commit is contained in:
parent
1e61452310
commit
4182879717
26 changed files with 292 additions and 312 deletions
27
.github/workflows/checks.yml
vendored
27
.github/workflows/checks.yml
vendored
|
|
@ -3,33 +3,6 @@ name: Checks
|
|||
on: pull_request
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Setup | Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Setup | Rust
|
||||
run: rustup toolchain install stable --profile minimal
|
||||
- name: Run Clippy on arch
|
||||
run: cd arch && cargo clippy --all-targets --all-features -- -Dwarnings
|
||||
- name: Run Clippy on public_types
|
||||
run: cd public_types && cargo clippy --all-targets --all-features -- -Dwarnings
|
||||
|
||||
format:
|
||||
name: Rustfmt
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Setup | Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Setup | Rust
|
||||
run: rustup toolchain install stable --profile minimal
|
||||
- name: Run Rustfmt on arch
|
||||
run: cd arch && cargo fmt -p intelligent-prompt-gateway -- --check
|
||||
- name: Run Rustfmt on public_types
|
||||
run: cd public_types && cargo fmt -p public_types -- --check
|
||||
|
||||
test:
|
||||
name: Test
|
||||
runs-on: ubuntu-latest
|
||||
|
|
|
|||
14
.github/workflows/pre-commit.yml
vendored
Normal file
14
.github/workflows/pre-commit.yml
vendored
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
name: pre-commit
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/setup-python@v3
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
|
|
@ -6,8 +6,8 @@ listen:
|
|||
system_prompts:
|
||||
- name: network_assistant
|
||||
content: You are a network assistant that just offers facts about the operational health of the network
|
||||
|
||||
llm_providers:
|
||||
|
||||
llm_providers:
|
||||
- name: "OpenAI"
|
||||
access_key: $OPEN_AI_KEY
|
||||
model: gpt-4o
|
||||
|
|
@ -16,13 +16,13 @@ llm_providers:
|
|||
prompt_targets:
|
||||
- name: reboot_devices
|
||||
description: >
|
||||
This prompt target handles user requests to reboot devices.
|
||||
This prompt target handles user requests to reboot devices.
|
||||
It ensures that when users request to reboot specific devices or device groups, the system processes the reboot commands accurately.
|
||||
|
||||
**Examples of user prompts:**
|
||||
|
||||
- "Please reboot device 12345."
|
||||
- "Restart all devices in tenant group tenant-XYZ
|
||||
- "Restart all devices in tenant group tenant-XYZ"
|
||||
- "I need to reboot devices A, B, and C."
|
||||
|
||||
path: /agent/device_reboot
|
||||
|
|
@ -38,4 +38,4 @@ prompt_targets:
|
|||
|
||||
prompt_endpoints:
|
||||
- "http://127.0.0.2"
|
||||
- "http://127.0.0.1"
|
||||
- "http://127.0.0.1"
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ error_target:
|
|||
name: "error_handler"
|
||||
path: "/errors"
|
||||
|
||||
tracing: 100 #sampling rate. Note by default Arch works on OpenTelemetry compatible tracing.
|
||||
tracing: 100 #sampling rate. Note by default Arch works on OpenTelemetry compatible tracing.
|
||||
|
||||
intent-detection-threshold-override: 0.60 # By default Arch uses an NLI + embedding approach to match an incoming prompt to a prompt target.
|
||||
# The intent matching threshold is kept at 0.80, you can override this behavior if you would like
|
||||
|
|
|
|||
|
|
@ -6,8 +6,8 @@ listener:
|
|||
system_prompts:
|
||||
- name: network_assistant
|
||||
content: You are a network assistant that just offers facts about the operational health of the network
|
||||
|
||||
llm_providers:
|
||||
|
||||
llm_providers:
|
||||
- name: "OpenAI"
|
||||
access_key: $OPEN_AI_KEY
|
||||
model: gpt-4o
|
||||
|
|
@ -15,8 +15,8 @@ llm_providers:
|
|||
|
||||
prompt_targets:
|
||||
- name: get_device_statistics
|
||||
description: >
|
||||
This prompt target ensures that when users request device-related statistics, the system accurately retrieves and presents the relevant data
|
||||
description: >
|
||||
This prompt target ensures that when users request device-related statistics, the system accurately retrieves and presents the relevant data
|
||||
based on the specified devices and time range. Examples of user prompts, include:
|
||||
|
||||
- "Show me the performance stats for device 12345 over the past week."
|
||||
|
|
@ -37,4 +37,4 @@ prompt_targets:
|
|||
|
||||
prompt_endpoints:
|
||||
- "http://127.0.0.2"
|
||||
- "http://127.0.0.1"
|
||||
- "http://127.0.0.1"
|
||||
|
|
|
|||
|
|
@ -10,4 +10,3 @@
|
|||
}
|
||||
]
|
||||
}
|
||||
|
||||
|
|
@ -2,4 +2,4 @@
|
|||
|
||||
body {
|
||||
font-size: 1em;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,9 +2,9 @@ Configuration Reference
|
|||
============================
|
||||
|
||||
The following is a complete reference of the ``prompt-config.yml`` that controls the behavior of a single instance of
|
||||
the Arch gateway. We've kept things simple (less than 80 lines) and held off on exposing additional functionality (for
|
||||
e.g. suppporting push observability stats, managing prompt-endpoints as virtual cluster, exposing more load balancing
|
||||
options, etc). Our belief that the simple things, should be simple. So we offert good defaults for developers, so
|
||||
the Arch gateway. We've kept things simple (less than 80 lines) and held off on exposing additional functionality (for
|
||||
e.g. supporting push observability stats, managing prompt-endpoints as virtual cluster, exposing more load balancing
|
||||
options, etc). Our belief is that the simple things should be simple. So we offer good defaults for developers, so
|
||||
that they can spend more of their time in building features unique to their AI experience.
|
||||
|
||||
.. literalinclude:: /_config/prompt-config-full-reference.yml
|
||||
|
|
|
|||
|
|
@ -4,11 +4,11 @@ Getting Started
|
|||
================
|
||||
|
||||
.. sidebar:: Pre-requisites
|
||||
|
||||
In order for you to get started, please make sure that `Docker <https://www.docker.com/get-started>`_
|
||||
|
||||
In order for you to get started, please make sure that `Docker <https://www.docker.com/get-started>`_
|
||||
and `Python <https://www.python.org/downloads/>`_ are installed locally.
|
||||
|
||||
As the examples use the pre-built `Arch Docker images <https://hub.docker.com/r/katanemo/arch>`_,
|
||||
As the examples use the pre-built `Arch Docker images <https://hub.docker.com/r/katanemo/arch>`_,
|
||||
they should work on the following architectures:
|
||||
|
||||
- x86_64
|
||||
|
|
@ -23,11 +23,11 @@ You can also build it from source.
|
|||
|
||||
Step 1: Install the Arch CLI
|
||||
----------------------------
|
||||
Arch's CLI allows you to manage and interact with the Arch gateway efficiently. To install the CLI, simply
|
||||
Arch's CLI allows you to manage and interact with the Arch gateway efficiently. To install the CLI, simply
|
||||
run the following command:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install archgw
|
||||
|
||||
This will install the archgw command-line tool globally on your system.
|
||||
|
|
@ -35,8 +35,8 @@ This will install the archgw command-line tool globally on your system.
|
|||
Step 2: Start Arch Gateway
|
||||
--------------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
archgw up --quick-start
|
||||
|
||||
Configuration
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Use Cases
|
||||
|
||||
|
||||
use_cases/rag
|
||||
use_cases/function_calling
|
||||
|
|
|
|||
|
|
@ -3,16 +3,16 @@
|
|||
Agentic (Text-to-Action) Apps
|
||||
==============================
|
||||
|
||||
Arch helps you easily personalize your applications by calling application-specific (API) functions
|
||||
via user prompts. This involves any predefined functions or APIs you want to expose to users to perform tasks,
|
||||
gather information, or manipulate data. This capability is generally referred to as **function calling**, where
|
||||
you have the flexibility to support “agentic” apps tailored to specific use cases - from updating insurance
|
||||
claims to creating ad campaigns - via prompts.
|
||||
Arch helps you easily personalize your applications by calling application-specific (API) functions
|
||||
via user prompts. This involves any predefined functions or APIs you want to expose to users to perform tasks,
|
||||
gather information, or manipulate data. This capability is generally referred to as **function calling**, where
|
||||
you have the flexibility to support “agentic” apps tailored to specific use cases - from updating insurance
|
||||
claims to creating ad campaigns - via prompts.
|
||||
|
||||
Arch analyzes prompts, extracts critical information from prompts, engages in lightweight conversation with
|
||||
Arch analyzes prompts, extracts critical information from prompts, engages in lightweight conversation with
|
||||
the user to gather any missing parameters and makes API calls so that you can focus on writing business logic.
|
||||
Arch does this via its purpose-built :ref:`Arch-FC LLM <llms_in_arch>` - the fastest (200ms p90 - 10x faser than GPT-4o)
|
||||
and cheapest (100x than GPT-40) function-calling LLM that matches performance with frontier models.
|
||||
Arch does this via its purpose-built :ref:`Arch-FC LLM <llms_in_arch>` - the fastest (200ms p90 - 10x faster than GPT-4o)
|
||||
and cheapest (100x cheaper than GPT-4o) function-calling LLM that matches performance with frontier models.
|
||||
______________________________________________________________________________________________
|
||||
|
||||
.. image:: /_static/img/function-calling-network-flow.jpg
|
||||
|
|
@ -22,7 +22,7 @@ ________________________________________________________________________________
|
|||
|
||||
Single Function Call
|
||||
--------------------
|
||||
In the most common scenario, users will request a single action via prompts, and Arch efficiently processes the
|
||||
In the most common scenario, users will request a single action via prompts, and Arch efficiently processes the
|
||||
request by extracting relevant parameters, validating the input, and calling the designated function or API. Here
|
||||
is how you would go about enabling this scenario with Arch:
|
||||
|
||||
|
|
@ -38,7 +38,7 @@ Step 1: Define prompt targets with functions
|
|||
Step 2: Process request parameters in Flask
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Once the prompt targets are configured as above, handling those parameters is
|
||||
Once the prompt targets are configured as above, handling those parameters is
|
||||
|
||||
.. literalinclude:: /_include/parameter_handling_flask.py
|
||||
:language: python
|
||||
|
|
@ -47,19 +47,19 @@ Once the prompt targets are configured as above, handling those parameters is
|
|||
|
||||
Parallel/ Multiple Function Calling
|
||||
-----------------------------------
|
||||
In more complex use cases, users may request multiple actions or need multiple APIs/functions to be called
|
||||
simultaneously or sequentially. With Arch, you can handle these scenarios efficiently using parallel or multiple
|
||||
In more complex use cases, users may request multiple actions or need multiple APIs/functions to be called
|
||||
simultaneously or sequentially. With Arch, you can handle these scenarios efficiently using parallel or multiple
|
||||
function calling. This allows your application to engage in a broader range of interactions, such as updating
|
||||
different datasets, triggering events across systems, or collecting results from multiple services in one prompt.
|
||||
|
||||
Arch-FC1B is built to manage these parallel tasks efficiently, ensuring low latency and high throughput, even
|
||||
Arch-FC1B is built to manage these parallel tasks efficiently, ensuring low latency and high throughput, even
|
||||
when multiple functions are invoked. It provides two mechanisms to handle these cases:
|
||||
|
||||
Step 1: Define Multiple Function Targets
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
When enabling multiple function calling, define the prompt targets in a way that supports multiple functions or
|
||||
API calls based on the user's prompt. These targets can be triggered in parallel or sequentially, depending on
|
||||
When enabling multiple function calling, define the prompt targets in a way that supports multiple functions or
|
||||
API calls based on the user's prompt. These targets can be triggered in parallel or sequentially, depending on
|
||||
the user's intent.
|
||||
|
||||
Example of Multiple Prompt Targets in YAML:
|
||||
|
|
@ -68,4 +68,4 @@ Example of Multiple Prompt Targets in YAML:
|
|||
:language: yaml
|
||||
:linenos:
|
||||
:emphasize-lines: 16-37
|
||||
:caption: Define prompt targets that can enable users to engage with API and backened functions of an app
|
||||
:caption: Define prompt targets that can enable users to engage with API and backend functions of an app
|
||||
|
|
|
|||
|
|
@ -3,22 +3,22 @@
|
|||
Retrieval-Augmented (RAG)
|
||||
=========================
|
||||
|
||||
The following section describes how Arch can help you build faster, smarter and more accurate
|
||||
The following section describes how Arch can help you build faster, smarter and more accurate
|
||||
Retrieval-Augmented Generation (RAG) applications.
|
||||
|
||||
Intent-drift Detection
|
||||
----------------------
|
||||
|
||||
Developers struggle to handle `follow-up <https://www.reddit.com/r/ChatGPTPromptGenius/comments/17dzmpy/how_to_use_rag_with_conversation_history_for/?>`_
|
||||
or `clarifying <https://www.reddit.com/r/LocalLLaMA/comments/18mqwg6/best_practice_for_rag_with_followup_chat/>`_
|
||||
questions. Specifically, when users ask for changes or additions to previous responses their AI applications often
|
||||
generate entirely new responses instead of adjusting previous ones. Arch offers *intent-drift* tracking as a feature so
|
||||
that developers can know when the user has shifted away from a previous intent so that they can dramatically improve
|
||||
Developers struggle to handle `follow-up <https://www.reddit.com/r/ChatGPTPromptGenius/comments/17dzmpy/how_to_use_rag_with_conversation_history_for/?>`_
|
||||
or `clarifying <https://www.reddit.com/r/LocalLLaMA/comments/18mqwg6/best_practice_for_rag_with_followup_chat/>`_
|
||||
questions. Specifically, when users ask for changes or additions to previous responses their AI applications often
|
||||
generate entirely new responses instead of adjusting previous ones. Arch offers *intent-drift* tracking as a feature so
|
||||
that developers can know when the user has shifted away from a previous intent so that they can dramatically improve
|
||||
retrieval accuracy, lower overall token cost and improve the speed of their responses back to users.
|
||||
|
||||
Arch uses its built-in lightweight NLI and embedding models to know if the user has steered away from an active intent.
|
||||
Arch uses its built-in lightweight NLI and embedding models to know if the user has steered away from an active intent.
|
||||
Arch's intent-drift detection mechanism is based on its *prompt_targets* primitive. Arch tries to match an incoming
|
||||
prompt to one of the *prompt_targets* configured in the gateway. Once it detects that the user has moved away from an active
|
||||
prompt to one of the *prompt_targets* configured in the gateway. Once it detects that the user has moved away from an active
|
||||
intent, Arch adds the ``x-arch-intent-drift`` headers to the request before sending it to your application servers.
|
||||
|
||||
.. literalinclude:: /_include/intent_detection.py
|
||||
|
|
@ -32,9 +32,9 @@ ________________________________________________________________________________
|
|||
|
||||
.. Note::
|
||||
|
||||
Arch is (mostly) stateless so that it can scale in an embarrassingly parrallel fashion. So, while Arch offers
|
||||
intent-drift detetction, you still have to maintain converational state with intent drift as meta-data. The
|
||||
following code snippets show how easily you can build and enrich conversational history with Langchain (in python),
|
||||
Arch is (mostly) stateless so that it can scale in an embarrassingly parallel fashion. So, while Arch offers
|
||||
intent-drift detection, you still have to maintain conversational state with intent drift as meta-data. The
|
||||
following code snippets show how easily you can build and enrich conversational history with Langchain (in python),
|
||||
so that you can use the most relevant prompts for your retrieval and for prompting upstream LLMs.
|
||||
|
||||
|
||||
|
|
@ -54,7 +54,7 @@ Step 2: update ConversationBufferMemory w/ intent
|
|||
:linenos:
|
||||
:lines: 22-62
|
||||
|
||||
Step 3: get Messages based on latest drift
|
||||
Step 3: get Messages based on latest drift
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. literalinclude:: /_include/intent_detection.py
|
||||
|
|
@ -63,17 +63,17 @@ Step 3: get Messages based on latest drift
|
|||
:lines: 64-76
|
||||
|
||||
|
||||
You can used the last set of messages that match to an intent to prompt an LLM, use it with an vector-DB for
|
||||
improved retrieval, etc. With Arch and a few lines of code, you can improve the retrieval accuracy, lower overall
|
||||
You can use the last set of messages that match an intent to prompt an LLM, use it with a vector-DB for
|
||||
improved retrieval, etc. With Arch and a few lines of code, you can improve the retrieval accuracy, lower overall
|
||||
token cost and dramatically improve the speed of your responses back to users.
|
||||
|
||||
Parameter Extraction for RAG
|
||||
Parameter Extraction for RAG
|
||||
----------------------------
|
||||
|
||||
To build RAG (Retrieval-Augmented Generation) applications, you can configure prompt targets with parameters,
|
||||
enabling Arch to retrieve critical information in a structured way for processing. This approach improves the
|
||||
retrieval quality and speed of your application. By extracting parameters from the conversation, you can pull
|
||||
the appropriate chunks from a vector database or SQL-like data store to enhance accuracy. With Arch, you can
|
||||
To build RAG (Retrieval-Augmented Generation) applications, you can configure prompt targets with parameters,
|
||||
enabling Arch to retrieve critical information in a structured way for processing. This approach improves the
|
||||
retrieval quality and speed of your application. By extracting parameters from the conversation, you can pull
|
||||
the appropriate chunks from a vector database or SQL-like data store to enhance accuracy. With Arch, you can
|
||||
streamline data retrieval and processing to build more efficient and precise RAG applications.
|
||||
|
||||
Step 1: Define prompt targets with parameter definitions
|
||||
|
|
@ -88,9 +88,9 @@ Step 1: Define prompt targets with parameter definitions
|
|||
Step 2: Process request parameters in Flask
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Once the prompt targets are configured as above, handling those parameters is
|
||||
Once the prompt targets are configured as above, handling those parameters is
|
||||
|
||||
.. literalinclude:: /_include/parameter_handling_flask.py
|
||||
:language: python
|
||||
:linenos:
|
||||
:caption: Flask API example for parameter extraction via HTTP request parameters
|
||||
:caption: Flask API example for parameter extraction via HTTP request parameters
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@
|
|||
Terminology
|
||||
============
|
||||
|
||||
A few definitions before we dive into the main architecture documentation. Arch borrows from Envoy's terminology
|
||||
to keep things consistent in logs, traces and in code.
|
||||
A few definitions before we dive into the main architecture documentation. Arch borrows from Envoy's terminology
|
||||
to keep things consistent in logs, traces and in code.
|
||||
|
||||
**Downstream(Ingress)**: A downstream client (web application, etc.) connects to Arch, sends prompts, and receives responses.
|
||||
|
||||
|
|
@ -15,32 +15,32 @@ to keep things consistent in logs, traces and in code.
|
|||
:align: center
|
||||
|
||||
**Listener**: A listener is a named network location (e.g., port, address, path etc.) that Arch listens on to process prompts
|
||||
before forwarding them to your application server endpoints. rch enables you to configure one listener for downstream connections
|
||||
(like port 80, 443) and creates a separate internal listener for calls that initiate from your application code to LLMs.
|
||||
before forwarding them to your application server endpoints. Arch enables you to configure one listener for downstream connections
|
||||
(like port 80, 443) and creates a separate internal listener for calls that initiate from your application code to LLMs.
|
||||
|
||||
.. Note::
|
||||
|
||||
When you start Arch, you specify a listener address/port that you want to bind downstream. But, Arch uses are predefined port
|
||||
that you can use (``127.0.0.1:10000``) to proxy egress calls originating from your application to LLMs (API-based or hosted).
|
||||
When you start Arch, you specify a listener address/port that you want to bind downstream. But, Arch uses a predefined port
|
||||
that you can use (``127.0.0.1:10000``) to proxy egress calls originating from your application to LLMs (API-based or hosted).
|
||||
For more details, check out :ref:`LLM providers <llm_providers>`
|
||||
|
||||
**Instance**: An instance of the Arch gateway. When you start Arch it creates at most two processes. One to handle Layer 7
|
||||
**Instance**: An instance of the Arch gateway. When you start Arch it creates at most two processes. One to handle Layer 7
|
||||
networking operations (auth, tls, observability, etc) and the second process to serve models that enable it to make smart
|
||||
decisions on how to accept, handle and forward prompts. The second process is optional, as the model serving sevice could be
|
||||
decisions on how to accept, handle and forward prompts. The second process is optional, as the model serving service could be
|
||||
hosted on a different network (an API call). But these two processes are considered a single instance of Arch.
|
||||
|
||||
**Prompt Targets**: Arch offers a primitive called ``prompt_targets`` to help separate business logic from undifferentiated
|
||||
work in building generative AI apps. Prompt targets are endpoints that receive prompts that are processed by Arch.
|
||||
For example, Arch enriches incoming prompts with metadata like knowing when a request is a follow-up or clarifying prompt
|
||||
so that you can build faster, more accurate retrieval (RAG) apps. To support agentic apps, like scheduling travel plans or
|
||||
**Prompt Targets**: Arch offers a primitive called ``prompt_targets`` to help separate business logic from undifferentiated
|
||||
work in building generative AI apps. Prompt targets are endpoints that receive prompts that are processed by Arch.
|
||||
For example, Arch enriches incoming prompts with metadata like knowing when a request is a follow-up or clarifying prompt
|
||||
so that you can build faster, more accurate retrieval (RAG) apps. To support agentic apps, like scheduling travel plans or
|
||||
sharing comments on a document - via prompts, Arch uses its function calling abilities to extract critical information from
|
||||
the incoming prompt (or a set of prompts) needed by a downstream backend API or function call before calling it directly.
|
||||
|
||||
**Error Targets**: Error targets are those endpoints that receive forwarded errors from Arch when issues arise,
|
||||
such as failing to properly call a function/API, detecting violations of guardrails, or encountering other processing errors.
|
||||
These errors are communicated to the application via headers (X-Arch-[ERROR-TYPE]), allowing it to handle the errors gracefully
|
||||
such as failing to properly call a function/API, detecting violations of guardrails, or encountering other processing errors.
|
||||
These errors are communicated to the application via headers (X-Arch-[ERROR-TYPE]), allowing it to handle the errors gracefully
|
||||
and take appropriate actions.
|
||||
|
||||
**Model Serving**: Arch is a set of **two** self-contained processes that are designed to run alongside your application servers
|
||||
(or on a separate hostconnected via a network).The **model serving** process helps Arch make intelligent decisions about the
|
||||
**Model Serving**: Arch is a set of **two** self-contained processes that are designed to run alongside your application servers
|
||||
(or on a separate host connected via a network). The **model serving** process helps Arch make intelligent decisions about the
|
||||
incoming prompts. The model server is designed to call the (fast) purpose-built :ref:`LLMs <llms_in_arch>` in Arch.
|
||||
|
|
|
|||
|
|
@ -8,9 +8,9 @@ Arch builds on top of Envoy's single process with multiple threads architecture.
|
|||
A single *primary* thread controls various sporadic coordination tasks while some number of *worker*
|
||||
threads perform filtering, and forwarding.
|
||||
|
||||
Once a connection is accepted, the connection spends the rest of its lifetime bound to a single worker
|
||||
thread. All the functionality around prompt handling from a downstream client is handled in a separate worker thread.
|
||||
This allows the majority of Arch to be largely single threaded (embarrassingly parallel) with a small amount
|
||||
Once a connection is accepted, the connection spends the rest of its lifetime bound to a single worker
|
||||
thread. All the functionality around prompt handling from a downstream client is handled in a separate worker thread.
|
||||
This allows the majority of Arch to be largely single threaded (embarrassingly parallel) with a small amount
|
||||
of more complex code handling coordination between the worker threads.
|
||||
|
||||
Generally Arch is written to be 100% non-blocking.
|
||||
|
|
@ -18,4 +18,4 @@ Generally Arch is written to be 100% non-blocking.
|
|||
.. tip::
|
||||
|
||||
For most workloads we recommend configuring the number of worker threads to be equal to the number of
|
||||
hardware threads on the machine.
|
||||
hardware threads on the machine.
|
||||
|
|
|
|||
|
|
@ -2,28 +2,28 @@
|
|||
|
||||
Listener
|
||||
---------
|
||||
Listener is a top level primitive in Arch, which simplifies the configuration required to bind incoming
|
||||
Listener is a top level primitive in Arch, which simplifies the configuration required to bind incoming
|
||||
connections from downstream clients, and for egress connections to LLMs (hosted or API)
|
||||
|
||||
Arch builds on Envoy's Listener subsystem to streamline connection managemet for developers. Arch minimizes
|
||||
the complexity of Envoy's listener setup by using best-practices and exposing only essential settings,
|
||||
making it easier for developers to bind connections without deep knowledge of Envoy’s configuration model. This
|
||||
Arch builds on Envoy's Listener subsystem to streamline connection management for developers. Arch minimizes
|
||||
the complexity of Envoy's listener setup by using best-practices and exposing only essential settings,
|
||||
making it easier for developers to bind connections without deep knowledge of Envoy’s configuration model. This
|
||||
simplification ensures that connections are secure, reliable, and optimized for performance.
|
||||
|
||||
Downstream (Ingress)
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
Developers can configure Arch to accept connections from downstream clients. A downstream listener acts as the
|
||||
primary entry point for incoming traffic, handling initial connection setup, including network filtering, gurdrails,
|
||||
and additional network security checks. For more details on prompt security and safety,
|
||||
Developers can configure Arch to accept connections from downstream clients. A downstream listener acts as the
|
||||
primary entry point for incoming traffic, handling initial connection setup, including network filtering, guardrails,
|
||||
and additional network security checks. For more details on prompt security and safety,
|
||||
see :ref:`here <arch_overview_prompt_handling>`
|
||||
|
||||
Upstream (Egress)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Arch automatically configures a listener to route requests from your application to upstream LLM API providers (or hosts).
|
||||
When you start Arch, it creates a listener for egress traffic based on the presence of the ``llm_providers`` configuration
|
||||
section in the ``prompt_config.yml`` file. Arch binds itself to a local address such as ``127.0.0.1:9000/v1`` or a DNS-based
|
||||
Arch automatically configures a listener to route requests from your application to upstream LLM API providers (or hosts).
|
||||
When you start Arch, it creates a listener for egress traffic based on the presence of the ``llm_providers`` configuration
|
||||
section in the ``prompt_config.yml`` file. Arch binds itself to a local address such as ``127.0.0.1:9000/v1`` or a DNS-based
|
||||
address like ``arch.local:9000/v1`` for outgoing traffic. For more details on LLM providers, read :ref:`here <llm_providers>`
|
||||
|
||||
|
||||
Configure Listener
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
|
@ -31,7 +31,7 @@ To configure a Downstream (Ingress) Listner, simply add the ``listener`` directi
|
|||
|
||||
.. literalinclude:: /_config/getting-started.yml
|
||||
:language: yaml
|
||||
:linenos:
|
||||
:linenos:
|
||||
:lines: 1-18
|
||||
:emphasize-lines: 2-5
|
||||
:caption: :download:`arch-getting-started.yml </_config/getting-started.yml>`
|
||||
:caption: :download:`arch-getting-started.yml </_config/getting-started.yml>`
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
LLM Provider
|
||||
------------
|
||||
|
||||
``llm_provider`` is a top-level primitive in Arch, helping developers centrally define, secure, observe,
|
||||
and manage the usage of of their LLMs. Arch builds on Envoy's reliable `cluster subsystem <https://www.envoyproxy.io/docs/envoy/v1.31.2/intro/arch_overview/upstream/cluster_manager>`_
|
||||
to manage egress traffic to LLMs, which includes intelligent routing, retry and fail-over mechanisms,
|
||||
ensuring high availability and fault tolerance. This abstraction also enables developers to seamlessly switching between LLM providers or upgrade LLM versions, simplifying the integration and scaling of LLMs across
|
||||
``llm_provider`` is a top-level primitive in Arch, helping developers centrally define, secure, observe,
|
||||
and manage the usage of their LLMs. Arch builds on Envoy's reliable `cluster subsystem <https://www.envoyproxy.io/docs/envoy/v1.31.2/intro/arch_overview/upstream/cluster_manager>`_
|
||||
to manage egress traffic to LLMs, which includes intelligent routing, retry and fail-over mechanisms,
|
||||
ensuring high availability and fault tolerance. This abstraction also enables developers to seamlessly switch between LLM providers or upgrade LLM versions, simplifying the integration and scaling of LLMs across
|
||||
applications.
|
||||
|
||||
|
||||
|
|
@ -20,16 +20,16 @@ Below is an example of how you can configure ``llm_providers`` with an instance
|
|||
:caption: :download:`arch-getting-started.yml </_config/getting-started.yml>`
|
||||
|
||||
.. Note::
|
||||
When you start Arch, it creates a listener port for egress traffic based on the presence of ``llm_providers``
|
||||
configuration section in the ``prompt_config.yml`` file. Arch binds itself to a local address such as
|
||||
``127.0.0.1:9000/v1`` or a DNS-based address like ``arch.local:9000/v1`` for egress traffic.
|
||||
When you start Arch, it creates a listener port for egress traffic based on the presence of ``llm_providers``
|
||||
configuration section in the ``prompt_config.yml`` file. Arch binds itself to a local address such as
|
||||
``127.0.0.1:9000/v1`` or a DNS-based address like ``arch.local:9000/v1`` for egress traffic.
|
||||
|
||||
Arch also offers vendor-agnostic SDKs and libraries to make LLM calls to API-based LLM providers (like OpenAI,
|
||||
Anthropic, Mistral, Cohere, etc.) and supports calls to OSS LLMs that are hosted on your infrastructure. Arch
|
||||
abstracts the complexities of integrating with different LLM providers, providing a unified interface for making
|
||||
calls, handling retries, managing rate limits, and ensuring seamless integration with cloud-based and on-premise
|
||||
LLMs. Simply configure the details of the LLMs your application will use, and Arch offers a unified interface to
|
||||
make outbound LLM calls.
|
||||
Arch also offers vendor-agnostic SDKs and libraries to make LLM calls to API-based LLM providers (like OpenAI,
|
||||
Anthropic, Mistral, Cohere, etc.) and supports calls to OSS LLMs that are hosted on your infrastructure. Arch
|
||||
abstracts the complexities of integrating with different LLM providers, providing a unified interface for making
|
||||
calls, handling retries, managing rate limits, and ensuring seamless integration with cloud-based and on-premise
|
||||
LLMs. Simply configure the details of the LLMs your application will use, and Arch offers a unified interface to
|
||||
make outbound LLM calls.
|
||||
|
||||
Example: Using the Arch Python SDK
|
||||
----------------------------------
|
||||
|
|
@ -49,4 +49,4 @@ Example: Using the Arch Python SDK
|
|||
response = client.completions.create(llm_provider=llm_provider, prompt=prompt)
|
||||
|
||||
# Print the response
|
||||
print("LLM Response:", response)
|
||||
print("LLM Response:", response)
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
Model Serving
|
||||
-------------
|
||||
|
||||
Arch is a set of **two** self-contained processes that are designed to run alongside your application
|
||||
servers (or on a separate host connected via a network). The first process is designated to manage low-level
|
||||
networking and HTTP related concerns, and the other process is for **model serving**, which helps Arch make
|
||||
intelligent decisions about the incoming prompts. The model server is designed to call the purpose-built
|
||||
Arch is a set of **two** self-contained processes that are designed to run alongside your application
|
||||
servers (or on a separate host connected via a network). The first process is designated to manage low-level
|
||||
networking and HTTP related concerns, and the other process is for **model serving**, which helps Arch make
|
||||
intelligent decisions about the incoming prompts. The model server is designed to call the purpose-built
|
||||
:ref:`LLMs <llms_in_arch>` in Arch.
|
||||
|
||||
.. image:: /_static/img/arch-system-architecture.jpg
|
||||
|
|
@ -15,16 +15,16 @@ intelligent decisions about the incoming prompts. The model server is designed t
|
|||
|
||||
_____________________________________________________________________________________________________________
|
||||
|
||||
Arch is designed to be deployed in your cloud VPC, on an on-premises host, and can work on devices that don't
|
||||
have a GPU. Note, GPU devices are needed for fast and cost-efficient use, so that Arch (model server, specifically)
|
||||
can process prompts quickly and forward control back to the application host. There are three modes in which Arch
|
||||
Arch is designed to be deployed in your cloud VPC, on an on-premises host, and can work on devices that don't
|
||||
have a GPU. Note, GPU devices are needed for fast and cost-efficient use, so that Arch (model server, specifically)
|
||||
can process prompts quickly and forward control back to the application host. There are three modes in which Arch
|
||||
can be configured to run its **model server** subsystem:
|
||||
|
||||
Local Serving (CPU - Moderate)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
The following bash commands enable you to configure the model server subsystem in Arch to run local on device
|
||||
and only use CPU devices. This will be the slowest option but can be useful in dev/test scenarios where GPUs
|
||||
might not be available.
|
||||
The following bash commands enable you to configure the model server subsystem in Arch to run local on device
|
||||
and only use CPU devices. This will be the slowest option but can be useful in dev/test scenarios where GPUs
|
||||
might not be available.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
|
@ -32,25 +32,25 @@ might not be available.
|
|||
|
||||
Local Serving (GPU- Fast)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
The following bash commands enable you to configure the model server subsystem in Arch to run locally on the
|
||||
The following bash commands enable you to configure the model server subsystem in Arch to run locally on the
|
||||
machine and utilize the GPU available for fast inference across all model use cases, including function calling
|
||||
guardrails, etc.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
archgw up --local
|
||||
archgw up --local
|
||||
|
||||
Cloud Serving (GPU - Blazing Fast)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
The command below instructs Arch to intelligently use GPUs locally for fast intent detection, but default to
|
||||
cloud serving for function calling and guardrails scenarios to dramatically improve the speed and overall performance
|
||||
of your applications.
|
||||
The command below instructs Arch to intelligently use GPUs locally for fast intent detection, but default to
|
||||
cloud serving for function calling and guardrails scenarios to dramatically improve the speed and overall performance
|
||||
of your applications.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
archgw up
|
||||
archgw up
|
||||
|
||||
.. Note::
|
||||
Arch's model serving in the cloud is priced at $0.05M/token (156x cheaper than GPT-4o) with average latency
|
||||
of 200ms (10x faster than GPT-4o). Please refer to our :ref:`getting started guide <getting_started>` to know
|
||||
how to generate API keys for model serving
|
||||
Arch's model serving in the cloud is priced at $0.05M/token (156x cheaper than GPT-4o) with average latency
|
||||
of 200ms (10x faster than GPT-4o). Please refer to our :ref:`getting started guide <getting_started>` to know
|
||||
how to generate API keys for model serving
|
||||
|
|
|
|||
|
|
@ -3,9 +3,9 @@
|
|||
Prompts
|
||||
-------
|
||||
|
||||
Arch's primary design point is to securely accept, process and handle prompts. To do that effectively,
|
||||
Arch relies on Envoy's HTTP `connection management <https://www.envoyproxy.io/docs/envoy/v1.31.2/intro/arch_overview/http/http_connection_management>`_,
|
||||
subsystem and its **prompt handler** subsystem engineered with purpose-built :ref:`LLMs <llms_in_arch>` to
|
||||
Arch's primary design point is to securely accept, process and handle prompts. To do that effectively,
|
||||
Arch relies on Envoy's HTTP `connection management <https://www.envoyproxy.io/docs/envoy/v1.31.2/intro/arch_overview/http/http_connection_management>`_,
|
||||
subsystem and its **prompt handler** subsystem engineered with purpose-built :ref:`LLMs <llms_in_arch>` to
|
||||
implement critical functionality on behalf of developers so that you can stay focused on business logic.
|
||||
|
||||
.. Note::
|
||||
|
|
@ -16,8 +16,8 @@ implement critical functionality on behalf of developers so that you can stay fo
|
|||
Messages
|
||||
--------
|
||||
|
||||
Arch accepts messages directly from the body of the HTTP request in a format that follows the `Hugging Face Messages API <https://huggingface.co/docs/text-generation-inference/en/messages_api>`_.
|
||||
This design allows developers to pass a list of messages, where each message is represented as a dictionary
|
||||
Arch accepts messages directly from the body of the HTTP request in a format that follows the `Hugging Face Messages API <https://huggingface.co/docs/text-generation-inference/en/messages_api>`_.
|
||||
This design allows developers to pass a list of messages, where each message is represented as a dictionary
|
||||
containing two key-value pairs:
|
||||
|
||||
- **Role**: Defines the role of the message sender, such as "user" or "assistant".
|
||||
|
|
@ -27,11 +27,11 @@ containing two key-value pairs:
|
|||
Prompt Guardrails
|
||||
-----------------
|
||||
|
||||
Arch is engineered with :ref:`Arch-Guard <llms_in_arch>`, an industry leading safety layer, powered by a
|
||||
compact and high-performing LLM that monitors incoming prompts to detect and reject jailbreak attempts -
|
||||
Arch is engineered with :ref:`Arch-Guard <llms_in_arch>`, an industry leading safety layer, powered by a
|
||||
compact and high-performing LLM that monitors incoming prompts to detect and reject jailbreak attempts -
|
||||
ensuring that unauthorized or harmful behaviors are intercepted early in the process.
|
||||
|
||||
To add jailbreak guardrails, see example below:
|
||||
To add jailbreak guardrails, see example below:
|
||||
|
||||
.. literalinclude:: /_config/getting-started.yml
|
||||
:language: yaml
|
||||
|
|
@ -41,16 +41,16 @@ To add jailbreak guardrails, see example below:
|
|||
|
||||
.. Note::
|
||||
As a roadmap item, Arch will expose the ability for developers to define custom guardrails via Arch-Guard-v2,
|
||||
and add support for additional safety checks defined by developers and hazardous categories like, violent crimes, privacy, hate,
|
||||
and add support for additional safety checks defined by developers and hazardous categories like, violent crimes, privacy, hate,
|
||||
etc. To offer feedback on our roadmap, please visit our `github page <https://github.com/orgs/katanemo/projects/1>`_
|
||||
|
||||
|
||||
Prompt Targets
|
||||
--------------
|
||||
|
||||
Once a prompt passes any configured guardrail checks, Arch processes the contents of the incoming conversation
|
||||
and identifies where to forward the conversation to via its essential ``prompt_targets`` primitive. Prompt targets
|
||||
are endpoints that receive prompts that are processed by Arch. For example, Arch enriches incoming prompts with
|
||||
Once a prompt passes any configured guardrail checks, Arch processes the contents of the incoming conversation
|
||||
and identifies where to forward the conversation to via its essential ``prompt_targets`` primitive. Prompt targets
|
||||
are endpoints that receive prompts that are processed by Arch. For example, Arch enriches incoming prompts with
|
||||
metadata like knowing when a user's intent has changed so that you can build faster, more accurate RAG apps.
|
||||
|
||||
Configuring ``prompt_targets`` is simple. See example below:
|
||||
|
|
@ -65,47 +65,47 @@ Configuring ``prompt_targets`` is simple. See example below:
|
|||
Intent Detection and Prompt Matching:
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Arch uses fast Natural Language Inference (NLI) and embedding approaches to first detect the intent of each
|
||||
incoming prompt. This intent detection phase analyzes the prompt's content and matches it against predefined
|
||||
prompt targets, ensuring that each prompt is forwarded to the most appropriate endpoint. Arch’s intent
|
||||
Arch uses fast Natural Language Inference (NLI) and embedding approaches to first detect the intent of each
|
||||
incoming prompt. This intent detection phase analyzes the prompt's content and matches it against predefined
|
||||
prompt targets, ensuring that each prompt is forwarded to the most appropriate endpoint. Arch’s intent
|
||||
detection framework considers both the name and description of each prompt target, and uses a composite matching
|
||||
score between an NLI and cosine similarity to enhance accuracy in forwarding decisions.
|
||||
|
||||
- **Embeddings**: By embedding the prompt and comparing it to known target vectors, Arch effectively identifies
|
||||
- **Embeddings**: By embedding the prompt and comparing it to known target vectors, Arch effectively identifies
|
||||
the closest match, ensuring that the prompt is handled by the correct downstream service.
|
||||
|
||||
- **NLI**: NLI techniques further refine the matching process by evaluating the semantic alignment between the
|
||||
- **NLI**: NLI techniques further refine the matching process by evaluating the semantic alignment between the
|
||||
prompt and potential targets.
|
||||
|
||||
Agentic Apps via Prompt Targets
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To support agentic apps, like scheduling travel plans or sharing comments on a document - via prompts, Arch uses
|
||||
its function calling abilities to extract critical information from the incoming prompt (or a set of prompts)
|
||||
To support agentic apps, like scheduling travel plans or sharing comments on a document - via prompts, Arch uses
|
||||
its function calling abilities to extract critical information from the incoming prompt (or a set of prompts)
|
||||
needed by a downstream backend API or function call before calling it directly. For more details on how you can
|
||||
build agentic applications using Arch, see our full guide :ref:`here <arch_function_calling_agentic_guide>`:
|
||||
|
||||
.. Note::
|
||||
Arch :ref:`Arch-FC <llms_in_arch>` is the dedicated agentic model engineered in Arch to extract information from
|
||||
a (set of) prompts and executes necessary backend API calls. This allows for efficient handling of agentic tasks,
|
||||
such as scheduling data retrieval, by dynamically interacting with backend services. Arch-FC is a flagship 1.3
|
||||
billion parameter model that matches performance with frontier models like Claude Sonnet 3.5 and GPT-4, while
|
||||
Arch :ref:`Arch-FC <llms_in_arch>` is the dedicated agentic model engineered in Arch to extract information from
|
||||
a (set of) prompts and executes necessary backend API calls. This allows for efficient handling of agentic tasks,
|
||||
such as scheduling data retrieval, by dynamically interacting with backend services. Arch-FC is a flagship 1.3
|
||||
billion parameter model that matches performance with frontier models like Claude Sonnet 3.5 and GPT-4, while
|
||||
being 100x cheaper ($0.05M/token hosted) and 10x faster (p50 latencies of 200ms).
|
||||
|
||||
Prompting LLMs
|
||||
--------------
|
||||
Arch is a single piece of software that is designed to manage both ingress and egress prompt traffic, drawing its
|
||||
distributed proxy nature from the robust `Envoy <https://envoyproxy.io>`_. This makes it extremely efficient and capable
|
||||
of handling upstream connections to LLMs. If your application is originating code to an API-based LLM, simply use
|
||||
Arch's Python or JavaScript client SDK to send traffic to the desired LLM of choice. By sending traffic through Arch,
|
||||
you can propagate traces, manage and monitor traffic, apply rate limits, and utilize a large set of traffic management
|
||||
Arch is a single piece of software that is designed to manage both ingress and egress prompt traffic, drawing its
|
||||
distributed proxy nature from the robust `Envoy <https://envoyproxy.io>`_. This makes it extremely efficient and capable
|
||||
of handling upstream connections to LLMs. If your application is originating code to an API-based LLM, simply use
|
||||
Arch's Python or JavaScript client SDK to send traffic to the desired LLM of choice. By sending traffic through Arch,
|
||||
you can propagate traces, manage and monitor traffic, apply rate limits, and utilize a large set of traffic management
|
||||
capabilities in a central place.
|
||||
|
||||
.. Attention::
|
||||
When you start Arch, it automatically creates a listener port for egress calls to upstream LLMs. This is based on the
|
||||
``llm_providers`` configuration section in the ``prompt_config.yml`` file. Arch binds itself to a local address such as
|
||||
.. Attention::
|
||||
When you start Arch, it automatically creates a listener port for egress calls to upstream LLMs. This is based on the
|
||||
``llm_providers`` configuration section in the ``prompt_config.yml`` file. Arch binds itself to a local address such as
|
||||
127.0.0.1:9000/v1 or a DNS-based address like arch.local:9000/v1 for outgoing traffic.
|
||||
|
||||
|
||||
Example: Using the Arch Python SDK
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
|
@ -129,7 +129,7 @@ Example: Using the Arch Python SDK
|
|||
Example: Using OpenAI Client with Arch as an Egress Gateway
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code-block:: python
|
||||
.. code-block:: python
|
||||
|
||||
import openai
|
||||
|
||||
|
|
@ -149,7 +149,7 @@ Example: Using OpenAI Client with Arch as an Egress Gateway
|
|||
In these examples:
|
||||
|
||||
The ArchClient is used to send traffic directly through the Arch egress proxy to the LLM of your choice, such as OpenAI.
|
||||
The OpenAI client is configured to route traffic via Arch by setting the proxy to 127.0.0.1:9000, assuming Arch is
|
||||
The OpenAI client is configured to route traffic via Arch by setting the proxy to 127.0.0.1:9000, assuming Arch is
|
||||
running locally and bound to that address and port.
|
||||
|
||||
This setup allows you to take advantage of Arch's advanced traffic management features while interacting with LLM APIs like OpenAI.
|
||||
This setup allows you to take advantage of Arch's advanced traffic management features while interacting with LLM APIs like OpenAI.
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ dispatch upstream and the response path.
|
|||
Terminology
|
||||
-----------
|
||||
|
||||
We recommend that you get familiar with some of the :ref:`terminology <arch_terminology>` used in Arch
|
||||
before reading this section.
|
||||
We recommend that you get familiar with some of the :ref:`terminology <arch_terminology>` used in Arch
|
||||
before reading this section.
|
||||
|
||||
Network topology
|
||||
----------------
|
||||
|
|
@ -25,10 +25,10 @@ How a request flows through the components in a network (including Arch) depends
|
|||
Arch can be used in a wide variety of networking topologies. We focus on the inner operation of Arch below,
|
||||
but briefly we address how Arch relates to the rest of the network in this section.
|
||||
|
||||
- **Downstream(Ingress)** listeners take requests from upstream clients like a web UI or clients that forward
|
||||
- **Downstream(Ingress)** listeners take requests from upstream clients like a web UI or clients that forward
|
||||
prompts to your local application. Responses from the application flow back through Arch to the downstream.
|
||||
|
||||
- **Upstream(Egress)** listeners take requests from the application and forward them to LLMs.
|
||||
- **Upstream(Egress)** listeners take requests from the application and forward them to LLMs.
|
||||
|
||||
.. image:: /_static/img/network-topology-ingress-egress.jpg
|
||||
:width: 100%
|
||||
|
|
@ -44,33 +44,33 @@ traverse multiple Arch gateways:
|
|||
|
||||
High level architecture
|
||||
-----------------------
|
||||
Arch is a set of **two** self-contained processes that are designed to run alongside your application servers
|
||||
(or on a separate server connected to your application servers via a network). The first process is designated
|
||||
to manage HTTP-level networking and connection management concerns (protocol management, request id generation,
|
||||
header sanitization, etc.), and the other process is for **model serving**, which helps Arch make intelligent
|
||||
decisions about the incoming prompts. The model server hosts the purpose-built :ref:`LLMs <llms_in_arch>` to
|
||||
manage several critical, but undifferentiated, prompt related tasks on behalf of developers.
|
||||
Arch is a set of **two** self-contained processes that are designed to run alongside your application servers
|
||||
(or on a separate server connected to your application servers via a network). The first process is designated
|
||||
to manage HTTP-level networking and connection management concerns (protocol management, request id generation,
|
||||
header sanitization, etc.), and the other process is for **model serving**, which helps Arch make intelligent
|
||||
decisions about the incoming prompts. The model server hosts the purpose-built :ref:`LLMs <llms_in_arch>` to
|
||||
manage several critical, but undifferentiated, prompt related tasks on behalf of developers.
|
||||
|
||||
|
||||
The request processing path in Arch has three main parts:
|
||||
|
||||
* :ref:`Listener subsystem <arch_overview_listeners>` which handles **downstream** and **upstream** request
|
||||
processing. It is responsible for managing the downstream (ingress) and the upstream (egress) request
|
||||
* :ref:`Listener subsystem <arch_overview_listeners>` which handles **downstream** and **upstream** request
|
||||
processing. It is responsible for managing the downstream (ingress) and the upstream (egress) request
|
||||
lifecycle. The downstream and upstream HTTP/2 codec lives here.
|
||||
* :ref:`Prompt handler subsystem <arch_overview_prompt_handling>` which is responsible for selecting and
|
||||
forwarding prompts to ``prompt_targets`` and establishes the lifecycle of any **upstream** connection to a
|
||||
hosted endpoint that implements domain-specific business logic for incoming prompts. This is where knowledge
|
||||
of targets and endpoint health, load balancing and connection pooling exists.
|
||||
* :ref:`Model serving subsystem <arch_model_serving>` which helps Arch make intelligent decisions about the
|
||||
forwarding prompts to ``prompt_targets`` and establishes the lifecycle of any **upstream** connection to a
|
||||
hosted endpoint that implements domain-specific business logic for incoming prompts. This is where knowledge
|
||||
of targets and endpoint health, load balancing and connection pooling exists.
|
||||
* :ref:`Model serving subsystem <arch_model_serving>` which helps Arch make intelligent decisions about the
|
||||
incoming prompts. The model server is designed to call the purpose-built :ref:`LLMs <llms_in_arch>` in Arch.
|
||||
|
||||
The three subsystems are bridged with the HTTP router filter and the cluster manager subsystems of Envoy.
|
||||
|
||||
Also, Arch utilizes `Envoy event-based thread model <https://blog.envoyproxy.io/envoy-threading-model-a8d44b922310>`_.
|
||||
A main thread is responsible for the server lifecycle, configuration processing, stats, etc. and some number of
|
||||
:ref:`worker threads <arch_overview_threading>` process requests. All threads operate around an event loop (`libevent <https://libevent.org/>`_)
|
||||
and any given downstream TCP connection will be handled by exactly one worker thread for its lifetime. Each worker
|
||||
thread maintains its own pool of TCP connections to upstream endpoints.
|
||||
A main thread is responsible for the server lifecycle, configuration processing, stats, etc. and some number of
|
||||
:ref:`worker threads <arch_overview_threading>` process requests. All threads operate around an event loop (`libevent <https://libevent.org/>`_)
|
||||
and any given downstream TCP connection will be handled by exactly one worker thread for its lifetime. Each worker
|
||||
thread maintains its own pool of TCP connections to upstream endpoints.
|
||||
|
||||
Worker threads rarely share state and operate in a trivially parallel fashion. This threading model
|
||||
enables scaling to very high core count CPUs.
|
||||
|
|
@ -92,34 +92,34 @@ Overview
|
|||
A brief outline of the life cycle of a request and response using the example configuration above:
|
||||
|
||||
1. **TCP Connection Establishment**:
|
||||
A TCP connection from downstream is accepted by an Arch listener running on a worker thread.
|
||||
The listener filter chain provides SNI and other pre-TLS information. The transport socket, typically TLS,
|
||||
A TCP connection from downstream is accepted by an Arch listener running on a worker thread.
|
||||
The listener filter chain provides SNI and other pre-TLS information. The transport socket, typically TLS,
|
||||
decrypts incoming data for processing.
|
||||
|
||||
2. **Prompt Guardrails Check**:
|
||||
Arch first checks the incoming prompts for guardrails such as jailbreak attempts. This ensures
|
||||
Arch first checks the incoming prompts for guardrails such as jailbreak attempts. This ensures
|
||||
that harmful or unwanted behaviors are detected early in the request processing pipeline.
|
||||
|
||||
3. **Intent Matching**:
|
||||
The decrypted data stream is deframed by the HTTP/2 codec in Arch's HTTP connection manager. Arch performs
|
||||
intent matching via its **prompt-handler** subsystem using the name and description of the defined prompt targets,
|
||||
The decrypted data stream is deframed by the HTTP/2 codec in Arch's HTTP connection manager. Arch performs
|
||||
intent matching via its **prompt-handler** subsystem using the name and description of the defined prompt targets,
|
||||
determining which endpoint should handle the prompt.
|
||||
|
||||
4. **Parameter Gathering with Arch-FC**:
|
||||
If a prompt target requires specific parameters, Arch engages Arch-FC to extract the necessary details
|
||||
If a prompt target requires specific parameters, Arch engages Arch-FC to extract the necessary details
|
||||
from the incoming prompt(s). This process gathers the critical information needed for downstream API calls.
|
||||
|
||||
5. **API Call Execution**:
|
||||
Arch routes the prompt to the appropriate backend API or function call. If an endpoint cluster is identified,
|
||||
load balancing is performed, circuit breakers are checked, and the request is proxied to the upstream endpoint.
|
||||
|
||||
Arch routes the prompt to the appropriate backend API or function call. If an endpoint cluster is identified,
|
||||
load balancing is performed, circuit breakers are checked, and the request is proxied to the upstream endpoint.
|
||||
|
||||
6. **Default Summarization by Upstream LLM**:
|
||||
By default, if no specific endpoint processing is needed, the prompt is sent to an upstream LLM for summarization.
|
||||
This ensures that responses are concise and relevant, enhancing user experience in RAG (Retrieval-Augmented Generation)
|
||||
By default, if no specific endpoint processing is needed, the prompt is sent to an upstream LLM for summarization.
|
||||
This ensures that responses are concise and relevant, enhancing user experience in RAG (Retrieval-Augmented Generation)
|
||||
and agentic applications.
|
||||
|
||||
7. **Error Handling and Forwarding**:
|
||||
Errors encountered during processing, such as failed function calls or guardrail detections, are forwarded to
|
||||
Errors encountered during processing, such as failed function calls or guardrail detections, are forwarded to
|
||||
designated error targets. Error details are communicated through specific headers to the application:
|
||||
|
||||
- ``X-Function-Error-Code``: Code indicating the type of function call error.
|
||||
|
|
@ -127,7 +127,7 @@ A brief outline of the life cycle of a request and response using the example co
|
|||
- Additional headers carry messages and timestamps to aid in debugging and logging.
|
||||
|
||||
8. **Response Handling**:
|
||||
The upstream endpoint’s TLS transport socket encrypts the response, which is then proxied back downstream.
|
||||
The upstream endpoint’s TLS transport socket encrypts the response, which is then proxied back downstream.
|
||||
Responses pass through HTTP filters in reverse order, ensuring any necessary processing or modification before final delivery.
|
||||
|
||||
|
||||
|
|
@ -137,29 +137,29 @@ Request Flow (Egress)
|
|||
Overview
|
||||
--------
|
||||
|
||||
A brief outline of the life cycle of a request and response in the context of egress traffic from an application
|
||||
A brief outline of the life cycle of a request and response in the context of egress traffic from an application
|
||||
to Large Language Models (LLMs) via Arch:
|
||||
|
||||
1. **HTTP Connection Establishment to LLM**:
|
||||
Arch initiates an HTTP connection to the upstream LLM service. This connection is handled by Arch’s egress listener
|
||||
running on a worker thread. The connection typically uses a secure transport protocol such as HTTPS, ensuring the
|
||||
1. **HTTP Connection Establishment to LLM**:
|
||||
Arch initiates an HTTP connection to the upstream LLM service. This connection is handled by Arch’s egress listener
|
||||
running on a worker thread. The connection typically uses a secure transport protocol such as HTTPS, ensuring the
|
||||
prompt data is encrypted before being sent to the LLM service.
|
||||
|
||||
2. **Rate Limiting**:
|
||||
Before sending the request to the LLM, Arch applies rate-limiting policies to ensure that the upstream LLM service
|
||||
is not overwhelmed by excessive traffic. Rate limits are enforced per client or service, ensuring fair usage and
|
||||
preventing accidental or malicious overload. If the rate limit is exceeded, Arch may return an appropriate HTTP
|
||||
2. **Rate Limiting**:
|
||||
Before sending the request to the LLM, Arch applies rate-limiting policies to ensure that the upstream LLM service
|
||||
is not overwhelmed by excessive traffic. Rate limits are enforced per client or service, ensuring fair usage and
|
||||
preventing accidental or malicious overload. If the rate limit is exceeded, Arch may return an appropriate HTTP
|
||||
error (e.g., 429 Too Many Requests) without sending the prompt to the LLM.
|
||||
|
||||
3. **Load Balancing to (hosted) LLM Endpoints**:
|
||||
After passing the rate-limiting checks, Arch routes the prompt to the appropriate LLM endpoint.
|
||||
If multiple LLM provider instances are available, load balancing is performed to distribute traffic evenly
|
||||
across the instances. Arch checks the health of the LLM endpoints using circuit breakers and health checks,
|
||||
3. **Load Balancing to (hosted) LLM Endpoints**:
|
||||
After passing the rate-limiting checks, Arch routes the prompt to the appropriate LLM endpoint.
|
||||
If multiple LLM provider instances are available, load balancing is performed to distribute traffic evenly
|
||||
across the instances. Arch checks the health of the LLM endpoints using circuit breakers and health checks,
|
||||
ensuring that the prompt is only routed to a healthy, responsive instance.
|
||||
|
||||
4. **Response Reception and Forwarding**:
|
||||
Once the LLM processes the prompt, Arch receives the response from the LLM service. The response is typically a
|
||||
generated text, completion, or summarization. Upon reception, Arch decrypts (if necessary) and handles the response,
|
||||
4. **Response Reception and Forwarding**:
|
||||
Once the LLM processes the prompt, Arch receives the response from the LLM service. The response is typically a
|
||||
generated text, completion, or summarization. Upon reception, Arch decrypts (if necessary) and handles the response,
|
||||
passing it through any egress processing pipeline defined by the application, such as logging or additional response filtering.
|
||||
|
||||
|
||||
|
|
@ -167,10 +167,10 @@ Post-request processing
|
|||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Once a request completes, the stream is destroyed. The following also takes place:
|
||||
|
||||
* The post-request :ref:`monitoring <monitoring>` statistics are updated (e.g. timing, active requests, upgrades, health checks).
|
||||
Some statistics are updated earlier however, during request processing. Stats are batched and written by the main
|
||||
* The post-request :ref:`monitoring <monitoring>` statistics are updated (e.g. timing, active requests, upgrades, health checks).
|
||||
Some statistics are updated earlier however, during request processing. Stats are batched and written by the main
|
||||
thread periodically.
|
||||
* :ref:`Access logs <arch_access_logging>` are written to the access log
|
||||
* :ref:`Trace <arch_overview_tracing>` spans are finalized. If our example request was traced, a
|
||||
trace span, describing the duration and details of the request would be created by the HCM when
|
||||
processing request headers and then finalized by the HCM during post-request processing.
|
||||
processing request headers and then finalized by the HCM during post-request processing.
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ in a centralized way.
|
|||
|
||||
**The project was born out of the belief that:**
|
||||
|
||||
*Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests
|
||||
*Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests
|
||||
including secure handling, intelligent routing, robust observability, and integration with backend (API)
|
||||
systems for personalization - all outside business logic.*
|
||||
|
||||
|
|
@ -39,10 +39,10 @@ functionality exclusively for prompts and LLMs. This gives Arch several advantag
|
|||
* Arch works with any application language. A single Arch deployment can act as gateway for AI applications
|
||||
written in Python, Java, C++, Go, Php, etc.
|
||||
|
||||
* Arch can be deployed and upgraded quickly across your infrastructure transparently without the horrid pain
|
||||
* Arch can be deployed and upgraded quickly across your infrastructure transparently without the horrid pain
|
||||
of deploying library upgrades in your applications.
|
||||
|
||||
**Engineered with Fast LLMs:** Arch is engineered with specialized (sub-billion) LLMs that are designed for
|
||||
**Engineered with Fast LLMs:** Arch is engineered with specialized (sub-billion) LLMs that are designed for
|
||||
fast, cost-effective and accurate handling of prompts. These :ref:`LLMs <llms_in_arch>` are designed to be
|
||||
best-in-class for critical prompt-related tasks like:
|
||||
|
||||
|
|
@ -51,7 +51,7 @@ best-in-class for critcal prompt-related tasks like:
|
|||
you want to expose to users to perform tasks, gather information, or manipulate data. With function calling,
|
||||
you have flexibility to support "agentic" experiences tailored to specific use cases - from updating insurance
|
||||
claims to creating ad campaigns - via prompts. Arch analyzes prompts, extracts critical information from
|
||||
prompts, engages in lightweight conversation to gather any missing parameters and makes API calls so that you can
|
||||
prompts, engages in lightweight conversation to gather any missing parameters and makes API calls so that you can
|
||||
focus on writing business logic. For more details, read :ref:`prompt processing <arch_overview_prompt_handling>`.
|
||||
|
||||
* **Prompt Guardrails:** Arch helps you improve the safety of your application by applying prompt guardrails in
|
||||
|
|
@ -83,8 +83,8 @@ critical aspects of your application: latency, token usage, and error rates by a
|
|||
measures the speed at which your application is responding to users, which includes metrics like time to first
|
||||
token (TFT), time per output token (TOT) metrics, and the total latency as perceived by users.
|
||||
|
||||
**End-to-End Tracing:** Arch propagates trace context using the W3C Trace Context standard, specifically through
|
||||
the ``traceparent`` header. This allows each component in the system to record its part of the request flow,
|
||||
**End-to-End Tracing:** Arch propagates trace context using the W3C Trace Context standard, specifically through
|
||||
the ``traceparent`` header. This allows each component in the system to record its part of the request flow,
|
||||
enabling **end-to-end tracing** across the entire application. By using OpenTelemetry, Arch ensures that
|
||||
developers can capture this trace data consistently and in a format compatible with various observability tools.
|
||||
For more details, read :ref:`tracing <arch_overview_tracing>`.
|
||||
For more details, read :ref:`tracing <arch_overview_tracing>`.
|
||||
|
|
|
|||
|
|
@ -3,19 +3,19 @@
|
|||
LLMs
|
||||
====
|
||||
|
||||
Arch utilizes purpose-built, industry leading, LLMs to handle the crufty and undifferentiated work around
|
||||
Arch utilizes purpose-built, industry leading, LLMs to handle the crufty and undifferentiated work around
|
||||
accepting, handling and processing prompts. The following sections talk about some of the core models that
|
||||
are built into Arch.
|
||||
are built into Arch.
|
||||
|
||||
Arch-Guard-v1
|
||||
-------------
|
||||
LLM-powered applications are susceptible to prompt attacks, which are prompts intentionally designed to
|
||||
subvert the developer’s intended behavior of the LLM. Arch-Guard-v1 is a classifier model trained on a large
|
||||
corpus of attacks, capable of detecting explicitly malicious prompts (and toxicity).
|
||||
LLM-powered applications are susceptible to prompt attacks, which are prompts intentionally designed to
|
||||
subvert the developer’s intended behavior of the LLM. Arch-Guard-v1 is a classifier model trained on a large
|
||||
corpus of attacks, capable of detecting explicitly malicious prompts (and toxicity).
|
||||
|
||||
The model is useful as a starting point for identifying and guardrailing against the most risky realistic
|
||||
inputs to LLM-powered applications. Our goal in embedding Arch-Guard in the Arch gateway is to enable developers
|
||||
to focus on their business logic and factor out security and safety outside application logic. With Arch-Guard-v1
|
||||
The model is useful as a starting point for identifying and guardrailing against the most risky realistic
|
||||
inputs to LLM-powered applications. Our goal in embedding Arch-Guard in the Arch gateway is to enable developers
|
||||
to focus on their business logic and factor out security and safety outside application logic. With Arch-Guard-v1
|
||||
developers can take steps to significantly reduce prompt attack risk while maintaining control over the user experience.
|
||||
|
||||
Below are our test results of the strength of our model as compared to Prompt-Guard from `Meta Llama <https://huggingface.co/meta-llama/Prompt-Guard-86M>`_.
|
||||
|
|
@ -140,24 +140,20 @@ Below is our test results of the strength of our model as compared to Prompt-Gua
|
|||
Arch-FC
|
||||
-------
|
||||
Arch-FC is a lean, powerful and cost-effective agentic model designed for function calling scenarios.
|
||||
You can run Arch-FC locally, or use the cloud-hosted version for as little as $0.05/M token (100x cheaper
|
||||
You can run Arch-FC locally, or use the cloud-hosted version for as little as $0.05/M token (100x cheaper
|
||||
than GPT-4o), with a p50 latency of 200ms (5x faster than GPT-4o), while meeting frontier model performance.
|
||||
|
||||
.. Note::
|
||||
Function calling helps you personalize the GenAI experience by calling application-specific operations via
|
||||
prompts. This involves any predefined functions or APIs you want to expose to perform tasks, gather
|
||||
information, or manipulate data - via prompts.
|
||||
Function calling helps you personalize the GenAI experience by calling application-specific operations via
|
||||
prompts. This involves any predefined functions or APIs you want to expose to perform tasks, gather
|
||||
information, or manipulate data - via prompts.
|
||||
|
||||
You can get started with function calling simply by configuring a prompt target with a name, description
|
||||
You can get started with function calling simply by configuring a prompt target with a name, description
|
||||
and set of parameters needed by a specific backend function or a hosted API. The name, and description helps
|
||||
Arch-FC match a user prompt to a function or API that can process it.
|
||||
|
||||
By using Arch-FC, Arch enables you to easily build agentic workflows tailored to domain-specific use cases -
|
||||
from updating insurance claims to creating ad campaigns. Arch-FC analyzes prompts, extracts critical information
|
||||
By using Arch-FC, Arch enables you to easily build agentic workflows tailored to domain-specific use cases -
|
||||
from updating insurance claims to creating ad campaigns. Arch-FC analyzes prompts, extracts critical information
|
||||
from prompts, engages in lightweight conversations with the user to gather any missing parameters needed before
|
||||
handing control back to Arch to make the API call to your hosted backend. Arch-FC handles the muck of information
|
||||
extraction so that you can focus on the business logic of your application.
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -3,16 +3,16 @@
|
|||
Access Logging
|
||||
==============
|
||||
|
||||
Access logging in Arch refers to the logging of detailed information about each request and response that flows through Arch.
|
||||
It provides visibility into the traffic passing through Arch, which is crucial for monitoring, debugging, and analyzing the
|
||||
Access logging in Arch refers to the logging of detailed information about each request and response that flows through Arch.
|
||||
It provides visibility into the traffic passing through Arch, which is crucial for monitoring, debugging, and analyzing the
|
||||
behavior of AI applications and their interactions.
|
||||
|
||||
Key Features of Access Logging in Arch:
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
* **Per-Request Logging**:
|
||||
Each request that passes through Arch is logged. This includes important metadata such as HTTP method,
|
||||
path, response status code, request duration, upstream host, and more.
|
||||
* **Integration with Monitoring Tools**:
|
||||
* **Per-Request Logging**:
|
||||
Each request that passes through Arch is logged. This includes important metadata such as HTTP method,
|
||||
path, response status code, request duration, upstream host, and more.
|
||||
* **Integration with Monitoring Tools**:
|
||||
Access logs can be exported to centralized logging systems (e.g., ELK stack or Fluentd) or used to feed monitoring and alerting systems.
|
||||
* **Structured Logging**: where each request is logged as an object, making it easier to parse and analyze using tools like Elasticsearch and Kibana.
|
||||
|
||||
|
|
@ -20,4 +20,4 @@ Key Features of Access Logging in Arch:
|
|||
|
||||
[2024-09-27T14:52:01.123Z] "ARCH REQUEST" GET /path/to/resource HTTP/1.1 200 512 1024 56 upstream_service.com D
|
||||
X-Arch-Upstream-Service-Time: 25
|
||||
X-Arch-Attempt-Count: 1
|
||||
X-Arch-Attempt-Count: 1
|
||||
|
|
|
|||
|
|
@ -8,4 +8,4 @@ Observability
|
|||
|
||||
tracing
|
||||
stats
|
||||
access_logs
|
||||
access_logs
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
Monitoring
|
||||
==========
|
||||
|
||||
Arch offers several monitoring metrics that help you understand three critical aspects of your application:
|
||||
latency, token usage, and error rates by an upstream LLM provider. Latency measures the speed at which your
|
||||
application is responding to users, which includes metrics like time to first token (TFT), time per output
|
||||
token (TOT) metrics, and the total latency as perceived by users.
|
||||
Arch offers several monitoring metrics that help you understand three critical aspects of your application:
|
||||
latency, token usage, and error rates by an upstream LLM provider. Latency measures the speed at which your
|
||||
application is responding to users, which includes metrics like time to first token (TFT), time per output
|
||||
token (TOT) metrics, and the total latency as perceived by users.
|
||||
|
|
|
|||
|
|
@ -1,35 +1,35 @@
|
|||
.. _arch_overview_tracing:
|
||||
|
||||
Tracing
|
||||
Tracing
|
||||
=======
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
`OpenTelemetry <https://opentelemetry.io/>`_ is an open-source observability framework providing APIs
|
||||
and instrumentation for generating, collecting, processing, and exporting telemetry data, such as traces,
|
||||
metrics, and logs. Its flexible design supports a wide range of backends and seamlessly integrates with
|
||||
modern application tools. A key feature of OpenTelemetry is its commitment to standards like the
|
||||
`OpenTelemetry <https://opentelemetry.io/>`_ is an open-source observability framework providing APIs
|
||||
and instrumentation for generating, collecting, processing, and exporting telemetry data, such as traces,
|
||||
metrics, and logs. Its flexible design supports a wide range of backends and seamlessly integrates with
|
||||
modern application tools. A key feature of OpenTelemetry is its commitment to standards like the
|
||||
`W3C Trace Context <https://www.w3.org/TR/trace-context/>`_
|
||||
|
||||
**Tracing** is a critical tool that allows developers to visualize and understand the flow of
|
||||
requests in an AI application. With tracing, you can capture a detailed view of how requests propagate
|
||||
through various services and components, which is crucial for **debugging**, **performance optimization**,
|
||||
**Tracing** is a critical tool that allows developers to visualize and understand the flow of
|
||||
requests in an AI application. With tracing, you can capture a detailed view of how requests propagate
|
||||
through various services and components, which is crucial for **debugging**, **performance optimization**,
|
||||
and understanding complex AI agent architectures like Co-pilots.
|
||||
|
||||
**Arch** propagates trace context using the W3C Trace Context standard, specifically through the
|
||||
``traceparent`` header. This allows each component in the system to record its part of the request
|
||||
flow, enabling **end-to-end tracing** across the entire application. By using OpenTelemetry, Arch ensures
|
||||
that developers can capture this trace data consistently and in a format compatible with various observability
|
||||
**Arch** propagates trace context using the W3C Trace Context standard, specifically through the
|
||||
``traceparent`` header. This allows each component in the system to record its part of the request
|
||||
flow, enabling **end-to-end tracing** across the entire application. By using OpenTelemetry, Arch ensures
|
||||
that developers can capture this trace data consistently and in a format compatible with various observability
|
||||
tools.
|
||||
______________________________________________________________________________________________
|
||||
|
||||
Benefits of using ``traceparent`` headers
|
||||
Benefits of using ``traceparent`` headers
|
||||
-----------------------------------------
|
||||
|
||||
- **Standardization**: The W3C Trace Context standard ensures compatibility across ecosystem tools, allowing
|
||||
- **Standardization**: The W3C Trace Context standard ensures compatibility across ecosystem tools, allowing
|
||||
traces to be propagated uniformly through different layers of the system.
|
||||
- **Ease of Integration**: OpenTelemetry's design allows developers to easily integrate tracing with minimal
|
||||
- **Ease of Integration**: OpenTelemetry's design allows developers to easily integrate tracing with minimal
|
||||
changes to their codebase, enabling quick adoption of end-to-end observability.
|
||||
- **Interoperability**: Works seamlessly with popular tracing tools like AWS X-Ray, Datadog, Jaeger, and many others,
|
||||
making it easy to visualize traces in the tools you're already using.
|
||||
|
|
@ -46,15 +46,15 @@ How to initiate a trace
|
|||
- Start a new span representing its processing of the request.
|
||||
- Forward the ``traceparent`` header to downstream services.
|
||||
|
||||
3. **Sampling Policy**: The 100 in ``tracing: 100`` means that all the requests are sampled for tracing.
|
||||
3. **Sampling Policy**: The 100 in ``tracing: 100`` means that all the requests are sampled for tracing.
|
||||
You can adjust this value from 0-100.
|
||||
|
||||
|
||||
Trace Propagation
|
||||
-----------------
|
||||
|
||||
Arch uses the W3C Trace Context standard for trace propagation, which relies on the ``traceparent`` header.
|
||||
This header carries tracing information in a standardized format, enabling interoperability between different
|
||||
Arch uses the W3C Trace Context standard for trace propagation, which relies on the ``traceparent`` header.
|
||||
This header carries tracing information in a standardized format, enabling interoperability between different
|
||||
tracing systems.
|
||||
|
||||
Header Format
|
||||
|
|
@ -73,7 +73,7 @@ Instrumentation
|
|||
~~~~~~~~~~~~~~~
|
||||
|
||||
To integrate AI tracing, your application needs to follow a few simple steps. The steps
|
||||
below are very common practice, and not unique to Arch, when you read tracing headers and export
|
||||
below are very common practice, and not unique to Arch, when you read tracing headers and export
|
||||
`spans <https://docs.lightstep.com/docs/understand-distributed-tracing>`_ for distributed tracing.
|
||||
|
||||
- Read the ``traceparent`` header from incoming requests.
|
||||
|
|
@ -147,14 +147,14 @@ Handle incoming requests:
|
|||
AI Agent Tracing Visualization Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The following is an example of tracing for an AI-powered customer support system.
|
||||
A customer interacts with AI agents, which forward their requests through different
|
||||
The following is an example of tracing for an AI-powered customer support system.
|
||||
A customer interacts with AI agents, which forward their requests through different
|
||||
specialized services and external systems.
|
||||
|
||||
::
|
||||
|
||||
+--------------------------+
|
||||
| Customer Interaction |
|
||||
| Customer Interaction |
|
||||
+--------------------------+
|
||||
|
|
||||
v
|
||||
|
|
@ -179,17 +179,17 @@ Trace Breakdown:
|
|||
- Span 1: Customer initiates a request via the AI-powered chatbot for billing support (e.g., asking for payment details).
|
||||
|
||||
- AI Agent 1 (Main - Arch):
|
||||
- Span 2: AI Agent 1 (Main) processes the request and identifies it as related to billing, forwarding the request
|
||||
- Span 2: AI Agent 1 (Main) processes the request and identifies it as related to billing, forwarding the request
|
||||
to an external payment service.
|
||||
- Span 3: AI Agent 1 determines that additional technical support is needed for processing and forwards the request
|
||||
- Span 3: AI Agent 1 determines that additional technical support is needed for processing and forwards the request
|
||||
to AI Agent 2.
|
||||
|
||||
- External Payment Service:
|
||||
- Span 4: The external payment service processes the payment-related request (e.g., verifying payment status) and sends
|
||||
- Span 4: The external payment service processes the payment-related request (e.g., verifying payment status) and sends
|
||||
the response back to AI Agent 1.
|
||||
|
||||
- AI Agent 2 (Tech - Arch):
|
||||
- Span 5: AI Agent 2, responsible for technical queries, processes a request forwarded from AI Agent 1 (e.g., checking for
|
||||
- Span 5: AI Agent 2, responsible for technical queries, processes a request forwarded from AI Agent 1 (e.g., checking for
|
||||
any account issues).
|
||||
- Span 6: AI Agent 2 forwards the query to Internal Tech Support for further investigation.
|
||||
|
||||
|
|
@ -197,7 +197,7 @@ Trace Breakdown:
|
|||
- Span 7: Internal Tech Support processes the request (e.g., resolving account access issues) and responds to AI Agent 2.
|
||||
|
||||
- AI Agent 3 (Orders - Arch):
|
||||
- Span 8: AI Agent 3 handles order-related queries. AI Agent 1 forwards the request to AI Agent 3 after payment verification
|
||||
- Span 8: AI Agent 3 handles order-related queries. AI Agent 1 forwards the request to AI Agent 3 after payment verification
|
||||
is completed.
|
||||
- Span 9: AI Agent 3 forwards a request to the Inventory Management system to confirm product availability for a pending order.
|
||||
|
||||
|
|
@ -297,8 +297,8 @@ Best Practices
|
|||
Conclusion
|
||||
----------
|
||||
|
||||
By leveraging the ``traceparent`` header for trace context propagation, Arch enables developers to implement
|
||||
tracing efficiently. This approach simplifies the process of collecting and analyzing tracing data in common
|
||||
By leveraging the ``traceparent`` header for trace context propagation, Arch enables developers to implement
|
||||
tracing efficiently. This approach simplifies the process of collecting and analyzing tracing data in common
|
||||
tools like AWS X-Ray and Datadog, enhancing observability and facilitating faster debugging and optimization.
|
||||
|
||||
Additional Resources
|
||||
|
|
@ -311,5 +311,3 @@ Additional Resources
|
|||
|
||||
.. Note::
|
||||
Replace placeholders like ``your-aws-region``, and ``DD_API_KEY`` with your actual configurations.
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -7,13 +7,13 @@ Documentation
|
|||
|
||||
**Arch is built on (and by the core contributors of) Envoy proxy with the belief that:**
|
||||
|
||||
*Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests
|
||||
*Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests
|
||||
including secure handling, intelligent routing, robust observability, and integration with backend (API)
|
||||
systems for personalization - all outside business logic.*
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
|
||||
intro/intro
|
||||
getting_started/getting_started
|
||||
getting_started/use_cases
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue