mirror of
https://github.com/katanemo/plano.git
synced 2026-05-03 04:42:49 +02:00
668 lines
No EOL
36 KiB
HTML
668 lines
No EOL
36 KiB
HTML
|
||
<!DOCTYPE html>
|
||
|
||
|
||
<html lang="en" data-content_root="../" >
|
||
|
||
<head>
|
||
<meta charset="utf-8" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||
|
||
<title>Life of a Request — Arch 0.1-beta documentation</title>
|
||
|
||
|
||
|
||
<script data-cfasync="false">
|
||
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
|
||
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
|
||
</script>
|
||
|
||
<!-- Loaded before other Sphinx assets -->
|
||
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
|
||
|
||
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
|
||
|
||
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=a746c00c" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=a3416100" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
|
||
|
||
<!-- Pre-loaded scripts that we'll load fully later -->
|
||
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
|
||
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
|
||
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
|
||
<script src="../_static/documentation_options.js?v=2742c0eb"></script>
|
||
<script src="../_static/doctools.js?v=9a2dae69"></script>
|
||
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
|
||
<script src="../_static/copybutton.js?v=f281be69"></script>
|
||
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
|
||
<script>DOCUMENTATION_OPTIONS.pagename = 'intro/life_of_a_request';</script>
|
||
<link rel="icon" href="../_static/favicon.ico"/>
|
||
<link rel="index" title="Index" href="../genindex.html" />
|
||
<link rel="search" title="Search" href="../search.html" />
|
||
<link rel="next" title="Getting help" href="getting_help.html" />
|
||
<link rel="prev" title="Model Serving" href="architecture/model_serving/model_serving.html" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||
<meta name="docsearch:language" content="en"/>
|
||
</head>
|
||
|
||
|
||
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
|
||
|
||
|
||
|
||
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
|
||
|
||
<div id="pst-scroll-pixel-helper"></div>
|
||
|
||
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
|
||
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
|
||
|
||
|
||
<input type="checkbox"
|
||
class="sidebar-toggle"
|
||
id="pst-primary-sidebar-checkbox"/>
|
||
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
|
||
|
||
<input type="checkbox"
|
||
class="sidebar-toggle"
|
||
id="pst-secondary-sidebar-checkbox"/>
|
||
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
|
||
|
||
<div class="search-button__wrapper">
|
||
<div class="search-button__overlay"></div>
|
||
<div class="search-button__search-container">
|
||
<form class="bd-search d-flex align-items-center"
|
||
action="../search.html"
|
||
method="get">
|
||
<i class="fa-solid fa-magnifying-glass"></i>
|
||
<input type="search"
|
||
class="form-control"
|
||
name="q"
|
||
id="search-input"
|
||
placeholder="Search..."
|
||
aria-label="Search..."
|
||
autocomplete="off"
|
||
autocorrect="off"
|
||
autocapitalize="off"
|
||
spellcheck="false"/>
|
||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
|
||
</form></div>
|
||
</div>
|
||
|
||
<div class="pst-async-banner-revealer d-none">
|
||
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
|
||
</div>
|
||
|
||
|
||
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
|
||
</header>
|
||
|
||
|
||
<div class="bd-container">
|
||
<div class="bd-container__inner bd-page-width">
|
||
|
||
|
||
|
||
<div class="bd-sidebar-primary bd-sidebar">
|
||
|
||
|
||
|
||
<div class="sidebar-header-items sidebar-primary__section">
|
||
|
||
|
||
|
||
|
||
</div>
|
||
|
||
<div class="sidebar-primary-items__start sidebar-primary__section">
|
||
<div class="sidebar-primary-item">
|
||
|
||
|
||
|
||
|
||
|
||
<a class="navbar-brand logo" href="../root.html">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<img src="../_static/arch-nav-logo.png" class="logo__image only-light" alt="Arch 0.1-beta documentation - Home"/>
|
||
<script>document.write(`<img src="../_static/arch-nav-logo.png" class="logo__image only-dark" alt="Arch 0.1-beta documentation - Home"/>`);</script>
|
||
|
||
|
||
</a></div>
|
||
<div class="sidebar-primary-item">
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="fa-solid fa-magnifying-glass"></i>
|
||
<span class="search-button__default-text">Search</span>
|
||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
|
||
</button>
|
||
`);
|
||
</script></div>
|
||
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
|
||
<div class="bd-toc-item navbar-nav active">
|
||
<ul class="current nav bd-sidenav">
|
||
<li class="toctree-l1 current active has-children"><a class="reference internal" href="intro.html">Introduction</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
|
||
<li class="toctree-l2"><a class="reference internal" href="what_is_arch.html">What is Arch</a></li>
|
||
<li class="toctree-l2 has-children"><a class="reference internal" href="architecture/architecture.html">Technical Architecture</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="architecture/intro/terminology.html">Terminology</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="architecture/intro/threading_model.html">Threading model</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="architecture/listeners/listeners.html">Listener</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="architecture/prompt_processing/prompt_processing.html">Prompts</a></li>
|
||
|
||
|
||
|
||
|
||
<li class="toctree-l3"><a class="reference internal" href="architecture/listeners/llm_provider.html">LLM Provider</a></li>
|
||
|
||
<li class="toctree-l3"><a class="reference internal" href="architecture/model_serving/model_serving.html">Model Serving</a></li>
|
||
</ul>
|
||
</details></li>
|
||
<li class="toctree-l2 current active"><a class="current reference internal" href="#">Life of a Request</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="getting_help.html">Getting help</a></li>
|
||
</ul>
|
||
</details></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../getting_started/getting_started.html">Getting Started</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../getting_started/use_cases/rag.html">Retrieval-Augmented (RAG)</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../getting_started/use_cases/function_calling.html">Agentic (Text-to-Action) Apps</a></li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../observability/observability.html">Observability</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../observability/tracing.html">Tracing</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../observability/stats.html">Monitoring</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../observability/access_logs.html">Access Logging</a></li>
|
||
</ul>
|
||
</details></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../llms/llms.html">LLMs</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../configuration_reference.html">Configuration Reference</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</nav></div>
|
||
</div>
|
||
|
||
|
||
<div class="sidebar-primary-items__end sidebar-primary__section">
|
||
</div>
|
||
|
||
<div id="rtd-footer-container"></div>
|
||
|
||
|
||
</div>
|
||
|
||
<main id="main-content" class="bd-main" role="main">
|
||
|
||
|
||
|
||
<div class="sbt-scroll-pixel-helper"></div>
|
||
|
||
<div class="bd-content">
|
||
<div class="bd-article-container">
|
||
|
||
<div class="bd-header-article d-print-none">
|
||
<div class="header-article-items header-article__inner">
|
||
|
||
<div class="header-article-items__start">
|
||
|
||
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<span class="fa-solid fa-bars"></span>
|
||
</button></div>
|
||
|
||
</div>
|
||
|
||
|
||
<div class="header-article-items__end">
|
||
|
||
<div class="header-article-item">
|
||
|
||
<div class="article-header-buttons">
|
||
|
||
|
||
|
||
|
||
|
||
<div class="dropdown dropdown-download-buttons">
|
||
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
|
||
<i class="fas fa-download"></i>
|
||
</button>
|
||
<ul class="dropdown-menu">
|
||
|
||
|
||
|
||
<li><a href="../_sources/intro/life_of_a_request.rst" target="_blank"
|
||
class="btn btn-sm btn-download-source-button dropdown-item"
|
||
title="Download source file"
|
||
data-bs-placement="left" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-file"></i>
|
||
</span>
|
||
<span class="btn__text-container">.rst</span>
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
<li>
|
||
<button onclick="window.print()"
|
||
class="btn btn-sm btn-download-pdf-button dropdown-item"
|
||
title="Print to PDF"
|
||
data-bs-placement="left" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-file-pdf"></i>
|
||
</span>
|
||
<span class="btn__text-container">.pdf</span>
|
||
</button>
|
||
</li>
|
||
|
||
</ul>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
<button onclick="toggleFullScreen()"
|
||
class="btn btn-sm btn-fullscreen-button"
|
||
title="Fullscreen mode"
|
||
data-bs-placement="bottom" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-expand"></i>
|
||
</span>
|
||
|
||
</button>
|
||
|
||
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
|
||
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
|
||
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
|
||
</button>
|
||
`);
|
||
</script>
|
||
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
|
||
</button>
|
||
`);
|
||
</script>
|
||
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<span class="fa-solid fa-list"></span>
|
||
</button>
|
||
</div></div>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div id="jb-print-docs-body" class="onlyprint">
|
||
<h1>Life of a Request</h1>
|
||
<!-- Table of contents -->
|
||
<div id="print-main-content">
|
||
<div id="jb-print-toc">
|
||
|
||
<div>
|
||
<h2> Contents </h2>
|
||
</div>
|
||
<nav aria-label="Page">
|
||
<ul class="visible nav section-nav flex-column">
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#terminology">Terminology</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#network-topology">Network topology</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#high-level-architecture">High level architecture</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#configuration">Configuration</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#request-flow-ingress">Request Flow (Ingress)</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#request-flow-egress">Request Flow (Egress)</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#id2">Overview</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#post-request-processing">Post-request processing</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div id="searchbox"></div>
|
||
<article class="bd-article">
|
||
|
||
<section id="life-of-a-request">
|
||
<span id="id1"></span><h1>Life of a Request<a class="headerlink" href="#life-of-a-request" title="Link to this heading">#</a></h1>
|
||
<p>Below we describe the events in the life of a request passing through an Arch gateway instance. We first
|
||
describe how Arch fits into the request path and then the internal events that take place following
|
||
the arrival of a request at Arch from downstream clients. We follow the request until the corresponding
|
||
dispatch upstream and the response path.</p>
|
||
<a class="reference internal image-reference" href="../_images/network-topology-ingress-egress.jpg"><img alt="../_images/network-topology-ingress-egress.jpg" class="align-center" src="../_images/network-topology-ingress-egress.jpg" style="width: 100%;" />
|
||
</a>
|
||
<section id="terminology">
|
||
<h2>Terminology<a class="headerlink" href="#terminology" title="Link to this heading">#</a></h2>
|
||
<p>We recommend that you get familiar with some of the <a class="reference internal" href="architecture/intro/terminology.html#arch-terminology"><span class="std std-ref">terminology</span></a> used in Arch
|
||
before reading this section.</p>
|
||
</section>
|
||
<section id="network-topology">
|
||
<h2>Network topology<a class="headerlink" href="#network-topology" title="Link to this heading">#</a></h2>
|
||
<p>How a request flows through the components in a network (including Arch) depends on the network’s topology.
|
||
Arch can be used in a wide variety of networking topologies. We focus on the inner operation of Arch below,
|
||
but briefly we address how Arch relates to the rest of the network in this section.</p>
|
||
<ul class="simple">
|
||
<li><p><strong>Downstream(Ingress)</strong> listeners take requests from downstream clients like a web UI or clients that forward
|
||
prompts to your local application. Responses from the application flow back through Arch to the downstream.</p></li>
|
||
<li><p><strong>Upstream(Egress)</strong> listeners take requests from the application and forward them to LLMs.</p></li>
|
||
</ul>
|
||
<a class="reference internal image-reference" href="../_images/network-topology-ingress-egress.jpg"><img alt="../_images/network-topology-ingress-egress.jpg" class="align-center" src="../_images/network-topology-ingress-egress.jpg" style="width: 100%;" />
|
||
</a>
|
||
<p>In practice, Arch can be deployed on the edge and as an internal load balancer between AI agents. A request path may
|
||
traverse multiple Arch gateways:</p>
|
||
<a class="reference internal image-reference" href="../_images/network-topology-agent.jpg"><img alt="../_images/network-topology-agent.jpg" class="align-center" src="../_images/network-topology-agent.jpg" style="width: 100%;" />
|
||
</a>
|
||
</section>
|
||
<section id="high-level-architecture">
|
||
<h2>High level architecture<a class="headerlink" href="#high-level-architecture" title="Link to this heading">#</a></h2>
|
||
<p>Arch is a set of <strong>two</strong> self-contained processes that are designed to run alongside your application servers
|
||
(or on a separate server connected to your application servers via a network). The first process is designated
|
||
to manage HTTP-level networking and connection management concerns (protocol management, request id generation,
|
||
header sanitization, etc.), and the other process is for <strong>model serving</strong>, which helps Arch make intelligent
|
||
decisions about the incoming prompts. The model server hosts the purpose-built <a class="reference internal" href="../llms/llms.html#llms-in-arch"><span class="std std-ref">LLMs</span></a> to
|
||
manage several critical, but undifferentiated, prompt related tasks on behalf of developers.</p>
|
||
<p>The request processing path in Arch has three main parts:</p>
|
||
<ul class="simple">
|
||
<li><p><a class="reference internal" href="architecture/listeners/listeners.html#arch-overview-listeners"><span class="std std-ref">Listener subsystem</span></a> which handles <strong>downstream</strong> and <strong>upstream</strong> request
|
||
processing. It is responsible for managing the downstream (ingress) and the upstream (egress) request
|
||
lifecycle. The downstream and upstream HTTP/2 codec lives here.</p></li>
|
||
<li><p><a class="reference internal" href="architecture/prompt_processing/prompt_processing.html#arch-overview-prompt-handling"><span class="std std-ref">Prompt handler subsystem</span></a> which is responsible for selecting and
|
||
forwarding prompts to <code class="docutils literal notranslate"><span class="pre">prompt_targets</span></code> and establishes the lifecycle of any <strong>upstream</strong> connection to a
|
||
hosted endpoint that implements domain-specific business logic for incoming prompts. This is where knowledge
|
||
of targets and endpoint health, load balancing and connection pooling exists.</p></li>
|
||
<li><p><a class="reference internal" href="architecture/model_serving/model_serving.html#arch-model-serving"><span class="std std-ref">Model serving subsystem</span></a> which helps Arch make intelligent decisions about the
|
||
incoming prompts. The model server is designed to call the purpose-built <a class="reference internal" href="../llms/llms.html#llms-in-arch"><span class="std std-ref">LLMs</span></a> in Arch.</p></li>
|
||
</ul>
|
||
<p>The three subsystems are bridged with the HTTP router filter and the cluster manager subsystems of Envoy.</p>
|
||
<p>Also, Arch utilizes <a class="reference external" href="https://blog.envoyproxy.io/envoy-threading-model-a8d44b922310">Envoy's event-based threading model</a>.
|
||
A main thread is responsible for the server lifecycle, configuration processing, stats, etc. and some number of
|
||
<a class="reference internal" href="architecture/intro/threading_model.html#arch-overview-threading"><span class="std std-ref">worker threads</span></a> process requests. All threads operate around an event loop (<a class="reference external" href="https://libevent.org/">libevent</a>)
|
||
and any given downstream TCP connection will be handled by exactly one worker thread for its lifetime. Each worker
|
||
thread maintains its own pool of TCP connections to upstream endpoints.</p>
|
||
<p>Worker threads rarely share state and operate in a trivially parallel fashion. This threading model
|
||
enables scaling to very high core count CPUs.</p>
|
||
</section>
|
||
<section id="configuration">
|
||
<h2>Configuration<a class="headerlink" href="#configuration" title="Link to this heading">#</a></h2>
|
||
<p>Today, Arch only supports a static bootstrap configuration file, for simplicity:</p>
|
||
<div class="highlight-yaml notranslate"><div class="highlight"><pre><span></span><span class="nt">version</span><span class="p">:</span><span class="w"> </span><span class="s">"0.1-beta"</span>
|
||
<span class="nt">listener</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="nt">address</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">127.0.0.1 | 0.0.0.0</span>
|
||
<span class="w"> </span><span class="nt">port_value</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">8080</span><span class="w"> </span><span class="c1">#If you configure port 443, you'll need to update the listener with tls_certificates</span>
|
||
<span class="w"> </span><span class="nt">messages</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">tuple | hugging-face-messages-api</span>
|
||
|
||
<span class="nt">system_prompts</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="nt">name</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">network_assistant</span>
|
||
<span class="w"> </span><span class="nt">content</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">You are a network assistant that just offers facts about the operational health of the network</span>
|
||
|
||
<span class="nt">llm_providers</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="nt">name</span><span class="p">:</span><span class="w"> </span><span class="s">"OpenAI"</span>
|
||
<span class="w"> </span><span class="nt">access_key</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">$OPEN_AI_KEY</span>
|
||
<span class="w"> </span><span class="nt">model</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">gpt-4o</span>
|
||
<span class="w"> </span><span class="nt">default</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span>
|
||
<span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="nt">name</span><span class="p">:</span><span class="w"> </span><span class="s">"Mistral"</span>
|
||
<span class="w"> </span><span class="nt">access_key</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">$MISTRAL_KEY</span>
|
||
<span class="w"> </span><span class="nt">model</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">mixtral8-7B</span>
|
||
|
||
<span class="nt">prompt_endpoints</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="s">"http://127.0.0.2"</span>
|
||
<span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="s">"http://127.0.0.1"</span>
|
||
|
||
<span class="nt">prompt_guards</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="nt">input-guard</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="nt">name</span><span class="p">:</span><span class="w"> </span><span class="c1">#jailbreak</span>
|
||
<span class="w"> </span><span class="nt">on-exception-message</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">Looks like you are curious about my abilities. But I can only</span>
|
||
|
||
<span class="nt">prompt_targets</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="nt">name</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">information_extraction</span>
|
||
<span class="w"> </span><span class="nt">type</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">RAG</span>
|
||
<span class="w"> </span><span class="nt">description</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">this prompt handles all information extractions scenarios</span>
|
||
<span class="w"> </span><span class="nt">path</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">/agent/summary</span>
|
||
|
||
<span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="nt">name</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">reboot_network_device</span>
|
||
<span class="w"> </span><span class="nt">path</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">/agent/action</span>
|
||
<span class="w"> </span><span class="nt">description</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">used to help network operators with perform device operations like rebooting a device.</span>
|
||
<span class="w"> </span><span class="nt">parameters</span><span class="p">:</span>
|
||
<span class="nt">error_target</span><span class="p">:</span><span class="w"> </span><span class="c1">#handle errors from Bolt or upstream LLMs</span>
|
||
<span class="w"> </span><span class="nt">name</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">“error_handler”</span>
|
||
<span class="w"> </span><span class="nt">path</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">/errors</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="request-flow-ingress">
|
||
<h2>Request Flow (Ingress)<a class="headerlink" href="#request-flow-ingress" title="Link to this heading">#</a></h2>
|
||
<section id="overview">
|
||
<h3>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h3>
|
||
<p>A brief outline of the life cycle of a request and response using the example configuration above:</p>
|
||
<ol class="arabic simple">
|
||
<li><p><strong>TCP Connection Establishment</strong>:
|
||
A TCP connection from downstream is accepted by an Arch listener running on a worker thread.
|
||
The listener filter chain provides SNI and other pre-TLS information. The transport socket, typically TLS,
|
||
decrypts incoming data for processing.</p></li>
|
||
<li><p><strong>Prompt Guardrails Check</strong>:
|
||
Arch first checks the incoming prompts for guardrails such as jailbreak attempts. This ensures
|
||
that harmful or unwanted behaviors are detected early in the request processing pipeline.</p></li>
|
||
<li><p><strong>Intent Matching</strong>:
|
||
The decrypted data stream is deframed by the HTTP/2 codec in Arch’s HTTP connection manager. Arch performs
|
||
intent matching via its <strong>prompt-handler</strong> subsystem using the name and description of the defined prompt targets,
|
||
determining which endpoint should handle the prompt.</p></li>
|
||
<li><p><strong>Parameter Gathering with Arch-FC</strong>:
|
||
If a prompt target requires specific parameters, Arch engages Arch-FC to extract the necessary details
|
||
from the incoming prompt(s). This process gathers the critical information needed for downstream API calls.</p></li>
|
||
<li><p><strong>API Call Execution</strong>:
|
||
Arch routes the prompt to the appropriate backend API or function call. If an endpoint cluster is identified,
|
||
load balancing is performed, circuit breakers are checked, and the request is proxied to the upstream endpoint.</p></li>
|
||
<li><p><strong>Default Summarization by Upstream LLM</strong>:
|
||
By default, if no specific endpoint processing is needed, the prompt is sent to an upstream LLM for summarization.
|
||
This ensures that responses are concise and relevant, enhancing user experience in RAG (Retrieval-Augmented Generation)
|
||
and agentic applications.</p></li>
|
||
<li><p><strong>Error Handling and Forwarding</strong>:
|
||
Errors encountered during processing, such as failed function calls or guardrail detections, are forwarded to
|
||
designated error targets. Error details are communicated through specific headers to the application:</p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">X-Function-Error-Code</span></code>: Code indicating the type of function call error.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">X-Prompt-Guard-Error-Code</span></code>: Code specifying violations detected by prompt guardrails.</p></li>
|
||
<li><p>Additional headers carry messages and timestamps to aid in debugging and logging.</p></li>
|
||
</ul>
|
||
</li>
|
||
<li><p><strong>Response Handling</strong>:
|
||
The upstream endpoint’s TLS transport socket encrypts the response, which is then proxied back downstream.
|
||
Responses pass through HTTP filters in reverse order, ensuring any necessary processing or modification before final delivery.</p></li>
|
||
</ol>
|
||
</section>
|
||
</section>
|
||
<section id="request-flow-egress">
|
||
<h2>Request Flow (Egress)<a class="headerlink" href="#request-flow-egress" title="Link to this heading">#</a></h2>
|
||
</section>
|
||
<section id="id2">
|
||
<h2>Overview<a class="headerlink" href="#id2" title="Link to this heading">#</a></h2>
|
||
<p>A brief outline of the life cycle of a request and response in the context of egress traffic from an application
|
||
to Large Language Models (LLMs) via Arch:</p>
|
||
<ol class="arabic simple">
|
||
<li><p><strong>HTTP Connection Establishment to LLM</strong>:
|
||
Arch initiates an HTTP connection to the upstream LLM service. This connection is handled by Arch’s egress listener
|
||
running on a worker thread. The connection typically uses a secure transport protocol such as HTTPS, ensuring the
|
||
prompt data is encrypted before being sent to the LLM service.</p></li>
|
||
<li><p><strong>Rate Limiting</strong>:
|
||
Before sending the request to the LLM, Arch applies rate-limiting policies to ensure that the upstream LLM service
|
||
is not overwhelmed by excessive traffic. Rate limits are enforced per client or service, ensuring fair usage and
|
||
preventing accidental or malicious overload. If the rate limit is exceeded, Arch may return an appropriate HTTP
|
||
error (e.g., 429 Too Many Requests) without sending the prompt to the LLM.</p></li>
|
||
<li><p><strong>Load Balancing to (hosted) LLM Endpoints</strong>:
|
||
After passing the rate-limiting checks, Arch routes the prompt to the appropriate LLM endpoint.
|
||
If multiple LLM provider instances are available, load balancing is performed to distribute traffic evenly
|
||
across the instances. Arch checks the health of the LLM endpoints using circuit breakers and health checks,
|
||
ensuring that the prompt is only routed to a healthy, responsive instance.</p></li>
|
||
<li><p><strong>Response Reception and Forwarding</strong>:
|
||
Once the LLM processes the prompt, Arch receives the response from the LLM service. The response is typically a
|
||
generated text, completion, or summarization. Upon reception, Arch decrypts (if necessary) and handles the response,
|
||
passing it through any egress processing pipeline defined by the application, such as logging or additional response filtering.</p></li>
|
||
</ol>
|
||
<section id="post-request-processing">
|
||
<h3>Post-request processing<a class="headerlink" href="#post-request-processing" title="Link to this heading">#</a></h3>
|
||
<p>Once a request completes, the stream is destroyed. The following also take place:</p>
|
||
<ul class="simple">
|
||
<li><p>The post-request <a class="reference internal" href="../observability/stats.html#monitoring"><span class="std std-ref">monitoring</span></a> statistics are updated (e.g. timing, active requests, upgrades, health checks).
|
||
Some statistics are updated earlier, however, during request processing. Stats are batched and written by the main
|
||
thread periodically.</p></li>
|
||
<li><p><a class="reference internal" href="../observability/access_logs.html#arch-access-logging"><span class="std std-ref">Access logs</span></a> are written to the access log.</p></li>
|
||
<li><p><a class="reference internal" href="../observability/tracing.html#arch-overview-tracing"><span class="std std-ref">Trace</span></a> spans are finalized. If our example request was traced, a
|
||
trace span, describing the duration and details of the request, would be created by the HCM when
|
||
processing request headers and then finalized by the HCM during post-request processing.</p></li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
</section>
|
||
|
||
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<footer class="prev-next-footer d-print-none">
|
||
|
||
<div class="prev-next-area">
|
||
<a class="left-prev"
|
||
href="architecture/model_serving/model_serving.html"
|
||
title="previous page">
|
||
<i class="fa-solid fa-angle-left"></i>
|
||
<div class="prev-next-info">
|
||
<p class="prev-next-subtitle">previous</p>
|
||
<p class="prev-next-title">Model Serving</p>
|
||
</div>
|
||
</a>
|
||
<a class="right-next"
|
||
href="getting_help.html"
|
||
title="next page">
|
||
<div class="prev-next-info">
|
||
<p class="prev-next-subtitle">next</p>
|
||
<p class="prev-next-title">Getting help</p>
|
||
</div>
|
||
<i class="fa-solid fa-angle-right"></i>
|
||
</a>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
|
||
|
||
|
||
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
|
||
|
||
|
||
<div class="sidebar-secondary-item">
|
||
<div class="page-toc tocsection onthispage">
|
||
<i class="fa-solid fa-list"></i> Contents
|
||
</div>
|
||
<nav class="bd-toc-nav page-toc">
|
||
<ul class="visible nav section-nav flex-column">
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#terminology">Terminology</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#network-topology">Network topology</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#high-level-architecture">High level architecture</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#configuration">Configuration</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#request-flow-ingress">Request Flow (Ingress)</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#request-flow-egress">Request Flow (Egress)</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#id2">Overview</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#post-request-processing">Post-request processing</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
</nav></div>
|
||
|
||
</div></div>
|
||
|
||
|
||
</div>
|
||
<footer class="bd-footer-content">
|
||
|
||
<div class="bd-footer-content__inner container">
|
||
|
||
<div class="footer-item">
|
||
|
||
<p class="component-author">
|
||
By Katanemo Labs, Inc
|
||
</p>
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
|
||
<p class="copyright">
|
||
|
||
© Copyright 2024, Katanemo Labs, Inc.
|
||
<br/>
|
||
|
||
</p>
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</footer>
|
||
|
||
|
||
</main>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- Scripts loaded after <body> so the DOM is not blocked -->
|
||
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
|
||
<footer class="bd-footer">
|
||
</footer>
|
||
</body>
|
||
</html> |