plano/intro/architecture/model_serving/model_serving.html

506 lines
20 KiB
HTML
Raw Normal View History

<!DOCTYPE html>
<html lang="en" data-content_root="../../../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Model Serving &#8212; Arch 0.1-beta documentation</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../../../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../../../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../../../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=a746c00c" />
<link rel="stylesheet" type="text/css" href="../../../_static/styles/sphinx-book-theme.css?v=a3416100" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css?v=76b2166b" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="../../../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../../../_static/documentation_options.js?v=2742c0eb"></script>
<script src="../../../_static/doctools.js?v=9a2dae69"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../../../_static/copybutton.js?v=f281be69"></script>
<script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'intro/architecture/model_serving/model_serving';</script>
<link rel="icon" href="../../../_static/favicon.ico"/>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
<link rel="next" title="Life of a Request" href="../../life_of_a_request.html" />
<link rel="prev" title="LLM Provider" href="../listeners/llm_provider.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../../../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search..."
aria-label="Search..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="../../../root.html">
<img src="../../../_static/arch-nav-logo.png" class="logo__image only-light" alt="Arch 0.1-beta documentation - Home"/>
<script>document.write(`<img src="../../../_static/arch-nav-logo.png" class="logo__image only-dark" alt="Arch 0.1-beta documentation - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item">
<script>
document.write(`
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script></div>
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<ul class="current nav bd-sidenav">
<li class="toctree-l1 current active has-children"><a class="reference internal" href="../../intro.html">Introduction</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../../what_is_arch.html">What is Arch</a></li>
<li class="toctree-l2 current active has-children"><a class="reference internal" href="../architecture.html">Technical Architecture</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="../intro/terminology.html">Terminology</a></li>
<li class="toctree-l3"><a class="reference internal" href="../intro/threading_model.html">Threading model</a></li>
<li class="toctree-l3"><a class="reference internal" href="../listeners/listeners.html">Listener</a></li>
<li class="toctree-l3"><a class="reference internal" href="../prompt_processing/prompt_processing.html">Prompts</a></li>
<li class="toctree-l3"><a class="reference internal" href="../listeners/llm_provider.html">LLM Provider</a></li>
<li class="toctree-l3 current active"><a class="current reference internal" href="#">Model Serving</a></li>
</ul>
</details></li>
<li class="toctree-l2"><a class="reference internal" href="../../life_of_a_request.html">Life of a Request</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../getting_help.html">Getting help</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../getting_started/getting_started.html">Getting Started</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../getting_started/use_cases/rag.html">Retrieval-Augmented (RAG)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../getting_started/use_cases/function_calling.html">Agentic (Text-to-Action) Apps</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../observability/observability.html">Observability</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../observability/tracing.html">Tracing</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../observability/stats.html">Monitoring</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../observability/access_logs.html">Access Logging</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../llms/llms.html">LLMs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../configuration_reference.html">Configuration Reference</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</button></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="../../../_sources/intro/architecture/model_serving/model_serving.rst" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.rst</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</button>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>Model Serving</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#local-serving-cpu-moderate">Local Serving (CPU - Moderate)</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#local-serving-gpu-fast">Local Serving (GPU- Fast)</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#cloud-serving-gpu-blazing-fast">Cloud Serving (GPU - Blazing Fast)</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="model-serving">
<span id="arch-model-serving"></span><h1>Model Serving<a class="headerlink" href="#model-serving" title="Link to this heading">#</a></h1>
<p>Arch is a set of <strong>two</strong> self-contained processes that are designed to run alongside your application
servers (or on a separate host connected via a network). The first process is designated to manage low-level
networking and HTTP-related concerns, and the other process is for <strong>model serving</strong>, which helps Arch make
intelligent decisions about the incoming prompts. The model server is designed to call the purpose-built
<a class="reference internal" href="../../../llms/llms.html#llms-in-arch"><span class="std std-ref">LLMs</span></a> in Arch.</p>
<a class="reference internal image-reference" href="../../../_images/arch-system-architecture.jpg"><img alt="Arch system architecture" class="align-center" src="../../../_images/arch-system-architecture.jpg" style="width: 50%;" />
</a>
<hr class="docutils" />
<p>Arch is designed to be deployed in your cloud VPC or on an on-premises host, and can work on devices that don't
have a GPU. Note that GPU devices are needed for fast and cost-efficient use, so that Arch (model server, specifically)
can process prompts quickly and forward control back to the application host. There are three modes in which Arch
can be configured to run its <strong>model server</strong> subsystem:</p>
<section id="local-serving-cpu-moderate">
<h2>Local Serving (CPU - Moderate)<a class="headerlink" href="#local-serving-cpu-moderate" title="Link to this heading">#</a></h2>
<p>The following bash command enables you to configure the model server subsystem in Arch to run locally on the device
and only use CPU devices. This will be the slowest option but can be useful in dev/test scenarios where GPUs
might not be available.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>archgw<span class="w"> </span>up<span class="w"> </span>--local<span class="w"> </span>-cpu
</pre></div>
</div>
</section>
<section id="local-serving-gpu-fast">
<h2>Local Serving (GPU- Fast)<a class="headerlink" href="#local-serving-gpu-fast" title="Link to this heading">#</a></h2>
<p>The following bash command enables you to configure the model server subsystem in Arch to run locally on the
machine and utilize the available GPU for fast inference across all model use cases, including function calling,
guardrails, etc.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>archgw<span class="w"> </span>up<span class="w"> </span>--local
</pre></div>
</div>
</section>
<section id="cloud-serving-gpu-blazing-fast">
<h2>Cloud Serving (GPU - Blazing Fast)<a class="headerlink" href="#cloud-serving-gpu-blazing-fast" title="Link to this heading">#</a></h2>
<p>The command below instructs Arch to intelligently use GPUs locally for fast intent detection, but default to
cloud serving for function calling and guardrails scenarios to dramatically improve the speed and overall performance
of your applications.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>archgw<span class="w"> </span>up
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Arch's model serving in the cloud is priced at $0.05 per 1M tokens (156x cheaper than GPT-4o) with an average latency
of 200ms (10x faster than GPT-4o). Please refer to our <a class="reference internal" href="../../../getting_started/getting_started.html#getting-started"><span class="std std-ref">getting started guide</span></a> to learn
how to generate API keys for model serving.</p>
</div>
</section>
</section>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="../listeners/llm_provider.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">LLM Provider</p>
</div>
</a>
<a class="right-next"
href="../../life_of_a_request.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Life of a Request</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#local-serving-cpu-moderate">Local Serving (CPU - Moderate)</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#local-serving-gpu-fast">Local Serving (GPU- Fast)</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#cloud-serving-gpu-blazing-fast">Cloud Serving (GPU - Blazing Fast)</a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Katanemo Labs, Inc
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2024, Katanemo Labs, Inc.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>