mirror of
https://github.com/katanemo/plano.git
synced 2026-05-15 11:02:39 +02:00
506 lines
No EOL
20 KiB
HTML
506 lines
No EOL
20 KiB
HTML
|
||
<!DOCTYPE html>
|
||
|
||
|
||
<html lang="en" data-content_root="../../../" >
|
||
|
||
<head>
|
||
<meta charset="utf-8" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||
|
||
<title>Model Serving — Arch 0.1-beta documentation</title>
|
||
|
||
|
||
|
||
<script data-cfasync="false">
|
||
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
|
||
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
|
||
</script>
|
||
|
||
<!-- Loaded before other Sphinx assets -->
|
||
<link href="../../../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link href="../../../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
|
||
|
||
<link href="../../../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
|
||
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=a746c00c" />
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/styles/sphinx-book-theme.css?v=a3416100" />
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css?v=76b2166b" />
|
||
|
||
<!-- Pre-loaded scripts that we'll load fully later -->
|
||
<link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
|
||
<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
|
||
<script src="../../../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
|
||
<script src="../../../_static/documentation_options.js?v=2742c0eb"></script>
|
||
<script src="../../../_static/doctools.js?v=9a2dae69"></script>
|
||
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../../../_static/clipboard.min.js?v=a7894cd8"></script>
|
||
<script src="../../../_static/copybutton.js?v=f281be69"></script>
|
||
<script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
|
||
<script>DOCUMENTATION_OPTIONS.pagename = 'intro/architecture/model_serving/model_serving';</script>
|
||
<link rel="icon" href="../../../_static/favicon.ico"/>
|
||
<link rel="index" title="Index" href="../../../genindex.html" />
|
||
<link rel="search" title="Search" href="../../../search.html" />
|
||
<link rel="next" title="Life of a Request" href="../../life_of_a_request.html" />
|
||
<link rel="prev" title="LLM Provider" href="../listeners/llm_provider.html" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||
<meta name="docsearch:language" content="en"/>
|
||
</head>
|
||
|
||
|
||
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
|
||
|
||
|
||
|
||
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
|
||
|
||
<div id="pst-scroll-pixel-helper"></div>
|
||
|
||
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
|
||
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
|
||
|
||
|
||
<input type="checkbox"
|
||
class="sidebar-toggle"
|
||
id="pst-primary-sidebar-checkbox"/>
|
||
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
|
||
|
||
<input type="checkbox"
|
||
class="sidebar-toggle"
|
||
id="pst-secondary-sidebar-checkbox"/>
|
||
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
|
||
|
||
<div class="search-button__wrapper">
|
||
<div class="search-button__overlay"></div>
|
||
<div class="search-button__search-container">
|
||
<form class="bd-search d-flex align-items-center"
|
||
action="../../../search.html"
|
||
method="get">
|
||
<i class="fa-solid fa-magnifying-glass"></i>
|
||
<input type="search"
|
||
class="form-control"
|
||
name="q"
|
||
id="search-input"
|
||
placeholder="Search..."
|
||
aria-label="Search..."
|
||
autocomplete="off"
|
||
autocorrect="off"
|
||
autocapitalize="off"
|
||
spellcheck="false"/>
|
||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
|
||
</form></div>
|
||
</div>
|
||
|
||
<div class="pst-async-banner-revealer d-none">
|
||
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
|
||
</div>
|
||
|
||
|
||
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
|
||
</header>
|
||
|
||
|
||
<div class="bd-container">
|
||
<div class="bd-container__inner bd-page-width">
|
||
|
||
|
||
|
||
<div class="bd-sidebar-primary bd-sidebar">
|
||
|
||
|
||
|
||
<div class="sidebar-header-items sidebar-primary__section">
|
||
|
||
|
||
|
||
|
||
</div>
|
||
|
||
<div class="sidebar-primary-items__start sidebar-primary__section">
|
||
<div class="sidebar-primary-item">
|
||
|
||
|
||
|
||
|
||
|
||
<a class="navbar-brand logo" href="../../../root.html">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<img src="../../../_static/arch-nav-logo.png" class="logo__image only-light" alt="Arch 0.1-beta documentation - Home"/>
|
||
<script>document.write(`<img src="../../../_static/arch-nav-logo.png" class="logo__image only-dark" alt="Arch 0.1-beta documentation - Home"/>`);</script>
|
||
|
||
|
||
</a></div>
|
||
<div class="sidebar-primary-item">
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="fa-solid fa-magnifying-glass"></i>
|
||
<span class="search-button__default-text">Search</span>
|
||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
|
||
</button>
|
||
`);
|
||
</script></div>
|
||
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
|
||
<div class="bd-toc-item navbar-nav active">
|
||
<ul class="current nav bd-sidenav">
|
||
<li class="toctree-l1 current active has-children"><a class="reference internal" href="../../intro.html">Introduction</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
|
||
<li class="toctree-l2"><a class="reference internal" href="../../what_is_arch.html">What is Arch</a></li>
|
||
<li class="toctree-l2 current active has-children"><a class="reference internal" href="../architecture.html">Technical Architecture</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
|
||
<li class="toctree-l3"><a class="reference internal" href="../intro/terminology.html">Terminology</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="../intro/threading_model.html">Threading model</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="../listeners/listeners.html">Listener</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="../prompt_processing/prompt_processing.html">Prompts</a></li>
|
||
|
||
|
||
|
||
|
||
<li class="toctree-l3"><a class="reference internal" href="../listeners/llm_provider.html">LLM Provider</a></li>
|
||
|
||
<li class="toctree-l3 current active"><a class="current reference internal" href="#">Model Serving</a></li>
|
||
</ul>
|
||
</details></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../life_of_a_request.html">Life of a Request</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../getting_help.html">Getting help</a></li>
|
||
</ul>
|
||
</details></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../getting_started/getting_started.html">Getting Started</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../getting_started/use_cases/rag.html">Retrieval-Augmented (RAG)</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../getting_started/use_cases/function_calling.html">Agentic (Text-to-Action) Apps</a></li>
|
||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../observability/observability.html">Observability</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../observability/tracing.html">Tracing</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../observability/stats.html">Monitoring</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="../../../observability/access_logs.html">Access Logging</a></li>
|
||
</ul>
|
||
</details></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../llms/llms.html">LLMs</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../configuration_reference.html">Configuration Reference</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</nav></div>
|
||
</div>
|
||
|
||
|
||
<div class="sidebar-primary-items__end sidebar-primary__section">
|
||
</div>
|
||
|
||
<div id="rtd-footer-container"></div>
|
||
|
||
|
||
</div>
|
||
|
||
<main id="main-content" class="bd-main" role="main">
|
||
|
||
|
||
|
||
<div class="sbt-scroll-pixel-helper"></div>
|
||
|
||
<div class="bd-content">
|
||
<div class="bd-article-container">
|
||
|
||
<div class="bd-header-article d-print-none">
|
||
<div class="header-article-items header-article__inner">
|
||
|
||
<div class="header-article-items__start">
|
||
|
||
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<span class="fa-solid fa-bars"></span>
|
||
</button></div>
|
||
|
||
</div>
|
||
|
||
|
||
<div class="header-article-items__end">
|
||
|
||
<div class="header-article-item">
|
||
|
||
<div class="article-header-buttons">
|
||
|
||
|
||
|
||
|
||
|
||
<div class="dropdown dropdown-download-buttons">
|
||
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
|
||
<i class="fas fa-download"></i>
|
||
</button>
|
||
<ul class="dropdown-menu">
|
||
|
||
|
||
|
||
<li><a href="../../../_sources/intro/architecture/model_serving/model_serving.rst" target="_blank"
|
||
class="btn btn-sm btn-download-source-button dropdown-item"
|
||
title="Download source file"
|
||
data-bs-placement="left" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-file"></i>
|
||
</span>
|
||
<span class="btn__text-container">.rst</span>
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
<li>
|
||
<button onclick="window.print()"
|
||
class="btn btn-sm btn-download-pdf-button dropdown-item"
|
||
title="Print to PDF"
|
||
data-bs-placement="left" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-file-pdf"></i>
|
||
</span>
|
||
<span class="btn__text-container">.pdf</span>
|
||
</button>
|
||
</li>
|
||
|
||
</ul>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
<button onclick="toggleFullScreen()"
|
||
class="btn btn-sm btn-fullscreen-button"
|
||
title="Fullscreen mode"
|
||
data-bs-placement="bottom" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-expand"></i>
|
||
</span>
|
||
|
||
</button>
|
||
|
||
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
|
||
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
|
||
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
|
||
</button>
|
||
`);
|
||
</script>
|
||
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
|
||
</button>
|
||
`);
|
||
</script>
|
||
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<span class="fa-solid fa-list"></span>
|
||
</button>
|
||
</div></div>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div id="jb-print-docs-body" class="onlyprint">
|
||
<h1>Model Serving</h1>
|
||
<!-- Table of contents -->
|
||
<div id="print-main-content">
|
||
<div id="jb-print-toc">
|
||
|
||
<div>
|
||
<h2> Contents </h2>
|
||
</div>
|
||
<nav aria-label="Page">
|
||
<ul class="visible nav section-nav flex-column">
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#local-serving-cpu-moderate">Local Serving (CPU - Moderate)</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#local-serving-gpu-fast">Local Serving (GPU- Fast)</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#cloud-serving-gpu-blazing-fast">Cloud Serving (GPU - Blazing Fast)</a></li>
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div id="searchbox"></div>
|
||
<article class="bd-article">
|
||
|
||
<section id="model-serving">
|
||
<span id="arch-model-serving"></span><h1>Model Serving<a class="headerlink" href="#model-serving" title="Link to this heading">#</a></h1>
|
||
<p>Arch is a set of <strong>two</strong> self-contained processes that are designed to run alongside your application
|
||
servers (or on a separate host connected via a network). The first process is designated to manage low-level
|
||
networking and HTTP related comcerns, and the other process is for <strong>model serving</strong>, which helps Arch make
|
||
intelligent decisions about the incoming prompts. The model server is designed to call the purpose-built
|
||
<a class="reference internal" href="../../../llms/llms.html#llms-in-arch"><span class="std std-ref">LLMs</span></a> in Arch.</p>
|
||
<a class="reference internal image-reference" href="../../../_images/arch-system-architecture.jpg"><img alt="../../../_images/arch-system-architecture.jpg" class="align-center" src="../../../_images/arch-system-architecture.jpg" style="width: 50%;" />
|
||
</a>
|
||
<hr class="docutils" />
|
||
<p>Arch’ is designed to be deployed in your cloud VPC, on a on-premises host, and can work on devices that don’t
|
||
have a GPU. Note, GPU devices are need for fast and cost-efficient use, so that Arch (model server, specifically)
|
||
can process prompts quickly and forward control back to the applicaton host. There are three modes in which Arch
|
||
can be configured to run its <strong>model server</strong> subsystem:</p>
|
||
<section id="local-serving-cpu-moderate">
|
||
<h2>Local Serving (CPU - Moderate)<a class="headerlink" href="#local-serving-cpu-moderate" title="Link to this heading">#</a></h2>
|
||
<p>The following bash commands enable you to configure the model server subsystem in Arch to run local on device
|
||
and only use CPU devices. This will be the slowest option but can be useful in dev/test scenarios where GPUs
|
||
might not be available.</p>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>archgw<span class="w"> </span>up<span class="w"> </span>--local<span class="w"> </span>-cpu
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="local-serving-gpu-fast">
|
||
<h2>Local Serving (GPU- Fast)<a class="headerlink" href="#local-serving-gpu-fast" title="Link to this heading">#</a></h2>
|
||
<p>The following bash commands enable you to configure the model server subsystem in Arch to run locally on the
|
||
machine and utilize the GPU available for fast inference across all model use cases, including function calling
|
||
guardails, etc.</p>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>archgw<span class="w"> </span>up<span class="w"> </span>--local
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="cloud-serving-gpu-blazing-fast">
|
||
<h2>Cloud Serving (GPU - Blazing Fast)<a class="headerlink" href="#cloud-serving-gpu-blazing-fast" title="Link to this heading">#</a></h2>
|
||
<p>The command below instructs Arch to intelligently use GPUs locally for fast intent detection, but default to
|
||
cloud serving for function calling and guardails scenarios to dramatically improve the speed and overall performance
|
||
of your applications.</p>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>archgw<span class="w"> </span>up
|
||
</pre></div>
|
||
</div>
|
||
<div class="admonition note">
|
||
<p class="admonition-title">Note</p>
|
||
<p>Arch’s model serving in the cloud is priced at $0.05M/token (156x cheaper than GPT-4o) with averlage latency
|
||
of 200ms (10x faster than GPT-4o). Please refer to our <a class="reference internal" href="../../../getting_started/getting_started.html#getting-started"><span class="std std-ref">getting started guide</span></a> to know
|
||
how to generate API keys for model serving</p>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
|
||
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<footer class="prev-next-footer d-print-none">
|
||
|
||
<div class="prev-next-area">
|
||
<a class="left-prev"
|
||
href="../listeners/llm_provider.html"
|
||
title="previous page">
|
||
<i class="fa-solid fa-angle-left"></i>
|
||
<div class="prev-next-info">
|
||
<p class="prev-next-subtitle">previous</p>
|
||
<p class="prev-next-title">LLM Provider</p>
|
||
</div>
|
||
</a>
|
||
<a class="right-next"
|
||
href="../../life_of_a_request.html"
|
||
title="next page">
|
||
<div class="prev-next-info">
|
||
<p class="prev-next-subtitle">next</p>
|
||
<p class="prev-next-title">Life of a Request</p>
|
||
</div>
|
||
<i class="fa-solid fa-angle-right"></i>
|
||
</a>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
|
||
|
||
|
||
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
|
||
|
||
|
||
<div class="sidebar-secondary-item">
|
||
<div class="page-toc tocsection onthispage">
|
||
<i class="fa-solid fa-list"></i> Contents
|
||
</div>
|
||
<nav class="bd-toc-nav page-toc">
|
||
<ul class="visible nav section-nav flex-column">
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#local-serving-cpu-moderate">Local Serving (CPU - Moderate)</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#local-serving-gpu-fast">Local Serving (GPU- Fast)</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#cloud-serving-gpu-blazing-fast">Cloud Serving (GPU - Blazing Fast)</a></li>
|
||
</ul>
|
||
</nav></div>
|
||
|
||
</div></div>
|
||
|
||
|
||
</div>
|
||
<footer class="bd-footer-content">
|
||
|
||
<div class="bd-footer-content__inner container">
|
||
|
||
<div class="footer-item">
|
||
|
||
<p class="component-author">
|
||
By Katanemo Labs, Inc
|
||
</p>
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
|
||
<p class="copyright">
|
||
|
||
© Copyright 2024, Katanemo Labs, Inc.
|
||
<br/>
|
||
|
||
</p>
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</footer>
|
||
|
||
|
||
</main>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- Scripts loaded after <body> so the DOM is not blocked -->
|
||
<script src="../../../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
|
||
<footer class="bd-footer">
|
||
</footer>
|
||
</body>
|
||
</html> |