mirror of
https://github.com/katanemo/plano.git
synced 2026-05-15 11:02:39 +02:00
Use intent model from archfc to pick prompt gateway (#328)
This commit is contained in:
parent
67b8fd635e
commit
ba7279becb
151 changed files with 8642 additions and 10932 deletions
|
|
@ -1,18 +1,18 @@
|
|||
.. _arch_agent_guide:
|
||||
|
||||
Agentic Workflow
|
||||
==============================
|
||||
Agentic Apps
|
||||
=============
|
||||
|
||||
Arch helps you easily personalize your applications by calling application-specific (API) functions
|
||||
via user prompts. This involves any predefined functions or APIs you want to expose to users to perform tasks,
|
||||
gather information, or manipulate data. This capability is generally referred to as :ref:`function calling <function_calling>`, where
|
||||
you have the flexibility to support “agentic” apps tailored to specific use cases - from updating insurance
|
||||
claims to creating ad campaigns - via prompts.
|
||||
Arch helps you build personalized agentic applications by calling application-specific (API) functions via user prompts.
|
||||
This involves any predefined functions or APIs you want to expose to users to perform tasks, gather information,
|
||||
or manipulate data. This capability is generally referred to as :ref:`function calling <function_calling>`, where
|
||||
you can support “agentic” apps tailored to specific use cases - from updating insurance claims to creating ad campaigns - via prompts.
|
||||
|
||||
Arch analyzes prompts, extracts critical information from prompts, engages in lightweight conversation with
|
||||
the user to gather any missing parameters and makes API calls so that you can focus on writing business logic.
|
||||
Arch does this via its purpose-built `Arch-Function <https://huggingface.co/collections/katanemo/arch-function-66f209a693ea8df14317ad68>`_ - the fastest (200ms p90 - 10x faser than GPT-4o)
|
||||
and cheapest (100x than GPT-4o) function calling LLM that matches performance with frontier models.
|
||||
Arch analyzes prompts, extracts critical information from prompts, engages in lightweight conversation with the user to
|
||||
gather any missing parameters and makes API calls so that you can focus on writing business logic. Arch does this via its
|
||||
purpose-built `Arch-Function <https://huggingface.co/collections/katanemo/arch-function-66f209a693ea8df14317ad68>`_ -
|
||||
the fastest (200ms p50 - 12x faser than GPT-4o) and cheapest (44x than GPT-4o) function calling LLM that matches or outperforms
|
||||
frontier LLMs.
|
||||
|
||||
.. image:: includes/agent/function-calling-flow.jpg
|
||||
:width: 100%
|
||||
|
|
|
|||
|
|
@ -0,0 +1,39 @@
|
|||
import os
|
||||
import gradio as gr
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
from openai import OpenAI
|
||||
from common import create_gradio_app
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
|
||||
# Define the request model
|
||||
class EnergySourceRequest(BaseModel):
|
||||
energy_source: str
|
||||
consideration: Optional[str] = None
|
||||
|
||||
|
||||
class EnergySourceResponse(BaseModel):
|
||||
energy_source: str
|
||||
consideration: Optional[str] = None
|
||||
|
||||
|
||||
# Post method for device summary
|
||||
@app.post("/agent/energy_source_info")
|
||||
def get_workforce(request: EnergySourceRequest):
|
||||
"""
|
||||
Endpoint to get details about energy source
|
||||
"""
|
||||
considertion = "You don't have any specific consideration. Feel free to talk in a more open ended fashion"
|
||||
|
||||
if request.consideration is not None:
|
||||
considertion = f"Add specific focus on the following consideration when you summarize the content for the energy source: {request.consideration}"
|
||||
|
||||
response = {
|
||||
"energy_source": request.energy_source,
|
||||
"consideration": considertion,
|
||||
}
|
||||
return response
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 852 KiB |
|
|
@ -0,0 +1,35 @@
|
|||
version: v0.1
|
||||
listener:
|
||||
address: 127.0.0.1
|
||||
port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
|
||||
message_format: huggingface
|
||||
|
||||
# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
|
||||
llm_providers:
|
||||
- name: OpenAI
|
||||
provider: openai
|
||||
access_key: $OPENAI_API_KEY
|
||||
model: gpt-3.5-turbo
|
||||
default: true
|
||||
|
||||
# default system prompt used by all prompt targets
|
||||
system_prompt: |
|
||||
You are a helpful assistant and can offer information about energy sources. You will get a JSON object with energy_source and consideration fields. Focus on answering using those fields
|
||||
|
||||
prompt_targets:
|
||||
- name: get_info_for_energy_source
|
||||
description: get information about an energy source
|
||||
parameters:
|
||||
- name: energy_source
|
||||
type: str
|
||||
description: a source of energy
|
||||
required: true
|
||||
enum: [renewable, fossil]
|
||||
- name: consideration
|
||||
type: str
|
||||
description: a specific type of consideration for an energy source
|
||||
enum: [cost, economic, technology]
|
||||
endpoint:
|
||||
name: rag_energy_source_agent
|
||||
path: /agent/energy_source_info
|
||||
http_method: POST
|
||||
|
|
@ -1,162 +0,0 @@
|
|||
from flask import Flask, request, jsonify
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
from langchain.memory import ConversationBufferMemory
|
||||
from langchain.schema import AIMessage, HumanMessage
|
||||
from langchain import OpenAI
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
# Global dictionary to keep track of user memories
|
||||
user_memories = {}
|
||||
|
||||
|
||||
def get_user_conversation(user_id):
|
||||
"""
|
||||
Retrieve the user's conversation memory using LangChain.
|
||||
If the user does not exist, initialize their conversation memory.
|
||||
"""
|
||||
if user_id not in user_memories:
|
||||
user_memories[user_id] = ConversationBufferMemory(return_messages=True)
|
||||
return user_memories[user_id]
|
||||
|
||||
|
||||
def update_user_conversation(user_id, client_messages, intent_changed):
|
||||
"""
|
||||
Update the user's conversation memory with new messages using LangChain.
|
||||
Each message is augmented with a UUID, timestamp, and intent change marker.
|
||||
Only new messages are added to avoid duplication.
|
||||
"""
|
||||
memory = get_user_conversation(user_id)
|
||||
stored_messages = memory.chat_memory.messages
|
||||
|
||||
# Determine the number of stored messages
|
||||
num_stored_messages = len(stored_messages)
|
||||
new_messages = client_messages[num_stored_messages:]
|
||||
|
||||
# Process each new message
|
||||
for index, message in enumerate(new_messages):
|
||||
role = message.get("role")
|
||||
content = message.get("content")
|
||||
metadata = {
|
||||
"uuid": str(uuid.uuid4()),
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"intent_changed": False, # Default value
|
||||
}
|
||||
|
||||
# Mark the intent change on the last message if detected
|
||||
if intent_changed and index == len(new_messages) - 1:
|
||||
metadata["intent_changed"] = True
|
||||
|
||||
# Create a new message with metadata
|
||||
if role == "user":
|
||||
memory.chat_memory.add_message(
|
||||
HumanMessage(content=content, additional_kwargs={"metadata": metadata})
|
||||
)
|
||||
elif role == "assistant":
|
||||
memory.chat_memory.add_message(
|
||||
AIMessage(content=content, additional_kwargs={"metadata": metadata})
|
||||
)
|
||||
else:
|
||||
# Handle other roles if necessary
|
||||
pass
|
||||
|
||||
return memory
|
||||
|
||||
|
||||
def get_messages_since_last_intent(messages):
|
||||
"""
|
||||
Retrieve messages from the last intent change onwards using LangChain.
|
||||
"""
|
||||
messages_since_intent = []
|
||||
for message in reversed(messages):
|
||||
# Insert message at the beginning to maintain correct order
|
||||
messages_since_intent.insert(0, message)
|
||||
metadata = message.additional_kwargs.get("metadata", {})
|
||||
# Break if intent_changed is True
|
||||
if metadata.get("intent_changed", False) == True:
|
||||
break
|
||||
|
||||
return messages_since_intent
|
||||
|
||||
|
||||
def forward_to_llm(messages):
|
||||
"""
|
||||
Forward messages to an upstream LLM using LangChain.
|
||||
"""
|
||||
# Convert messages to a conversation string
|
||||
conversation = ""
|
||||
for message in messages:
|
||||
role = "User" if isinstance(message, HumanMessage) else "Assistant"
|
||||
content = message.content
|
||||
conversation += f"{role}: {content}\n"
|
||||
# Use LangChain's LLM to get a response. This call is proxied through Arch for end-to-end observability and traffic management
|
||||
llm = OpenAI()
|
||||
# Create a prompt that includes the conversation
|
||||
prompt = f"{conversation}Assistant:"
|
||||
response = llm(prompt)
|
||||
return response
|
||||
|
||||
|
||||
@app.route("/process_rag", methods=["POST"])
|
||||
def process_rag():
|
||||
# Extract JSON data from the request
|
||||
data = request.get_json()
|
||||
|
||||
user_id = data.get("user_id")
|
||||
if not user_id:
|
||||
return jsonify({"error": "User ID is required"}), 400
|
||||
|
||||
client_messages = data.get("messages")
|
||||
if not client_messages or not isinstance(client_messages, list):
|
||||
return jsonify({"error": "Messages array is required"}), 400
|
||||
|
||||
# Extract the intent change marker from Arch's headers if present for the current prompt
|
||||
intent_changed_header = request.headers.get("x-arch-intent-marker", "").lower()
|
||||
if intent_changed_header in ["", "false"]:
|
||||
intent_changed = False
|
||||
elif intent_changed_header == "true":
|
||||
intent_changed = True
|
||||
else:
|
||||
# Invalid value provided
|
||||
return (
|
||||
jsonify({"error": "Invalid value for x-arch-prompt-intent-change header"}),
|
||||
400,
|
||||
)
|
||||
|
||||
# Update user conversation based on intent change
|
||||
memory = update_user_conversation(user_id, client_messages, intent_changed)
|
||||
|
||||
# Retrieve messages since last intent change for LLM
|
||||
messages_for_llm = get_messages_since_last_intent(memory.chat_memory.messages)
|
||||
|
||||
# Forward messages to upstream LLM
|
||||
llm_response = forward_to_llm(messages_for_llm)
|
||||
|
||||
# Prepare the messages to return
|
||||
messages_to_return = []
|
||||
for message in memory.chat_memory.messages:
|
||||
role = "user" if isinstance(message, HumanMessage) else "assistant"
|
||||
content = message.content
|
||||
metadata = message.additional_kwargs.get("metadata", {})
|
||||
message_entry = {
|
||||
"uuid": metadata.get("uuid"),
|
||||
"timestamp": metadata.get("timestamp"),
|
||||
"role": role,
|
||||
"content": content,
|
||||
"intent_changed": metadata.get("intent_changed", False),
|
||||
}
|
||||
messages_to_return.append(message_entry)
|
||||
|
||||
# Prepare the response
|
||||
response = {
|
||||
"user_id": user_id,
|
||||
"messages": messages_to_return,
|
||||
"llm_response": llm_response,
|
||||
}
|
||||
|
||||
return jsonify(response), 200
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(debug=True)
|
||||
90
docs/source/build_with_arch/multi_turn.rst
Normal file
90
docs/source/build_with_arch/multi_turn.rst
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
.. _arch_multi_turn_guide:
|
||||
|
||||
Multi-Turn
|
||||
==========
|
||||
Developers often `struggle <https://www.reddit.com/r/LocalLLaMA/comments/18mqwg6/best_practice_for_rag_with_followup_chat/>`_ to efficiently handle
|
||||
``follow-up`` or ``clarification`` questions. Specifically, when users ask for changes or additions to previous responses, it requires developers to
|
||||
re-write prompts using LLMs with precise prompt engineering techniques. This process is slow, manual, error prone and adds latency and token cost for
|
||||
common scenarios that can be managed more efficiently.
|
||||
|
||||
Arch is highly capable of accurately detecting and processing prompts in multi-turn scenarios so that you can buil fast and accurate agents in minutes.
|
||||
Below are some cnversational examples that you can build via Arch. Each example is enriched with annotations (via ** [Arch] ** ) that illustrates how Arch
|
||||
processess conversational messages on your behalf.
|
||||
|
||||
.. Note::
|
||||
The following section assumes that you have some knowledge about the core concepts of Arch, such as :ref:`prompt_targets <arch_overview_prompt_handling>`.
|
||||
If you haven't familizaried yourself with Arch's concepts, we recommend you first read the :ref:`tech overview <tech_overview>` section firtst.
|
||||
Additionally, the conversation examples below assume the usage of the following :ref:`arch_config.yaml <multi_turn_subsection_prompt_target>` file.
|
||||
|
||||
Example 1: Adjusting Retrieval
|
||||
------------------------------
|
||||
.. code-block:: text
|
||||
|
||||
User: What are the benefits of renewable energy?
|
||||
**[Arch]**: Check if there is an available <prompt_target> that can handle this user query.
|
||||
**[Arch]**: Found "get_info_for_energy_source" prompt_target in arch_config.yaml. Forward prompt to the endpoint configured in "get_info_for_energy_source"
|
||||
...
|
||||
Assistant: Renewable energy reduces greenhouse gas emissions, lowers air pollution, and provides sustainable power sources like solar and wind.
|
||||
|
||||
User: Include cost considerations in the response.
|
||||
**[Arch]**: Follow-up detected. Forward prompt history to the "get_info_for_energy_source" prompt_target and post the following parameters consideration="cost"
|
||||
...
|
||||
Assistant: Renewable energy reduces greenhouse gas emissions, lowers air pollution, and provides sustainable power sources like solar and wind. While the initial setup costs can be high, long-term savings from reduced fuel expenses and government incentives make it cost-effective.
|
||||
|
||||
|
||||
Example 2: Switching Intent
|
||||
---------------------------
|
||||
.. code-block:: text
|
||||
|
||||
User: What are the symptoms of diabetes?
|
||||
**[Arch]**: Check if there is an available <prompt_target> that can handle this user query.
|
||||
**[Arch]**: Found "diseases_symptoms" prompt_target in arch_config.yaml. Forward disease=diabeteres to "diseases_symptoms" prompt target
|
||||
...
|
||||
Assistant: Common symptoms include frequent urination, excessive thirst, fatigue, and blurry vision.
|
||||
|
||||
User: How is it diagnosed?
|
||||
**[Arch]**: New intent detected.
|
||||
**[Arch]**: Found "disease_diagnoses" prompt_target in arch_config.yaml. Forward disease=diabeteres to "disease_diagnoses" prompt target
|
||||
...
|
||||
Assistant: Diabetes is diagnosed through blood tests like fasting blood sugar, A1C, or an oral glucose tolerance test.
|
||||
|
||||
|
||||
Build Multi-Turn RAG Apps
|
||||
--------------------------
|
||||
The following section describes how you can easilly add support for multi-turn scenarios via Arch. You process and manage multi-turn prompts
|
||||
just like you manage single-turn ones. Arch handles the conpleixity of detecting the correct intent based on the last user prompt and
|
||||
the covnersational history, extracts relevant parameters needed by downstream APIs, and dipatches calls to any upstream LLMs to summarize the
|
||||
response from your APIs.
|
||||
|
||||
|
||||
.. _multi_turn_subsection_prompt_target:
|
||||
|
||||
Step 1: Define Arch Config
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: includes/multi_turn/prompt_targets_multi_turn.yaml
|
||||
:language: yaml
|
||||
:caption: Arch Config
|
||||
:linenos:
|
||||
|
||||
Step 2: Process Request in Flask
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Once the prompt targets are configured as above, handle parameters across multi-turn as if its a single-turn request
|
||||
|
||||
.. literalinclude:: includes/multi_turn/multi_turn_rag.py
|
||||
:language: python
|
||||
:caption: Parameter handling with Flask
|
||||
:linenos:
|
||||
|
||||
Demo App
|
||||
~~~~~~~~
|
||||
|
||||
For your convenience, we've built a `demo app <https://github.com/katanemo/archgw/main/demos/multi_turn_rag_agent>`_
|
||||
that you can test and modify locally for multi-turn RAG scenarios.
|
||||
|
||||
.. figure:: includes/multi_turn/mutli-turn-example.png
|
||||
:width: 100%
|
||||
:align: center
|
||||
|
||||
Example multi-turn user conversation showing adjusting retrieval
|
||||
|
|
@ -1,10 +1,18 @@
|
|||
.. _arch_rag_guide:
|
||||
|
||||
RAG Application
|
||||
===============
|
||||
RAG Apps
|
||||
========
|
||||
|
||||
The following section describes how Arch can help you build faster, smarter and more accurate
|
||||
Retrieval-Augmented Generation (RAG) applications.
|
||||
Retrieval-Augmented Generation (RAG) applications, including fast and accurate RAG in multi-turn
|
||||
converational scenarios.
|
||||
|
||||
What is Retrieval-Augmented Generation (RAG)?
|
||||
---------------------------------------------
|
||||
RAG applications combine retrieval-based methods with generative AI models to provide more accurate,
|
||||
contextually relevant, and reliable outputs. These applications leverage external data sources to augment
|
||||
the capabilities of Large Language Models (LLMs), enabling them to retrieve and integrate specific information
|
||||
rather than relying solely on the LLM's internal knowledge.
|
||||
|
||||
Parameter Extraction for RAG
|
||||
----------------------------
|
||||
|
|
@ -33,60 +41,12 @@ Once the prompt targets are configured as above, handling those parameters is
|
|||
:caption: Parameter handling with Flask
|
||||
:linenos:
|
||||
|
||||
[Coming Soon] `Drift Detection via Arch Intent-Markers <https://github.com/orgs/katanemo/projects/1/views/1?pane=issue&itemId=82697909>`_
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------
|
||||
Developers struggle to efficiently handle ``follow-up`` or ``clarification`` questions. Specifically, when users ask for
|
||||
changes or additions to previous responses their AI applications often generate entirely new responses instead of adjusting
|
||||
previous ones. Arch offers ``intent tracking`` as a feature so that developers can know when the user has shifted away from a
|
||||
previous intent so that they can dramatically improve retrieval accuracy, lower overall token cost and improve the speed of
|
||||
their responses back to users.
|
||||
Multi-Turn RAG (Follow-up Questions)
|
||||
-------------------------------------
|
||||
Developers often `struggle <https://www.reddit.com/r/LocalLLaMA/comments/18mqwg6/best_practice_for_rag_with_followup_chat/>`_ to efficiently handle
|
||||
``follow-up`` or ``clarification`` questions. Specifically, when users ask for changes or additions to previous responses, it requires developers to
|
||||
re-write prompts using LLMs with precise prompt engineering techniques. This process is slow, manual, error prone and adds signifcant latency to the
|
||||
user experience. Arch
|
||||
|
||||
Arch uses its built-in lightweight NLI and embedding models to know if the user has steered away from an active intent.
|
||||
Arch's intent-drift detection mechanism is based on its :ref:`prompt target <prompt_target>` primtive. Arch tries to match an incoming
|
||||
prompt to one of the prompt_targets configured in the gateway. Once it detects that the user has moved away from an active
|
||||
active intent, Arch adds the ``x-arch-intent-marker`` headers to the request before sending it your application servers.
|
||||
|
||||
.. literalinclude:: includes/rag/intent_detection.py
|
||||
:language: python
|
||||
:linenos:
|
||||
:lines: 101-157
|
||||
:emphasize-lines: 14-25
|
||||
:caption: Intent Detection Example
|
||||
|
||||
|
||||
.. Note::
|
||||
|
||||
Arch is (mostly) stateless so that it can scale in an embarrassingly parrallel fashion. So, while Arch offers
|
||||
intent-drift detetction, you still have to maintain converational state with intent drift as metadata. The
|
||||
following code snippets show how easily you can build and enrich conversational history with Langchain (in Python),
|
||||
so that you can use the most relevant prompts for your retrieval and for prompting upstream LLMs.
|
||||
|
||||
|
||||
Step 1: Define ConversationBufferMemory
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: includes/rag/intent_detection.py
|
||||
:language: python
|
||||
:linenos:
|
||||
:lines: 1-21
|
||||
|
||||
Step 2: Update ConversationBufferMemory with Intents
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: includes/rag/intent_detection.py
|
||||
:language: python
|
||||
:linenos:
|
||||
:lines: 24-64
|
||||
|
||||
Step 3: Get Messages based on latest drift
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: includes/rag/intent_detection.py
|
||||
:language: python
|
||||
:linenos:
|
||||
:lines: 67-80
|
||||
|
||||
|
||||
You can used the last set of messages that match to an intent to prompt an LLM, use it with an vector-DB for
|
||||
improved retrieval, etc. With Arch and a few lines of code, you can improve the retrieval accuracy, lower overall
|
||||
token cost and dramatically improve the speed of their responses back to users.
|
||||
Arch is highly capable of accurately detecting and processing prompts in a multi-turn scenarios so that you can buil fast and accurate RAG apps in
|
||||
minutes. For additional details on how to build multi-turn RAG applications please refer to our :ref:`multi-turn <arch_multi_turn_guide>` docs.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue