Mirror of https://github.com/trustgraph-ai/trustgraph.git (synced 2026-04-29 02:23:44 +02:00)
Feature/streaming llm phase 1 (#566)
* Tidy up duplicate tech specs in doc directory
* Streaming LLM text-completion service tech spec
* text-completion and prompt interfaces
* Streaming change applied to all LLMs, so far tested with VertexAI
* Skip Pinecone unit tests, upstream module issue is affecting things; tests are passing again
* Added agent streaming, not working and has broken tests
parent 943a9d83b0
commit 310a2deb06
44 changed files with 2684 additions and 937 deletions
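The diff below threads a `streaming` flag from the agent service down to the individual LLM back-ends and adds chunked AgentResponse messages. As a rough sketch (not part of the commit), here is how a client might consume those chunks; the field names `chunk_type`, `content`, `error` and `end_of_dialog` come from the diff, while `subscribe_agent_responses` is a hypothetical helper.

```python
# Hypothetical consumer sketch: the AgentResponse fields are taken from this
# change set; subscribe_agent_responses is an assumed helper, not part of it.
import asyncio

async def print_agent_stream(subscribe_agent_responses, request_id):
    thought, answer = [], []
    async for resp in subscribe_agent_responses(request_id):
        if getattr(resp, "error", None):
            raise RuntimeError(resp.error.message)
        if resp.chunk_type == "thought":
            thought.append(resp.content)
        elif resp.chunk_type == "answer":
            answer.append(resp.content)
        if resp.end_of_dialog:
            break
    print("Thought:", "".join(thought))
    print("Answer:", "".join(answer))
```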
@@ -2,6 +2,7 @@
import logging
import json
import re
import asyncio

from . types import Action, Final
@ -169,7 +170,7 @@ class AgentManager:
|
|||
|
||||
raise ValueError(f"Could not parse response: {text}")
|
||||
|
||||
async def reason(self, question, history, context):
|
||||
async def reason(self, question, history, context, streaming=False, think=None, observe=None, answer=None):
|
||||
|
||||
logger.debug(f"calling reason: {question}")
|
||||
|
||||
|
|
@ -219,25 +220,62 @@ class AgentManager:
|
|||
|
||||
logger.info(f"prompt: {variables}")
|
||||
|
||||
# Get text response from prompt service
|
||||
response_text = await context("prompt-request").agent_react(variables)
|
||||
# Streaming path - use StreamingReActParser
|
||||
if streaming and think:
|
||||
from .streaming_parser import StreamingReActParser
|
||||
|
||||
logger.debug(f"Response text:\n{response_text}")
|
||||
# Create parser with streaming callbacks
|
||||
# Thought chunks go to think(), answer chunks go to answer()
|
||||
parser = StreamingReActParser(
|
||||
on_thought_chunk=lambda chunk: asyncio.create_task(think(chunk)),
|
||||
on_answer_chunk=lambda chunk: asyncio.create_task(answer(chunk) if answer else think(chunk)),
|
||||
)
|
||||
|
||||
logger.info(f"response: {response_text}")
|
||||
# Create async chunk callback that feeds parser
|
||||
async def on_chunk(text):
|
||||
parser.feed(text)
|
||||
|
||||
# Get streaming response
|
||||
response_text = await context("prompt-request").agent_react(
|
||||
variables=variables,
|
||||
streaming=True,
|
||||
chunk_callback=on_chunk
|
||||
)
|
||||
|
||||
# Finalize parser
|
||||
parser.finalize()
|
||||
|
||||
# Get result
|
||||
result = parser.get_result()
|
||||
if result is None:
|
||||
raise RuntimeError("Parser failed to produce a result")
|
||||
|
||||
# Parse the text response
|
||||
try:
|
||||
result = self.parse_react_response(response_text)
|
||||
logger.info(f"Parsed result: {result}")
|
||||
return result
|
||||
except ValueError as e:
|
||||
logger.error(f"Failed to parse response: {e}")
|
||||
# Try to provide a helpful error message
|
||||
logger.error(f"Response was: {response_text}")
|
||||
raise RuntimeError(f"Failed to parse agent response: {e}")
|
||||
|
||||
async def react(self, question, history, think, observe, context):
|
||||
else:
|
||||
# Non-streaming path - get complete text and parse
|
||||
response_text = await context("prompt-request").agent_react(
|
||||
variables=variables,
|
||||
streaming=False
|
||||
)
|
||||
|
||||
logger.debug(f"Response text:\n{response_text}")
|
||||
|
||||
logger.info(f"response: {response_text}")
|
||||
|
||||
# Parse the text response
|
||||
try:
|
||||
result = self.parse_react_response(response_text)
|
||||
logger.info(f"Parsed result: {result}")
|
||||
return result
|
||||
except ValueError as e:
|
||||
logger.error(f"Failed to parse response: {e}")
|
||||
# Try to provide a helpful error message
|
||||
logger.error(f"Response was: {response_text}")
|
||||
raise RuntimeError(f"Failed to parse agent response: {e}")
|
||||
|
||||
async def react(self, question, history, think, observe, context, streaming=False, answer=None):
|
||||
|
||||
logger.info(f"question: {question}")
|
||||
|
||||
|
|
@ -245,17 +283,27 @@ class AgentManager:
|
|||
question = question,
|
||||
history = history,
|
||||
context = context,
|
||||
streaming = streaming,
|
||||
think = think,
|
||||
observe = observe,
|
||||
answer = answer,
|
||||
)
|
||||
logger.info(f"act: {act}")
|
||||
|
||||
if isinstance(act, Final):
|
||||
|
||||
await think(act.thought)
|
||||
# In non-streaming mode, send complete thought
|
||||
# In streaming mode, thoughts were already sent as chunks
|
||||
if not streaming:
|
||||
await think(act.thought)
|
||||
return act
|
||||
|
||||
else:
|
||||
|
||||
await think(act.thought)
|
||||
# In non-streaming mode, send complete thought
|
||||
# In streaming mode, thoughts were already sent as chunks
|
||||
if not streaming:
|
||||
await think(act.thought)
|
||||
|
||||
logger.debug(f"ACTION: {act.name}")
|
||||
|
||||
|
|
|
|||
|
|
@ -191,6 +191,9 @@ class Processor(AgentService):
|
|||
|
||||
try:
|
||||
|
||||
# Check if streaming is enabled
|
||||
streaming = getattr(request, 'streaming', False)
|
||||
|
||||
if request.history:
|
||||
history = [
|
||||
Action(
|
||||
|
|
@ -215,12 +218,27 @@ class Processor(AgentService):
|
|||
|
||||
logger.debug(f"Think: {x}")
|
||||
|
||||
r = AgentResponse(
|
||||
answer=None,
|
||||
error=None,
|
||||
thought=x,
|
||||
observation=None,
|
||||
)
|
||||
if streaming:
|
||||
# Streaming format
|
||||
r = AgentResponse(
|
||||
chunk_type="thought",
|
||||
content=x,
|
||||
end_of_message=True,
|
||||
end_of_dialog=False,
|
||||
# Legacy fields for backward compatibility
|
||||
answer=None,
|
||||
error=None,
|
||||
thought=x,
|
||||
observation=None,
|
||||
)
|
||||
else:
|
||||
# Legacy format
|
||||
r = AgentResponse(
|
||||
answer=None,
|
||||
error=None,
|
||||
thought=x,
|
||||
observation=None,
|
||||
)
|
||||
|
||||
await respond(r)
|
||||
|
||||
|
|
@ -228,12 +246,55 @@ class Processor(AgentService):
|
|||
|
||||
logger.debug(f"Observe: {x}")
|
||||
|
||||
r = AgentResponse(
|
||||
answer=None,
|
||||
error=None,
|
||||
thought=None,
|
||||
observation=x,
|
||||
)
|
||||
if streaming:
|
||||
# Streaming format
|
||||
r = AgentResponse(
|
||||
chunk_type="observation",
|
||||
content=x,
|
||||
end_of_message=True,
|
||||
end_of_dialog=False,
|
||||
# Legacy fields for backward compatibility
|
||||
answer=None,
|
||||
error=None,
|
||||
thought=None,
|
||||
observation=x,
|
||||
)
|
||||
else:
|
||||
# Legacy format
|
||||
r = AgentResponse(
|
||||
answer=None,
|
||||
error=None,
|
||||
thought=None,
|
||||
observation=x,
|
||||
)
|
||||
|
||||
await respond(r)
|
||||
|
||||
async def answer(x):
|
||||
|
||||
logger.debug(f"Answer: {x}")
|
||||
|
||||
if streaming:
|
||||
# Streaming format
|
||||
r = AgentResponse(
|
||||
chunk_type="answer",
|
||||
content=x,
|
||||
end_of_message=False, # More chunks may follow
|
||||
end_of_dialog=False,
|
||||
# Legacy fields for backward compatibility
|
||||
answer=None,
|
||||
error=None,
|
||||
thought=None,
|
||||
observation=None,
|
||||
)
|
||||
else:
|
||||
# Legacy format - shouldn't be called in non-streaming mode
|
||||
r = AgentResponse(
|
||||
answer=x,
|
||||
error=None,
|
||||
thought=None,
|
||||
observation=None,
|
||||
)
|
||||
|
||||
await respond(r)
|
||||
|
||||
|
|
@ -273,7 +334,9 @@ class Processor(AgentService):
|
|||
history = history,
|
||||
think = think,
|
||||
observe = observe,
|
||||
answer = answer,
|
||||
context = UserAwareContext(flow, request.user),
|
||||
streaming = streaming,
|
||||
)
|
||||
|
||||
logger.debug(f"Action: {act}")
|
||||
|
|
@ -287,11 +350,26 @@ class Processor(AgentService):
|
|||
else:
|
||||
f = json.dumps(act.final)
|
||||
|
||||
r = AgentResponse(
|
||||
answer=act.final,
|
||||
error=None,
|
||||
thought=None,
|
||||
)
|
||||
if streaming:
|
||||
# Streaming format - send end-of-dialog marker
|
||||
# Answer chunks were already sent via the answer/think callbacks during parsing
|
||||
r = AgentResponse(
|
||||
chunk_type="answer",
|
||||
content="", # Empty content, just marking end of dialog
|
||||
end_of_message=True,
|
||||
end_of_dialog=True,
|
||||
# Legacy fields for backward compatibility
|
||||
answer=act.final,
|
||||
error=None,
|
||||
thought=None,
|
||||
)
|
||||
else:
|
||||
# Legacy format - send complete answer
|
||||
r = AgentResponse(
|
||||
answer=act.final,
|
||||
error=None,
|
||||
thought=None,
|
||||
)
|
||||
|
||||
await respond(r)
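For reference, the message sequence a streaming client should expect from the callbacks above looks roughly like the sketch below; the field values mirror the AgentResponse constructions in this file, and the content strings are invented.

```python
# Illustrative only: one streamed dialog, based on the fields set in the
# think/observe/answer callbacks and the end-of-dialog marker above.
expected_sequence = [
    {"chunk_type": "thought",     "content": "I should look this up",  "end_of_message": True,  "end_of_dialog": False},
    {"chunk_type": "observation", "content": "<tool output>",          "end_of_message": True,  "end_of_dialog": False},
    {"chunk_type": "answer",      "content": "The answer is ...",      "end_of_message": False, "end_of_dialog": False},
    {"chunk_type": "answer",      "content": "",                       "end_of_message": True,  "end_of_dialog": True},
]
```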
|
||||
|
||||
|
|
@ -321,7 +399,9 @@ class Processor(AgentService):
|
|||
observation=h.observation
|
||||
)
|
||||
for h in history
|
||||
]
|
||||
],
|
||||
user=request.user,
|
||||
streaming=streaming,
|
||||
)
|
||||
|
||||
await next(r)
|
||||
|
|
@ -336,14 +416,32 @@ class Processor(AgentService):
|
|||
|
||||
logger.debug("Send error response...")
|
||||
|
||||
r = AgentResponse(
|
||||
error=Error(
|
||||
type = "agent-error",
|
||||
message = str(e),
|
||||
),
|
||||
response=None,
|
||||
error_obj = Error(
|
||||
type = "agent-error",
|
||||
message = str(e),
|
||||
)
|
||||
|
||||
# Check if streaming was enabled (may not be set if error occurred early)
|
||||
streaming = getattr(request, 'streaming', False) if 'request' in locals() else False
|
||||
|
||||
if streaming:
|
||||
# Streaming format
|
||||
r = AgentResponse(
|
||||
chunk_type="error",
|
||||
content=str(e),
|
||||
end_of_message=True,
|
||||
end_of_dialog=True,
|
||||
# Legacy fields for backward compatibility
|
||||
error=error_obj,
|
||||
response=None,
|
||||
)
|
||||
else:
|
||||
# Legacy format
|
||||
r = AgentResponse(
|
||||
error=error_obj,
|
||||
response=None,
|
||||
)
|
||||
|
||||
await respond(r)
|
||||
|
||||
@staticmethod
|
||||
|
|
trustgraph-flow/trustgraph/agent/react/streaming_parser.py (new file, 339 lines)
@@ -0,0 +1,339 @@
"""
Streaming parser for ReAct responses.

This parser handles text chunks from LLM streaming responses and parses them
into ReAct format (Thought/Action/Args or Thought/Final Answer). It maintains
state across chunk boundaries to handle cases where delimiters or JSON are split.

Key challenges:
- Delimiters may be split across chunks: "Tho" + "ught:" or "Final An" + "swer:"
- JSON arguments may be split: '{"loc' + 'ation": "NYC"}'
- Need to emit thought/answer chunks as they arrive for streaming
"""

import json
|
||||
import logging
|
||||
import re
|
||||
from enum import Enum
|
||||
from typing import Optional, Callable, Any
|
||||
from . types import Action, Final
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ParserState(Enum):
|
||||
"""States for the streaming ReAct parser state machine"""
|
||||
INITIAL = "initial" # Waiting for first content
|
||||
THOUGHT = "thought" # Accumulating thought content
|
||||
ACTION = "action" # Found "Action:", collecting action name
|
||||
ARGS = "args" # Found "Args:", collecting JSON arguments
|
||||
FINAL_ANSWER = "final_answer" # Found "Final Answer:", collecting answer
|
||||
COMPLETE = "complete" # Parsing complete, object ready
|
||||
|
||||
|
||||
class StreamingReActParser:
|
||||
"""
|
||||
Stateful parser for streaming ReAct responses.
|
||||
|
||||
Expected format:
|
||||
Thought: [reasoning about what to do next]
|
||||
Action: [tool_name]
|
||||
Args: {
|
||||
"param": "value"
|
||||
}
|
||||
|
||||
OR
|
||||
Thought: [reasoning about the final answer]
|
||||
Final Answer: [the answer]
|
||||
|
||||
Usage:
|
||||
parser = StreamingReActParser(
|
||||
on_thought_chunk=lambda chunk: print(f"Thought: {chunk}"),
|
||||
on_answer_chunk=lambda chunk: print(f"Answer: {chunk}"),
|
||||
)
|
||||
|
||||
for chunk in llm_stream:
|
||||
parser.feed(chunk)
|
||||
if parser.is_complete():
|
||||
result = parser.get_result()
|
||||
break
|
||||
"""
|
||||
|
||||
# Delimiters we're looking for
|
||||
THOUGHT_DELIMITER = "Thought:"
|
||||
ACTION_DELIMITER = "Action:"
|
||||
ARGS_DELIMITER = "Args:"
|
||||
FINAL_ANSWER_DELIMITER = "Final Answer:"
|
||||
|
||||
# Maximum buffer size for delimiter detection (longest delimiter + safety margin)
|
||||
MAX_DELIMITER_BUFFER = 20
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
on_thought_chunk: Optional[Callable[[str], Any]] = None,
|
||||
on_answer_chunk: Optional[Callable[[str], Any]] = None,
|
||||
):
|
||||
"""
|
||||
Initialize streaming parser.
|
||||
|
||||
Args:
|
||||
on_thought_chunk: Callback for thought text chunks as they arrive
|
||||
on_answer_chunk: Callback for final answer text chunks as they arrive
|
||||
"""
|
||||
self.on_thought_chunk = on_thought_chunk
|
||||
self.on_answer_chunk = on_answer_chunk
|
||||
|
||||
# Parser state
|
||||
self.state = ParserState.INITIAL
|
||||
|
||||
# Buffers for accumulating content
|
||||
self.line_buffer = "" # For detecting delimiters across chunk boundaries
|
||||
self.thought_buffer = "" # Accumulated thought text
|
||||
self.action_buffer = "" # Action name
|
||||
self.args_buffer = "" # JSON arguments text
|
||||
self.answer_buffer = "" # Final answer text
|
||||
|
||||
# JSON parsing state for Args
|
||||
self.brace_count = 0
|
||||
self.args_started = False
|
||||
|
||||
# Result object (Action or Final)
|
||||
self.result = None
|
||||
|
||||
def feed(self, chunk: str) -> None:
|
||||
"""
|
||||
Feed a text chunk to the parser.
|
||||
|
||||
Args:
|
||||
chunk: Text chunk from LLM stream
|
||||
"""
|
||||
if self.state == ParserState.COMPLETE:
|
||||
return # Already complete, ignore further chunks
|
||||
|
||||
# Add chunk to line buffer for delimiter detection
|
||||
self.line_buffer += chunk
|
||||
|
||||
# Remove markdown code blocks if present
|
||||
self.line_buffer = re.sub(r'^```[^\n]*\n', '', self.line_buffer)
|
||||
self.line_buffer = re.sub(r'\n```$', '', self.line_buffer)
|
||||
|
||||
# Process based on current state
|
||||
while self.line_buffer and self.state != ParserState.COMPLETE:
|
||||
if self.state == ParserState.INITIAL:
|
||||
self._process_initial()
|
||||
elif self.state == ParserState.THOUGHT:
|
||||
self._process_thought()
|
||||
elif self.state == ParserState.ACTION:
|
||||
self._process_action()
|
||||
elif self.state == ParserState.ARGS:
|
||||
self._process_args()
|
||||
elif self.state == ParserState.FINAL_ANSWER:
|
||||
self._process_final_answer()
|
||||
|
||||
def _process_initial(self) -> None:
|
||||
"""Process INITIAL state - looking for 'Thought:' delimiter"""
|
||||
idx = self.line_buffer.find(self.THOUGHT_DELIMITER)
|
||||
|
||||
if idx >= 0:
|
||||
# Found thought delimiter
|
||||
# Discard any content before it
|
||||
self.line_buffer = self.line_buffer[idx + len(self.THOUGHT_DELIMITER):]
|
||||
self.state = ParserState.THOUGHT
|
||||
elif len(self.line_buffer) >= self.MAX_DELIMITER_BUFFER:
|
||||
# Buffer getting too large, probably junk before thought
|
||||
# Keep only the tail that might contain partial delimiter
|
||||
self.line_buffer = self.line_buffer[-self.MAX_DELIMITER_BUFFER:]
|
||||
|
||||
def _process_thought(self) -> None:
|
||||
"""Process THOUGHT state - accumulating thought content"""
|
||||
# Check for Action or Final Answer delimiter
|
||||
action_idx = self.line_buffer.find(self.ACTION_DELIMITER)
|
||||
final_idx = self.line_buffer.find(self.FINAL_ANSWER_DELIMITER)
|
||||
|
||||
# Find which delimiter comes first (if any)
|
||||
next_delimiter_idx = -1
|
||||
next_state = None
|
||||
|
||||
if action_idx >= 0 and (final_idx < 0 or action_idx < final_idx):
|
||||
next_delimiter_idx = action_idx
|
||||
next_state = ParserState.ACTION
|
||||
delimiter_len = len(self.ACTION_DELIMITER)
|
||||
elif final_idx >= 0:
|
||||
next_delimiter_idx = final_idx
|
||||
next_state = ParserState.FINAL_ANSWER
|
||||
delimiter_len = len(self.FINAL_ANSWER_DELIMITER)
|
||||
|
||||
if next_delimiter_idx >= 0:
|
||||
# Found next delimiter
|
||||
thought_chunk = self.line_buffer[:next_delimiter_idx].strip()
|
||||
if thought_chunk:
|
||||
self.thought_buffer += thought_chunk
|
||||
if self.on_thought_chunk:
|
||||
self.on_thought_chunk(thought_chunk)
|
||||
|
||||
self.line_buffer = self.line_buffer[next_delimiter_idx + delimiter_len:]
|
||||
self.state = next_state
|
||||
else:
|
||||
# No delimiter found yet
|
||||
# Keep tail in buffer (might contain partial delimiter)
|
||||
# Emit the rest as thought chunk
|
||||
if len(self.line_buffer) > self.MAX_DELIMITER_BUFFER:
|
||||
emittable = self.line_buffer[:-self.MAX_DELIMITER_BUFFER]
|
||||
self.thought_buffer += emittable
|
||||
if self.on_thought_chunk:
|
||||
self.on_thought_chunk(emittable)
|
||||
self.line_buffer = self.line_buffer[-self.MAX_DELIMITER_BUFFER:]
|
||||
|
||||
def _process_action(self) -> None:
|
||||
"""Process ACTION state - collecting action name"""
|
||||
# Action name is on one line (or at least until newline or Args:)
|
||||
newline_idx = self.line_buffer.find('\n')
|
||||
args_idx = self.line_buffer.find(self.ARGS_DELIMITER)
|
||||
|
||||
# Find which comes first
|
||||
if args_idx >= 0 and (newline_idx < 0 or args_idx < newline_idx):
|
||||
# Args delimiter found first
|
||||
self.action_buffer = self.line_buffer[:args_idx].strip().strip('"')
|
||||
self.line_buffer = self.line_buffer[args_idx + len(self.ARGS_DELIMITER):]
|
||||
self.state = ParserState.ARGS
|
||||
elif newline_idx >= 0:
|
||||
# Newline found, action name complete
|
||||
self.action_buffer = self.line_buffer[:newline_idx].strip().strip('"')
|
||||
self.line_buffer = self.line_buffer[newline_idx + 1:]
|
||||
# Stay in ACTION state or move to ARGS if we find delimiter
|
||||
# Check whether the next line starts with the Args: delimiter
|
||||
if self.line_buffer.lstrip().startswith(self.ARGS_DELIMITER):
|
||||
args_start = self.line_buffer.find(self.ARGS_DELIMITER)
|
||||
self.line_buffer = self.line_buffer[args_start + len(self.ARGS_DELIMITER):]
|
||||
self.state = ParserState.ARGS
|
||||
else:
|
||||
# Not enough content yet, keep buffering
|
||||
# But if buffer is getting large, action name is probably complete
|
||||
if len(self.line_buffer) > 100:
|
||||
self.action_buffer = self.line_buffer.strip().strip('"')
|
||||
self.line_buffer = ""
|
||||
# Assume Args comes next, but we need more content
|
||||
self.state = ParserState.ARGS
|
||||
|
||||
def _process_args(self) -> None:
|
||||
"""Process ARGS state - collecting JSON arguments"""
|
||||
# Process character by character to track brace matching
|
||||
i = 0
|
||||
while i < len(self.line_buffer):
|
||||
char = self.line_buffer[i]
|
||||
self.args_buffer += char
|
||||
|
||||
if char == '{':
|
||||
self.brace_count += 1
|
||||
self.args_started = True
|
||||
elif char == '}':
|
||||
self.brace_count -= 1
|
||||
|
||||
# Check if JSON is complete
|
||||
if self.args_started and self.brace_count == 0:
|
||||
# JSON complete, try to parse
|
||||
try:
|
||||
args_dict = json.loads(self.args_buffer.strip())
|
||||
# Success! Create Action result
|
||||
self.result = Action(
|
||||
thought=self.thought_buffer.strip(),
|
||||
name=self.action_buffer,
|
||||
arguments=args_dict,
|
||||
observation=""
|
||||
)
|
||||
self.state = ParserState.COMPLETE
|
||||
self.line_buffer = "" # Clear buffer
|
||||
return
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse JSON args: {self.args_buffer}")
|
||||
raise ValueError(f"Invalid JSON in Args: {e}")
|
||||
|
||||
i += 1
|
||||
|
||||
# Consumed entire buffer, clear it and wait for more chunks
|
||||
self.line_buffer = ""
|
||||
|
||||
def _process_final_answer(self) -> None:
|
||||
"""Process FINAL_ANSWER state - collecting final answer"""
|
||||
# For final answer, we consume everything until we decide we're done
|
||||
# In streaming mode, we can't know when answer is complete until stream ends
|
||||
# So we emit chunks and accumulate
|
||||
|
||||
# Check if this might be JSON
|
||||
is_json = self.answer_buffer.strip().startswith('{') or \
|
||||
self.line_buffer.strip().startswith('{')
|
||||
|
||||
if is_json:
|
||||
# Handle JSON final answer
|
||||
self.answer_buffer += self.line_buffer
|
||||
|
||||
# Count braces to detect completion
|
||||
brace_count = self.answer_buffer.count('{') - self.answer_buffer.count('}')
|
||||
|
||||
if brace_count == 0 and '{' in self.answer_buffer:
|
||||
# JSON might be complete
|
||||
# Note: We can't be 100% sure without trying to parse
|
||||
# But in streaming mode, we'll finish when stream ends
|
||||
pass
|
||||
|
||||
# Emit chunk
|
||||
if self.on_answer_chunk:
|
||||
self.on_answer_chunk(self.line_buffer)
|
||||
|
||||
self.line_buffer = ""
|
||||
else:
|
||||
# Regular text answer - emit everything
|
||||
if self.line_buffer:
|
||||
self.answer_buffer += self.line_buffer
|
||||
if self.on_answer_chunk:
|
||||
self.on_answer_chunk(self.line_buffer)
|
||||
self.line_buffer = ""
|
||||
|
||||
def finalize(self) -> None:
|
||||
"""
|
||||
Call this when the stream is complete to finalize parsing.
|
||||
This handles any remaining buffered content.
|
||||
"""
|
||||
if self.state == ParserState.COMPLETE:
|
||||
return
|
||||
|
||||
# Flush any remaining thought chunks
|
||||
if self.state == ParserState.THOUGHT and self.line_buffer:
|
||||
self.thought_buffer += self.line_buffer
|
||||
if self.on_thought_chunk:
|
||||
self.on_thought_chunk(self.line_buffer)
|
||||
self.line_buffer = ""
|
||||
|
||||
# Finalize final answer
|
||||
if self.state == ParserState.FINAL_ANSWER:
|
||||
# Flush any remaining answer content
|
||||
if self.line_buffer:
|
||||
self.answer_buffer += self.line_buffer
|
||||
if self.on_answer_chunk:
|
||||
self.on_answer_chunk(self.line_buffer)
|
||||
self.line_buffer = ""
|
||||
|
||||
# Create Final result
|
||||
self.result = Final(
|
||||
thought=self.thought_buffer.strip(),
|
||||
final=self.answer_buffer.strip()
|
||||
)
|
||||
self.state = ParserState.COMPLETE
|
||||
|
||||
# If we're in other states, something went wrong
|
||||
if self.state not in [ParserState.COMPLETE, ParserState.FINAL_ANSWER]:
|
||||
if self.thought_buffer:
|
||||
raise ValueError(
|
||||
f"Stream ended in {self.state.value} state with incomplete parsing. "
|
||||
f"Thought: {self.thought_buffer[:100]}..."
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Stream ended in {self.state.value} state with no content")
|
||||
|
||||
def is_complete(self) -> bool:
|
||||
"""Check if parsing is complete"""
|
||||
return self.state == ParserState.COMPLETE
|
||||
|
||||
def get_result(self) -> Optional[Action | Final]:
|
||||
"""Get the parsed result (Action or Final)"""
|
||||
return self.result
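A minimal usage sketch of the parser above, following the calling pattern from the class docstring. The commit message flags agent streaming as still in progress, so treat this as the intended pattern rather than a verified end-to-end run; the chunk strings are invented.

```python
# Usage sketch for StreamingReActParser (assumes it and the Final type are
# importable from the module above).
chunks = [
    "Thought:",
    " No tool is needed here.\nFinal Answer: Hello!",
]

parser = StreamingReActParser(
    on_thought_chunk=lambda c: print("thought:", c),
    on_answer_chunk=lambda c: print("answer:", c),
)

for c in chunks:
    parser.feed(c)

parser.finalize()             # flushes buffers and builds the result
result = parser.get_result()  # Final(thought=..., final="Hello!")
```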
|
||||
|
|
@ -34,9 +34,9 @@ class BlobStore:
|
|||
def ensure_bucket(self):
|
||||
|
||||
# Make the bucket if it doesn't exist.
|
||||
found = self.minio.bucket_exists(self.bucket_name)
|
||||
found = self.minio.bucket_exists(bucket_name=self.bucket_name)
|
||||
if not found:
|
||||
self.minio.make_bucket(self.bucket_name)
|
||||
self.minio.make_bucket(bucket_name=self.bucket_name)
|
||||
logger.info(f"Created bucket {self.bucket_name}")
|
||||
else:
|
||||
logger.debug(f"Bucket {self.bucket_name} already exists")
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import os
|
|||
import logging
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -55,7 +55,7 @@ class Processor(LlmService):
|
|||
self.max_output = max_output
|
||||
self.default_model = model
|
||||
|
||||
def build_prompt(self, system, content, temperature=None):
|
||||
def build_prompt(self, system, content, temperature=None, stream=False):
|
||||
# Use provided temperature or fall back to default
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
|
|
@ -73,6 +73,9 @@ class Processor(LlmService):
|
|||
"top_p": 1
|
||||
}
|
||||
|
||||
if stream:
|
||||
data["stream"] = True
|
||||
|
||||
body = json.dumps(data)
|
||||
|
||||
return body
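The only functional change here is the extra "stream" key; roughly, the serialized body looks like the sketch below. Only "top_p" and "stream" are visible in this hunk, so the remaining fields are assumptions about the rest of the method.

```python
# Illustrative shape of the request body returned by build_prompt() when
# stream=True; fields other than "top_p" and "stream" are assumed.
example_body = {
    # ... system/user messages, temperature, max_tokens built earlier ...
    "top_p": 1,
    "stream": True,  # new: ask the serverless endpoint for an SSE stream
}
```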
|
||||
|
|
@ -157,6 +160,84 @@ class Processor(LlmService):
|
|||
|
||||
logger.debug("Azure LLM processing complete")
|
||||
|
||||
def supports_streaming(self):
|
||||
"""Azure serverless endpoints support streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from Azure serverless endpoint"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
try:
|
||||
body = self.build_prompt(system, prompt, effective_temperature, stream=True)
|
||||
|
||||
url = self.endpoint
|
||||
api_key = self.token
|
||||
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {api_key}'
|
||||
}
|
||||
|
||||
response = requests.post(url, data=body, headers=headers, stream=True)
|
||||
|
||||
if response.status_code == 429:
|
||||
raise TooManyRequests()
|
||||
|
||||
if response.status_code != 200:
|
||||
raise RuntimeError("LLM failure")
|
||||
|
||||
# Parse SSE stream
|
||||
for line in response.iter_lines():
|
||||
if line:
|
||||
line = line.decode('utf-8').strip()
|
||||
if line.startswith('data: '):
|
||||
data = line[6:] # Remove 'data: ' prefix
|
||||
|
||||
if data == '[DONE]':
|
||||
break
|
||||
|
||||
try:
|
||||
chunk_data = json.loads(data)
|
||||
|
||||
if 'choices' in chunk_data and len(chunk_data['choices']) > 0:
|
||||
delta = chunk_data['choices'][0].get('delta', {})
|
||||
content = delta.get('content')
|
||||
if content:
|
||||
yield LlmChunk(
|
||||
text=content,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(f"Failed to parse chunk: {data}")
|
||||
continue
|
||||
|
||||
# Send final chunk
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except TooManyRequests:
|
||||
logger.warning("Rate limit exceeded during streaming")
|
||||
raise TooManyRequests()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Azure streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
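Each provider now pairs supports_streaming() with an async generator generate_content_stream() that yields LlmChunk objects and ends with an empty is_final chunk. A hedged sketch of how a caller could drain it; the driver function is hypothetical, only the chunk fields come from this diff.

```python
# Hypothetical caller sketch: drains generate_content_stream() as defined
# above; `processor` is an instance of this Processor class.
async def collect_stream(processor, system, prompt):
    parts = []
    usage = None
    async for chunk in processor.generate_content_stream(system, prompt):
        if chunk.is_final:
            usage = (chunk.in_token, chunk.out_token)  # may be None for this backend
        elif chunk.text:
            parts.append(chunk.text)
    return "".join(parts), usage
```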
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
default_ident = "text-completion"
|
||||
|
||||
|
|
@ -125,6 +125,75 @@ class Processor(LlmService):
|
|||
|
||||
logger.debug("Azure OpenAI LLM processing complete")
|
||||
|
||||
def supports_streaming(self):
|
||||
"""Azure OpenAI supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""
|
||||
Stream content generation from Azure OpenAI.
|
||||
Yields LlmChunk objects with is_final=True on the last chunk.
|
||||
"""
|
||||
# Use provided model or fall back to default
|
||||
model_name = model or self.default_model
|
||||
# Use provided temperature or fall back to default
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
prompt = system + "\n\n" + prompt
|
||||
|
||||
try:
|
||||
response = self.openai.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
temperature=effective_temperature,
|
||||
max_tokens=self.max_output,
|
||||
top_p=1,
|
||||
stream=True # Enable streaming
|
||||
)
|
||||
|
||||
# Stream chunks
|
||||
for chunk in response:
|
||||
if chunk.choices and chunk.choices[0].delta.content:
|
||||
yield LlmChunk(
|
||||
text=chunk.choices[0].delta.content,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
|
||||
# Send final chunk
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except RateLimitError:
|
||||
logger.warning("Rate limit exceeded during streaming")
|
||||
raise TooManyRequests()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Azure OpenAI streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import os
|
|||
import logging
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -106,6 +106,65 @@ class Processor(LlmService):
|
|||
logger.error(f"Claude LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""Claude/Anthropic supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from Claude"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
try:
|
||||
with self.claude.messages.stream(
|
||||
model=model_name,
|
||||
max_tokens=self.max_output,
|
||||
temperature=effective_temperature,
|
||||
system=system,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
) as stream:
|
||||
for text in stream.text_stream:
|
||||
yield LlmChunk(
|
||||
text=text,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
|
||||
# Get final message for token counts
|
||||
final_message = stream.get_final_message()
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=final_message.usage.input_tokens,
|
||||
out_token=final_message.usage.output_tokens,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except anthropic.RateLimitError:
|
||||
logger.warning("Rate limit exceeded during streaming")
|
||||
raise TooManyRequests()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Claude streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
default_ident = "text-completion"
|
||||
|
||||
|
|
@ -98,6 +98,68 @@ class Processor(LlmService):
|
|||
logger.error(f"Cohere LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""Cohere supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from Cohere"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
try:
|
||||
stream = self.cohere.chat_stream(
|
||||
model=model_name,
|
||||
message=prompt,
|
||||
preamble=system,
|
||||
temperature=effective_temperature,
|
||||
chat_history=[],
|
||||
prompt_truncation='auto',
|
||||
connectors=[]
|
||||
)
|
||||
|
||||
total_input_tokens = 0
|
||||
total_output_tokens = 0
|
||||
|
||||
for event in stream:
|
||||
if event.event_type == "text-generation":
|
||||
if hasattr(event, 'text') and event.text:
|
||||
yield LlmChunk(
|
||||
text=event.text,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
elif event.event_type == "stream-end":
|
||||
# Extract token counts from final event
|
||||
if hasattr(event, 'response') and hasattr(event.response, 'meta'):
|
||||
if hasattr(event.response.meta, 'billed_units'):
|
||||
total_input_tokens = int(event.response.meta.billed_units.input_tokens)
|
||||
total_output_tokens = int(event.response.meta.billed_units.output_tokens)
|
||||
|
||||
# Send final chunk with token counts
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=total_input_tokens,
|
||||
out_token=total_output_tokens,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except cohere.TooManyRequestsError:
|
||||
logger.warning("Rate limit exceeded during streaming")
|
||||
raise TooManyRequests()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Cohere streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
default_ident = "text-completion"
|
||||
|
||||
|
|
@ -159,6 +159,67 @@ class Processor(LlmService):
|
|||
logger.error(f"GoogleAIStudio LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""Google AI Studio supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from Google AI Studio"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
generation_config = self._get_or_create_config(model_name, effective_temperature)
|
||||
generation_config.system_instruction = system
|
||||
|
||||
try:
|
||||
response = self.client.models.generate_content_stream(
|
||||
model=model_name,
|
||||
config=generation_config,
|
||||
contents=prompt,
|
||||
)
|
||||
|
||||
total_input_tokens = 0
|
||||
total_output_tokens = 0
|
||||
|
||||
for chunk in response:
|
||||
if hasattr(chunk, 'text') and chunk.text:
|
||||
yield LlmChunk(
|
||||
text=chunk.text,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
|
||||
# Accumulate token counts if available
|
||||
if hasattr(chunk, 'usage_metadata'):
|
||||
if hasattr(chunk.usage_metadata, 'prompt_token_count'):
|
||||
total_input_tokens = int(chunk.usage_metadata.prompt_token_count)
|
||||
if hasattr(chunk.usage_metadata, 'candidates_token_count'):
|
||||
total_output_tokens = int(chunk.usage_metadata.candidates_token_count)
|
||||
|
||||
# Send final chunk with token counts
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=total_input_tokens,
|
||||
out_token=total_output_tokens,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except ResourceExhausted:
|
||||
logger.warning("Rate limit exceeded during streaming")
|
||||
raise TooManyRequests()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"GoogleAIStudio streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
default_ident = "text-completion"
|
||||
|
||||
|
|
@ -102,6 +102,57 @@ class Processor(LlmService):
|
|||
logger.error(f"Llamafile LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""LlamaFile supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from LlamaFile"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
prompt = system + "\n\n" + prompt
|
||||
|
||||
try:
|
||||
response = self.openai.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
temperature=effective_temperature,
|
||||
max_tokens=self.max_output,
|
||||
top_p=1,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=0,
|
||||
response_format={"type": "text"},
|
||||
stream=True
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
if chunk.choices and chunk.choices[0].delta.content:
|
||||
yield LlmChunk(
|
||||
text=chunk.choices[0].delta.content,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"LlamaFile streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
default_ident = "text-completion"
|
||||
|
||||
|
|
@ -106,6 +106,57 @@ class Processor(LlmService):
|
|||
logger.error(f"LMStudio LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""LM Studio supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from LM Studio"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
prompt = system + "\n\n" + prompt
|
||||
|
||||
try:
|
||||
response = self.openai.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
temperature=effective_temperature,
|
||||
max_tokens=self.max_output,
|
||||
top_p=1,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=0,
|
||||
response_format={"type": "text"},
|
||||
stream=True
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
if chunk.choices and chunk.choices[0].delta.content:
|
||||
yield LlmChunk(
|
||||
text=chunk.choices[0].delta.content,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"LMStudio streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
default_ident = "text-completion"
|
||||
|
||||
|
|
@ -120,6 +120,67 @@ class Processor(LlmService):
|
|||
logger.error(f"Mistral LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""Mistral supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from Mistral"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
prompt = system + "\n\n" + prompt
|
||||
|
||||
try:
|
||||
stream = self.mistral.chat.stream(
|
||||
model=model_name,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
temperature=effective_temperature,
|
||||
max_tokens=self.max_output,
|
||||
top_p=1,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=0,
|
||||
response_format={"type": "text"}
|
||||
)
|
||||
|
||||
for chunk in stream:
|
||||
if chunk.data.choices and chunk.data.choices[0].delta.content:
|
||||
yield LlmChunk(
|
||||
text=chunk.data.choices[0].delta.content,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
|
||||
# Send final chunk
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Mistral streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
default_ident = "text-completion"
|
||||
|
||||
|
|
@ -79,6 +79,62 @@ class Processor(LlmService):
|
|||
logger.error(f"Ollama LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""Ollama supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from Ollama"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
prompt = system + "\n\n" + prompt
|
||||
|
||||
try:
|
||||
stream = self.llm.generate(
|
||||
model_name,
|
||||
prompt,
|
||||
options={'temperature': effective_temperature},
|
||||
stream=True
|
||||
)
|
||||
|
||||
total_input_tokens = 0
|
||||
total_output_tokens = 0
|
||||
|
||||
for chunk in stream:
|
||||
if 'response' in chunk and chunk['response']:
|
||||
yield LlmChunk(
|
||||
text=chunk['response'],
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
|
||||
# Accumulate token counts if available
|
||||
if 'prompt_eval_count' in chunk:
|
||||
total_input_tokens = int(chunk['prompt_eval_count'])
|
||||
if 'eval_count' in chunk:
|
||||
total_output_tokens = int(chunk['eval_count'])
|
||||
|
||||
# Send final chunk with token counts
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=total_input_tokens,
|
||||
out_token=total_output_tokens,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Ollama streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import os
|
|||
import logging
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -118,6 +118,75 @@ class Processor(LlmService):
|
|||
logger.error(f"OpenAI LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""OpenAI supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""
|
||||
Stream content generation from OpenAI.
|
||||
Yields LlmChunk objects with is_final=True on the last chunk.
|
||||
"""
|
||||
# Use provided model or fall back to default
|
||||
model_name = model or self.default_model
|
||||
# Use provided temperature or fall back to default
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
prompt = system + "\n\n" + prompt
|
||||
|
||||
try:
|
||||
response = self.openai.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
temperature=effective_temperature,
|
||||
max_tokens=self.max_output,
|
||||
stream=True # Enable streaming
|
||||
)
|
||||
|
||||
# Stream chunks
|
||||
for chunk in response:
|
||||
if chunk.choices and chunk.choices[0].delta.content:
|
||||
yield LlmChunk(
|
||||
text=chunk.choices[0].delta.content,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
|
||||
# Note: OpenAI doesn't provide token counts in streaming mode
|
||||
# Send final chunk without token counts
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except RateLimitError:
|
||||
logger.warning("Hit rate limit during streaming")
|
||||
raise TooManyRequests()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"OpenAI streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
default_ident = "text-completion"
|
||||
|
||||
|
|
@ -121,6 +121,100 @@ class Processor(LlmService):
|
|||
logger.error(f"TGI LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""TGI supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from TGI"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
request = {
|
||||
"model": model_name,
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": system,
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": effective_temperature,
|
||||
"stream": True,
|
||||
}
|
||||
|
||||
try:
|
||||
url = f"{self.base_url}/chat/completions"
|
||||
|
||||
async with self.session.post(
|
||||
url,
|
||||
headers=headers,
|
||||
json=request,
|
||||
) as response:
|
||||
|
||||
if response.status != 200:
|
||||
raise RuntimeError("Bad status: " + str(response.status))
|
||||
|
||||
# Parse SSE stream
|
||||
async for line in response.content:
|
||||
line = line.decode('utf-8').strip()
|
||||
|
||||
if not line:
|
||||
continue
|
||||
|
||||
if line.startswith('data: '):
|
||||
data = line[6:] # Remove 'data: ' prefix
|
||||
|
||||
if data == '[DONE]':
|
||||
break
|
||||
|
||||
try:
|
||||
import json
|
||||
chunk_data = json.loads(data)
|
||||
|
||||
# Extract text from chunk
|
||||
if 'choices' in chunk_data and len(chunk_data['choices']) > 0:
|
||||
choice = chunk_data['choices'][0]
|
||||
if 'delta' in choice and 'content' in choice['delta']:
|
||||
content = choice['delta']['content']
|
||||
if content:
|
||||
yield LlmChunk(
|
||||
text=content,
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(f"Failed to parse chunk: {data}")
|
||||
continue
|
||||
|
||||
# Send final chunk
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"TGI streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ import logging
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... exceptions import TooManyRequests
|
||||
from .... base import LlmService, LlmResult
|
||||
from .... base import LlmService, LlmResult, LlmChunk
|
||||
|
||||
default_ident = "text-completion"
|
||||
|
||||
|
|
@ -113,6 +113,89 @@ class Processor(LlmService):
|
|||
logger.error(f"vLLM LLM exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
def supports_streaming(self):
|
||||
"""vLLM supports streaming"""
|
||||
return True
|
||||
|
||||
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
|
||||
"""Stream content generation from vLLM"""
|
||||
model_name = model or self.default_model
|
||||
effective_temperature = temperature if temperature is not None else self.temperature
|
||||
|
||||
logger.debug(f"Using model (streaming): {model_name}")
|
||||
logger.debug(f"Using temperature: {effective_temperature}")
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
request = {
|
||||
"model": model_name,
|
||||
"prompt": system + "\n\n" + prompt,
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": effective_temperature,
|
||||
"stream": True,
|
||||
}
|
||||
|
||||
try:
|
||||
url = f"{self.base_url}/completions"
|
||||
|
||||
async with self.session.post(
|
||||
url,
|
||||
headers=headers,
|
||||
json=request,
|
||||
) as response:
|
||||
|
||||
if response.status != 200:
|
||||
raise RuntimeError("Bad status: " + str(response.status))
|
||||
|
||||
# Parse SSE stream
|
||||
async for line in response.content:
|
||||
line = line.decode('utf-8').strip()
|
||||
|
||||
if not line:
|
||||
continue
|
||||
|
||||
if line.startswith('data: '):
|
||||
data = line[6:] # Remove 'data: ' prefix
|
||||
|
||||
if data == '[DONE]':
|
||||
break
|
||||
|
||||
try:
|
||||
import json
|
||||
chunk_data = json.loads(data)
|
||||
|
||||
# Extract text from chunk
|
||||
if 'choices' in chunk_data and len(chunk_data['choices']) > 0:
|
||||
choice = chunk_data['choices'][0]
|
||||
if 'text' in choice and choice['text']:
|
||||
yield LlmChunk(
|
||||
text=choice['text'],
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=False
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(f"Failed to parse chunk: {data}")
|
||||
continue
|
||||
|
||||
# Send final chunk
|
||||
yield LlmChunk(
|
||||
text="",
|
||||
in_token=None,
|
||||
out_token=None,
|
||||
model=model_name,
|
||||
is_final=True
|
||||
)
|
||||
|
||||
logger.debug("Streaming complete")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"vLLM streaming exception ({type(e).__name__}): {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||
|
|
@ -101,6 +101,9 @@ class Processor(FlowProcessor):
|
|||
|
||||
kind = v.id
|
||||
|
||||
# Check if streaming is requested
|
||||
streaming = getattr(v, 'streaming', False)
|
||||
|
||||
try:
|
||||
|
||||
logger.debug(f"Prompt terms: {v.terms}")
|
||||
|
|
@ -109,16 +112,68 @@ class Processor(FlowProcessor):
|
|||
k: json.loads(v)
|
||||
for k, v in v.terms.items()
|
||||
}
|
||||
|
||||
logger.debug(f"Handling prompt kind {kind}...")
|
||||
|
||||
logger.debug(f"Handling prompt kind {kind}... (streaming={streaming})")
|
||||
|
||||
# If streaming, we need to handle it differently
|
||||
if streaming:
|
||||
# For streaming, we need to intercept LLM responses
|
||||
# and forward them as they arrive
|
||||
|
||||
async def llm_streaming(system, prompt):
|
||||
logger.debug(f"System prompt: {system}")
|
||||
logger.debug(f"User prompt: {prompt}")
|
||||
|
||||
# Use the text completion client with recipient handler
|
||||
client = flow("text-completion-request")
|
||||
|
||||
async def forward_chunks(resp):
|
||||
if resp.error:
|
||||
raise RuntimeError(resp.error.message)
|
||||
|
||||
is_final = getattr(resp, 'end_of_stream', False)
|
||||
|
||||
# Always send a message if there's content OR if it's the final message
|
||||
if resp.response or is_final:
|
||||
# Forward each chunk immediately
|
||||
r = PromptResponse(
|
||||
text=resp.response if resp.response else "",
|
||||
object=None,
|
||||
error=None,
|
||||
end_of_stream=is_final,
|
||||
)
|
||||
await flow("response").send(r, properties={"id": id})
|
||||
|
||||
# Return True when end_of_stream
|
||||
return is_final
|
||||
|
||||
await client.request(
|
||||
TextCompletionRequest(
|
||||
system=system, prompt=prompt, streaming=True
|
||||
),
|
||||
recipient=forward_chunks,
|
||||
timeout=600
|
||||
)
|
||||
|
||||
# Return empty string since we already sent all chunks
|
||||
return ""
|
||||
|
||||
try:
|
||||
await self.manager.invoke(kind, input, llm_streaming)
|
||||
except Exception as e:
|
||||
logger.error(f"Prompt streaming exception: {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
return
|
||||
|
||||
# Non-streaming path (original behavior)
|
||||
async def llm(system, prompt):
|
||||
|
||||
logger.debug(f"System prompt: {system}")
|
||||
logger.debug(f"User prompt: {prompt}")
|
||||
|
||||
resp = await flow("text-completion-request").text_completion(
|
||||
system = system, prompt = prompt,
|
||||
system = system, prompt = prompt, streaming = False,
|
||||
)
|
||||
|
||||
try:
|
||||
|
|
@ -143,6 +198,7 @@ class Processor(FlowProcessor):
|
|||
text=resp,
|
||||
object=None,
|
||||
error=None,
|
||||
end_of_stream=True,
|
||||
)
|
||||
|
||||
await flow("response").send(r, properties={"id": id})
|
||||
|
|
@ -158,6 +214,7 @@ class Processor(FlowProcessor):
|
|||
text=None,
|
||||
object=json.dumps(resp),
|
||||
error=None,
|
||||
end_of_stream=True,
|
||||
)
|
||||
|
||||
await flow("response").send(r, properties={"id": id})
|
||||
|
|
@ -175,27 +232,13 @@ class Processor(FlowProcessor):
|
|||
type = "llm-error",
|
||||
message = str(e),
|
||||
),
|
||||
response=None,
|
||||
text=None,
|
||||
object=None,
|
||||
end_of_stream=True,
|
||||
)
|
||||
|
||||
await flow("response").send(r, properties={"id": id})
|
||||
|
||||
except Exception as e:
|
||||
|
||||
logger.error(f"Prompt service exception: {e}", exc_info=True)
|
||||
|
||||
logger.debug("Sending error response...")
|
||||
|
||||
r = PromptResponse(
|
||||
error=Error(
|
||||
type = "llm-error",
|
||||
message = str(e),
|
||||
),
|
||||
response=None,
|
||||
)
|
||||
|
||||
await self.send(r, properties={"id": id})
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
|
|
|
|||