Feature/streaming llm phase 1 (#566)

* Tidy up duplicate tech specs in doc directory

* Streaming LLM text-completion service tech spec.

* text-completion and prompt interfaces

* streaming change applied to all LLMs, so far tested with VertexAI

* Skip Pinecone unit tests, upstream module issue is affecting things, tests are passing again

* Added agent streaming, not working and has broken tests
This commit is contained in:
cybermaggedon 2025-11-26 09:59:10 +00:00 committed by GitHub
parent 943a9d83b0
commit 310a2deb06
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
44 changed files with 2684 additions and 937 deletions

View file

@ -2,6 +2,7 @@
import logging
import json
import re
import asyncio
from . types import Action, Final
@ -169,7 +170,7 @@ class AgentManager:
raise ValueError(f"Could not parse response: {text}")
async def reason(self, question, history, context):
async def reason(self, question, history, context, streaming=False, think=None, observe=None, answer=None):
logger.debug(f"calling reason: {question}")
@ -219,25 +220,62 @@ class AgentManager:
logger.info(f"prompt: {variables}")
# Get text response from prompt service
response_text = await context("prompt-request").agent_react(variables)
# Streaming path - use StreamingReActParser
if streaming and think:
from .streaming_parser import StreamingReActParser
logger.debug(f"Response text:\n{response_text}")
# Create parser with streaming callbacks
# Thought chunks go to think(), answer chunks go to answer()
parser = StreamingReActParser(
on_thought_chunk=lambda chunk: asyncio.create_task(think(chunk)),
on_answer_chunk=lambda chunk: asyncio.create_task(answer(chunk) if answer else think(chunk)),
)
logger.info(f"response: {response_text}")
# Create async chunk callback that feeds parser
async def on_chunk(text):
parser.feed(text)
# Get streaming response
response_text = await context("prompt-request").agent_react(
variables=variables,
streaming=True,
chunk_callback=on_chunk
)
# Finalize parser
parser.finalize()
# Get result
result = parser.get_result()
if result is None:
raise RuntimeError("Parser failed to produce a result")
# Parse the text response
try:
result = self.parse_react_response(response_text)
logger.info(f"Parsed result: {result}")
return result
except ValueError as e:
logger.error(f"Failed to parse response: {e}")
# Try to provide a helpful error message
logger.error(f"Response was: {response_text}")
raise RuntimeError(f"Failed to parse agent response: {e}")
async def react(self, question, history, think, observe, context):
else:
# Non-streaming path - get complete text and parse
response_text = await context("prompt-request").agent_react(
variables=variables,
streaming=False
)
logger.debug(f"Response text:\n{response_text}")
logger.info(f"response: {response_text}")
# Parse the text response
try:
result = self.parse_react_response(response_text)
logger.info(f"Parsed result: {result}")
return result
except ValueError as e:
logger.error(f"Failed to parse response: {e}")
# Try to provide a helpful error message
logger.error(f"Response was: {response_text}")
raise RuntimeError(f"Failed to parse agent response: {e}")
async def react(self, question, history, think, observe, context, streaming=False, answer=None):
logger.info(f"question: {question}")
@ -245,17 +283,27 @@ class AgentManager:
question = question,
history = history,
context = context,
streaming = streaming,
think = think,
observe = observe,
answer = answer,
)
logger.info(f"act: {act}")
if isinstance(act, Final):
await think(act.thought)
# In non-streaming mode, send complete thought
# In streaming mode, thoughts were already sent as chunks
if not streaming:
await think(act.thought)
return act
else:
await think(act.thought)
# In non-streaming mode, send complete thought
# In streaming mode, thoughts were already sent as chunks
if not streaming:
await think(act.thought)
logger.debug(f"ACTION: {act.name}")

View file

@ -191,6 +191,9 @@ class Processor(AgentService):
try:
# Check if streaming is enabled
streaming = getattr(request, 'streaming', False)
if request.history:
history = [
Action(
@ -215,12 +218,27 @@ class Processor(AgentService):
logger.debug(f"Think: {x}")
r = AgentResponse(
answer=None,
error=None,
thought=x,
observation=None,
)
if streaming:
# Streaming format
r = AgentResponse(
chunk_type="thought",
content=x,
end_of_message=True,
end_of_dialog=False,
# Legacy fields for backward compatibility
answer=None,
error=None,
thought=x,
observation=None,
)
else:
# Legacy format
r = AgentResponse(
answer=None,
error=None,
thought=x,
observation=None,
)
await respond(r)
@ -228,12 +246,55 @@ class Processor(AgentService):
logger.debug(f"Observe: {x}")
r = AgentResponse(
answer=None,
error=None,
thought=None,
observation=x,
)
if streaming:
# Streaming format
r = AgentResponse(
chunk_type="observation",
content=x,
end_of_message=True,
end_of_dialog=False,
# Legacy fields for backward compatibility
answer=None,
error=None,
thought=None,
observation=x,
)
else:
# Legacy format
r = AgentResponse(
answer=None,
error=None,
thought=None,
observation=x,
)
await respond(r)
async def answer(x):
logger.debug(f"Answer: {x}")
if streaming:
# Streaming format
r = AgentResponse(
chunk_type="answer",
content=x,
end_of_message=False, # More chunks may follow
end_of_dialog=False,
# Legacy fields for backward compatibility
answer=None,
error=None,
thought=None,
observation=None,
)
else:
# Legacy format - shouldn't be called in non-streaming mode
r = AgentResponse(
answer=x,
error=None,
thought=None,
observation=None,
)
await respond(r)
@ -273,7 +334,9 @@ class Processor(AgentService):
history = history,
think = think,
observe = observe,
answer = answer,
context = UserAwareContext(flow, request.user),
streaming = streaming,
)
logger.debug(f"Action: {act}")
@ -287,11 +350,26 @@ class Processor(AgentService):
else:
f = json.dumps(act.final)
r = AgentResponse(
answer=act.final,
error=None,
thought=None,
)
if streaming:
# Streaming format - send end-of-dialog marker
# Answer chunks were already sent via think() callback during parsing
r = AgentResponse(
chunk_type="answer",
content="", # Empty content, just marking end of dialog
end_of_message=True,
end_of_dialog=True,
# Legacy fields for backward compatibility
answer=act.final,
error=None,
thought=None,
)
else:
# Legacy format - send complete answer
r = AgentResponse(
answer=act.final,
error=None,
thought=None,
)
await respond(r)
@ -321,7 +399,9 @@ class Processor(AgentService):
observation=h.observation
)
for h in history
]
],
user=request.user,
streaming=streaming,
)
await next(r)
@ -336,14 +416,32 @@ class Processor(AgentService):
logger.debug("Send error response...")
r = AgentResponse(
error=Error(
type = "agent-error",
message = str(e),
),
response=None,
error_obj = Error(
type = "agent-error",
message = str(e),
)
# Check if streaming was enabled (may not be set if error occurred early)
streaming = getattr(request, 'streaming', False) if 'request' in locals() else False
if streaming:
# Streaming format
r = AgentResponse(
chunk_type="error",
content=str(e),
end_of_message=True,
end_of_dialog=True,
# Legacy fields for backward compatibility
error=error_obj,
response=None,
)
else:
# Legacy format
r = AgentResponse(
error=error_obj,
response=None,
)
await respond(r)
@staticmethod

View file

@ -0,0 +1,339 @@
"""
Streaming parser for ReAct responses.
This parser handles text chunks from LLM streaming responses and parses them
into ReAct format (Thought/Action/Args or Thought/Final Answer). It maintains
state across chunk boundaries to handle cases where delimiters or JSON are split.
Key challenges:
- Delimiters may be split across chunks: "Tho" + "ught:" or "Final An" + "swer:"
- JSON arguments may be split: '{"loc' + 'ation": "NYC"}'
- Need to emit thought/answer chunks as they arrive for streaming
"""
import json
import logging
import re
from enum import Enum
from typing import Optional, Callable, Any
from . types import Action, Final
logger = logging.getLogger(__name__)
class ParserState(Enum):
"""States for the streaming ReAct parser state machine"""
INITIAL = "initial" # Waiting for first content
THOUGHT = "thought" # Accumulating thought content
ACTION = "action" # Found "Action:", collecting action name
ARGS = "args" # Found "Args:", collecting JSON arguments
FINAL_ANSWER = "final_answer" # Found "Final Answer:", collecting answer
COMPLETE = "complete" # Parsing complete, object ready
class StreamingReActParser:
"""
Stateful parser for streaming ReAct responses.
Expected format:
Thought: [reasoning about what to do next]
Action: [tool_name]
Args: {
"param": "value"
}
OR
Thought: [reasoning about the final answer]
Final Answer: [the answer]
Usage:
parser = StreamingReActParser(
on_thought_chunk=lambda chunk: print(f"Thought: {chunk}"),
on_answer_chunk=lambda chunk: print(f"Answer: {chunk}"),
)
for chunk in llm_stream:
parser.feed(chunk)
if parser.is_complete():
result = parser.get_result()
break
"""
# Delimiters we're looking for
THOUGHT_DELIMITER = "Thought:"
ACTION_DELIMITER = "Action:"
ARGS_DELIMITER = "Args:"
FINAL_ANSWER_DELIMITER = "Final Answer:"
# Maximum buffer size for delimiter detection (longest delimiter + safety margin)
MAX_DELIMITER_BUFFER = 20
def __init__(
self,
on_thought_chunk: Optional[Callable[[str], Any]] = None,
on_answer_chunk: Optional[Callable[[str], Any]] = None,
):
"""
Initialize streaming parser.
Args:
on_thought_chunk: Callback for thought text chunks as they arrive
on_answer_chunk: Callback for final answer text chunks as they arrive
"""
self.on_thought_chunk = on_thought_chunk
self.on_answer_chunk = on_answer_chunk
# Parser state
self.state = ParserState.INITIAL
# Buffers for accumulating content
self.line_buffer = "" # For detecting delimiters across chunk boundaries
self.thought_buffer = "" # Accumulated thought text
self.action_buffer = "" # Action name
self.args_buffer = "" # JSON arguments text
self.answer_buffer = "" # Final answer text
# JSON parsing state for Args
self.brace_count = 0
self.args_started = False
# Result object (Action or Final)
self.result = None
def feed(self, chunk: str) -> None:
"""
Feed a text chunk to the parser.
Args:
chunk: Text chunk from LLM stream
"""
if self.state == ParserState.COMPLETE:
return # Already complete, ignore further chunks
# Add chunk to line buffer for delimiter detection
self.line_buffer += chunk
# Remove markdown code blocks if present
self.line_buffer = re.sub(r'^```[^\n]*\n', '', self.line_buffer)
self.line_buffer = re.sub(r'\n```$', '', self.line_buffer)
# Process based on current state
while self.line_buffer and self.state != ParserState.COMPLETE:
if self.state == ParserState.INITIAL:
self._process_initial()
elif self.state == ParserState.THOUGHT:
self._process_thought()
elif self.state == ParserState.ACTION:
self._process_action()
elif self.state == ParserState.ARGS:
self._process_args()
elif self.state == ParserState.FINAL_ANSWER:
self._process_final_answer()
def _process_initial(self) -> None:
"""Process INITIAL state - looking for 'Thought:' delimiter"""
idx = self.line_buffer.find(self.THOUGHT_DELIMITER)
if idx >= 0:
# Found thought delimiter
# Discard any content before it
self.line_buffer = self.line_buffer[idx + len(self.THOUGHT_DELIMITER):]
self.state = ParserState.THOUGHT
elif len(self.line_buffer) >= self.MAX_DELIMITER_BUFFER:
# Buffer getting too large, probably junk before thought
# Keep only the tail that might contain partial delimiter
self.line_buffer = self.line_buffer[-self.MAX_DELIMITER_BUFFER:]
def _process_thought(self) -> None:
"""Process THOUGHT state - accumulating thought content"""
# Check for Action or Final Answer delimiter
action_idx = self.line_buffer.find(self.ACTION_DELIMITER)
final_idx = self.line_buffer.find(self.FINAL_ANSWER_DELIMITER)
# Find which delimiter comes first (if any)
next_delimiter_idx = -1
next_state = None
if action_idx >= 0 and (final_idx < 0 or action_idx < final_idx):
next_delimiter_idx = action_idx
next_state = ParserState.ACTION
delimiter_len = len(self.ACTION_DELIMITER)
elif final_idx >= 0:
next_delimiter_idx = final_idx
next_state = ParserState.FINAL_ANSWER
delimiter_len = len(self.FINAL_ANSWER_DELIMITER)
if next_delimiter_idx >= 0:
# Found next delimiter
thought_chunk = self.line_buffer[:next_delimiter_idx].strip()
if thought_chunk:
self.thought_buffer += thought_chunk
if self.on_thought_chunk:
self.on_thought_chunk(thought_chunk)
self.line_buffer = self.line_buffer[next_delimiter_idx + delimiter_len:]
self.state = next_state
else:
# No delimiter found yet
# Keep tail in buffer (might contain partial delimiter)
# Emit the rest as thought chunk
if len(self.line_buffer) > self.MAX_DELIMITER_BUFFER:
emittable = self.line_buffer[:-self.MAX_DELIMITER_BUFFER]
self.thought_buffer += emittable
if self.on_thought_chunk:
self.on_thought_chunk(emittable)
self.line_buffer = self.line_buffer[-self.MAX_DELIMITER_BUFFER:]
def _process_action(self) -> None:
"""Process ACTION state - collecting action name"""
# Action name is on one line (or at least until newline or Args:)
newline_idx = self.line_buffer.find('\n')
args_idx = self.line_buffer.find(self.ARGS_DELIMITER)
# Find which comes first
if args_idx >= 0 and (newline_idx < 0 or args_idx < newline_idx):
# Args delimiter found first
self.action_buffer = self.line_buffer[:args_idx].strip().strip('"')
self.line_buffer = self.line_buffer[args_idx + len(self.ARGS_DELIMITER):]
self.state = ParserState.ARGS
elif newline_idx >= 0:
# Newline found, action name complete
self.action_buffer = self.line_buffer[:newline_idx].strip().strip('"')
self.line_buffer = self.line_buffer[newline_idx + 1:]
# Stay in ACTION state or move to ARGS if we find delimiter
# Actually, check if next line has Args:
if self.line_buffer.lstrip().startswith(self.ARGS_DELIMITER):
args_start = self.line_buffer.find(self.ARGS_DELIMITER)
self.line_buffer = self.line_buffer[args_start + len(self.ARGS_DELIMITER):]
self.state = ParserState.ARGS
else:
# Not enough content yet, keep buffering
# But if buffer is getting large, action name is probably complete
if len(self.line_buffer) > 100:
self.action_buffer = self.line_buffer.strip().strip('"')
self.line_buffer = ""
# Assume Args comes next, but we need more content
self.state = ParserState.ARGS
def _process_args(self) -> None:
"""Process ARGS state - collecting JSON arguments"""
# Process character by character to track brace matching
i = 0
while i < len(self.line_buffer):
char = self.line_buffer[i]
self.args_buffer += char
if char == '{':
self.brace_count += 1
self.args_started = True
elif char == '}':
self.brace_count -= 1
# Check if JSON is complete
if self.args_started and self.brace_count == 0:
# JSON complete, try to parse
try:
args_dict = json.loads(self.args_buffer.strip())
# Success! Create Action result
self.result = Action(
thought=self.thought_buffer.strip(),
name=self.action_buffer,
arguments=args_dict,
observation=""
)
self.state = ParserState.COMPLETE
self.line_buffer = "" # Clear buffer
return
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON args: {self.args_buffer}")
raise ValueError(f"Invalid JSON in Args: {e}")
i += 1
# Consumed entire buffer, clear it and wait for more chunks
self.line_buffer = ""
def _process_final_answer(self) -> None:
"""Process FINAL_ANSWER state - collecting final answer"""
# For final answer, we consume everything until we decide we're done
# In streaming mode, we can't know when answer is complete until stream ends
# So we emit chunks and accumulate
# Check if this might be JSON
is_json = self.answer_buffer.strip().startswith('{') or \
self.line_buffer.strip().startswith('{')
if is_json:
# Handle JSON final answer
self.answer_buffer += self.line_buffer
# Count braces to detect completion
brace_count = self.answer_buffer.count('{') - self.answer_buffer.count('}')
if brace_count == 0 and '{' in self.answer_buffer:
# JSON might be complete
# Note: We can't be 100% sure without trying to parse
# But in streaming mode, we'll finish when stream ends
pass
# Emit chunk
if self.on_answer_chunk:
self.on_answer_chunk(self.line_buffer)
self.line_buffer = ""
else:
# Regular text answer - emit everything
if self.line_buffer:
self.answer_buffer += self.line_buffer
if self.on_answer_chunk:
self.on_answer_chunk(self.line_buffer)
self.line_buffer = ""
def finalize(self) -> None:
"""
Call this when the stream is complete to finalize parsing.
This handles any remaining buffered content.
"""
if self.state == ParserState.COMPLETE:
return
# Flush any remaining thought chunks
if self.state == ParserState.THOUGHT and self.line_buffer:
self.thought_buffer += self.line_buffer
if self.on_thought_chunk:
self.on_thought_chunk(self.line_buffer)
self.line_buffer = ""
# Finalize final answer
if self.state == ParserState.FINAL_ANSWER:
# Flush any remaining answer content
if self.line_buffer:
self.answer_buffer += self.line_buffer
if self.on_answer_chunk:
self.on_answer_chunk(self.line_buffer)
self.line_buffer = ""
# Create Final result
self.result = Final(
thought=self.thought_buffer.strip(),
final=self.answer_buffer.strip()
)
self.state = ParserState.COMPLETE
# If we're in other states, something went wrong
if self.state not in [ParserState.COMPLETE, ParserState.FINAL_ANSWER]:
if self.thought_buffer:
raise ValueError(
f"Stream ended in {self.state.value} state with incomplete parsing. "
f"Thought: {self.thought_buffer[:100]}..."
)
else:
raise ValueError(f"Stream ended in {self.state.value} state with no content")
def is_complete(self) -> bool:
"""Check if parsing is complete"""
return self.state == ParserState.COMPLETE
def get_result(self) -> Optional[Action | Final]:
"""Get the parsed result (Action or Final)"""
return self.result

View file

@ -34,9 +34,9 @@ class BlobStore:
def ensure_bucket(self):
# Make the bucket if it doesn't exist.
found = self.minio.bucket_exists(self.bucket_name)
found = self.minio.bucket_exists(bucket_name=self.bucket_name)
if not found:
self.minio.make_bucket(self.bucket_name)
self.minio.make_bucket(bucket_name=self.bucket_name)
logger.info(f"Created bucket {self.bucket_name}")
else:
logger.debug(f"Bucket {self.bucket_name} already exists")

View file

@ -11,7 +11,7 @@ import os
import logging
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
# Module logger
logger = logging.getLogger(__name__)
@ -55,7 +55,7 @@ class Processor(LlmService):
self.max_output = max_output
self.default_model = model
def build_prompt(self, system, content, temperature=None):
def build_prompt(self, system, content, temperature=None, stream=False):
# Use provided temperature or fall back to default
effective_temperature = temperature if temperature is not None else self.temperature
@ -73,6 +73,9 @@ class Processor(LlmService):
"top_p": 1
}
if stream:
data["stream"] = True
body = json.dumps(data)
return body
@ -157,6 +160,84 @@ class Processor(LlmService):
logger.debug("Azure LLM processing complete")
def supports_streaming(self):
"""Azure serverless endpoints support streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from Azure serverless endpoint"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
try:
body = self.build_prompt(system, prompt, effective_temperature, stream=True)
url = self.endpoint
api_key = self.token
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}'
}
response = requests.post(url, data=body, headers=headers, stream=True)
if response.status_code == 429:
raise TooManyRequests()
if response.status_code != 200:
raise RuntimeError("LLM failure")
# Parse SSE stream
for line in response.iter_lines():
if line:
line = line.decode('utf-8').strip()
if line.startswith('data: '):
data = line[6:] # Remove 'data: ' prefix
if data == '[DONE]':
break
try:
chunk_data = json.loads(data)
if 'choices' in chunk_data and len(chunk_data['choices']) > 0:
delta = chunk_data['choices'][0].get('delta', {})
content = delta.get('content')
if content:
yield LlmChunk(
text=content,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
except json.JSONDecodeError:
logger.warning(f"Failed to parse chunk: {data}")
continue
# Send final chunk
yield LlmChunk(
text="",
in_token=None,
out_token=None,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except TooManyRequests:
logger.warning("Rate limit exceeded during streaming")
raise TooManyRequests()
except Exception as e:
logger.error(f"Azure streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -14,7 +14,7 @@ import logging
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
@ -125,6 +125,75 @@ class Processor(LlmService):
logger.debug("Azure OpenAI LLM processing complete")
def supports_streaming(self):
"""Azure OpenAI supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""
Stream content generation from Azure OpenAI.
Yields LlmChunk objects with is_final=True on the last chunk.
"""
# Use provided model or fall back to default
model_name = model or self.default_model
# Use provided temperature or fall back to default
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
prompt = system + "\n\n" + prompt
try:
response = self.openai.chat.completions.create(
model=model_name,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
}
],
temperature=effective_temperature,
max_tokens=self.max_output,
top_p=1,
stream=True # Enable streaming
)
# Stream chunks
for chunk in response:
if chunk.choices and chunk.choices[0].delta.content:
yield LlmChunk(
text=chunk.choices[0].delta.content,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
# Send final chunk
yield LlmChunk(
text="",
in_token=None,
out_token=None,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except RateLimitError:
logger.warning("Rate limit exceeded during streaming")
raise TooManyRequests()
except Exception as e:
logger.error(f"Azure OpenAI streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -9,7 +9,7 @@ import os
import logging
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
# Module logger
logger = logging.getLogger(__name__)
@ -106,6 +106,65 @@ class Processor(LlmService):
logger.error(f"Claude LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""Claude/Anthropic supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from Claude"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
try:
with self.claude.messages.stream(
model=model_name,
max_tokens=self.max_output,
temperature=effective_temperature,
system=system,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
}
]
) as stream:
for text in stream.text_stream:
yield LlmChunk(
text=text,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
# Get final message for token counts
final_message = stream.get_final_message()
yield LlmChunk(
text="",
in_token=final_message.usage.input_tokens,
out_token=final_message.usage.output_tokens,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except anthropic.RateLimitError:
logger.warning("Rate limit exceeded during streaming")
raise TooManyRequests()
except Exception as e:
logger.error(f"Claude streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -13,7 +13,7 @@ import logging
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
@ -98,6 +98,68 @@ class Processor(LlmService):
logger.error(f"Cohere LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""Cohere supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from Cohere"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
try:
stream = self.cohere.chat_stream(
model=model_name,
message=prompt,
preamble=system,
temperature=effective_temperature,
chat_history=[],
prompt_truncation='auto',
connectors=[]
)
total_input_tokens = 0
total_output_tokens = 0
for event in stream:
if event.event_type == "text-generation":
if hasattr(event, 'text') and event.text:
yield LlmChunk(
text=event.text,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
elif event.event_type == "stream-end":
# Extract token counts from final event
if hasattr(event, 'response') and hasattr(event.response, 'meta'):
if hasattr(event.response.meta, 'billed_units'):
total_input_tokens = int(event.response.meta.billed_units.input_tokens)
total_output_tokens = int(event.response.meta.billed_units.output_tokens)
# Send final chunk with token counts
yield LlmChunk(
text="",
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except cohere.TooManyRequestsError:
logger.warning("Rate limit exceeded during streaming")
raise TooManyRequests()
except Exception as e:
logger.error(f"Cohere streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -23,7 +23,7 @@ import logging
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
@ -159,6 +159,67 @@ class Processor(LlmService):
logger.error(f"GoogleAIStudio LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""Google AI Studio supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from Google AI Studio"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
generation_config = self._get_or_create_config(model_name, effective_temperature)
generation_config.system_instruction = system
try:
response = self.client.models.generate_content_stream(
model=model_name,
config=generation_config,
contents=prompt,
)
total_input_tokens = 0
total_output_tokens = 0
for chunk in response:
if hasattr(chunk, 'text') and chunk.text:
yield LlmChunk(
text=chunk.text,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
# Accumulate token counts if available
if hasattr(chunk, 'usage_metadata'):
if hasattr(chunk.usage_metadata, 'prompt_token_count'):
total_input_tokens = int(chunk.usage_metadata.prompt_token_count)
if hasattr(chunk.usage_metadata, 'candidates_token_count'):
total_output_tokens = int(chunk.usage_metadata.candidates_token_count)
# Send final chunk with token counts
yield LlmChunk(
text="",
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except ResourceExhausted:
logger.warning("Rate limit exceeded during streaming")
raise TooManyRequests()
except Exception as e:
logger.error(f"GoogleAIStudio streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -12,7 +12,7 @@ import logging
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
@ -102,6 +102,57 @@ class Processor(LlmService):
logger.error(f"Llamafile LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""LlamaFile supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from LlamaFile"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
prompt = system + "\n\n" + prompt
try:
response = self.openai.chat.completions.create(
model=model_name,
messages=[{"role": "user", "content": prompt}],
temperature=effective_temperature,
max_tokens=self.max_output,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
response_format={"type": "text"},
stream=True
)
for chunk in response:
if chunk.choices and chunk.choices[0].delta.content:
yield LlmChunk(
text=chunk.choices[0].delta.content,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
yield LlmChunk(
text="",
in_token=None,
out_token=None,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except Exception as e:
logger.error(f"LlamaFile streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -12,7 +12,7 @@ import logging
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
@ -106,6 +106,57 @@ class Processor(LlmService):
logger.error(f"LMStudio LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""LM Studio supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from LM Studio"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
prompt = system + "\n\n" + prompt
try:
response = self.openai.chat.completions.create(
model=model_name,
messages=[{"role": "user", "content": prompt}],
temperature=effective_temperature,
max_tokens=self.max_output,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
response_format={"type": "text"},
stream=True
)
for chunk in response:
if chunk.choices and chunk.choices[0].delta.content:
yield LlmChunk(
text=chunk.choices[0].delta.content,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
yield LlmChunk(
text="",
in_token=None,
out_token=None,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except Exception as e:
logger.error(f"LMStudio streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -12,7 +12,7 @@ import logging
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
@ -120,6 +120,67 @@ class Processor(LlmService):
logger.error(f"Mistral LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""Mistral supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from Mistral"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
prompt = system + "\n\n" + prompt
try:
stream = self.mistral.chat.stream(
model=model_name,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
}
],
temperature=effective_temperature,
max_tokens=self.max_output,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
response_format={"type": "text"}
)
for chunk in stream:
if chunk.data.choices and chunk.data.choices[0].delta.content:
yield LlmChunk(
text=chunk.data.choices[0].delta.content,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
# Send final chunk
yield LlmChunk(
text="",
in_token=None,
out_token=None,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except Exception as e:
logger.error(f"Mistral streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -12,7 +12,7 @@ import logging
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
@ -79,6 +79,62 @@ class Processor(LlmService):
logger.error(f"Ollama LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""Ollama supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from Ollama"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
prompt = system + "\n\n" + prompt
try:
stream = self.llm.generate(
model_name,
prompt,
options={'temperature': effective_temperature},
stream=True
)
total_input_tokens = 0
total_output_tokens = 0
for chunk in stream:
if 'response' in chunk and chunk['response']:
yield LlmChunk(
text=chunk['response'],
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
# Accumulate token counts if available
if 'prompt_eval_count' in chunk:
total_input_tokens = int(chunk['prompt_eval_count'])
if 'eval_count' in chunk:
total_output_tokens = int(chunk['eval_count'])
# Send final chunk with token counts
yield LlmChunk(
text="",
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except Exception as e:
logger.error(f"Ollama streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -9,7 +9,7 @@ import os
import logging
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
# Module logger
logger = logging.getLogger(__name__)
@ -118,6 +118,75 @@ class Processor(LlmService):
logger.error(f"OpenAI LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""OpenAI supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""
Stream content generation from OpenAI.
Yields LlmChunk objects with is_final=True on the last chunk.
"""
# Use provided model or fall back to default
model_name = model or self.default_model
# Use provided temperature or fall back to default
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
prompt = system + "\n\n" + prompt
try:
response = self.openai.chat.completions.create(
model=model_name,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
}
],
temperature=effective_temperature,
max_tokens=self.max_output,
stream=True # Enable streaming
)
# Stream chunks
for chunk in response:
if chunk.choices and chunk.choices[0].delta.content:
yield LlmChunk(
text=chunk.choices[0].delta.content,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
# Note: OpenAI doesn't provide token counts in streaming mode
# Send final chunk without token counts
yield LlmChunk(
text="",
in_token=None,
out_token=None,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except RateLimitError:
logger.warning("Hit rate limit during streaming")
raise TooManyRequests()
except Exception as e:
logger.error(f"OpenAI streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -12,7 +12,7 @@ import logging
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
@ -121,6 +121,100 @@ class Processor(LlmService):
logger.error(f"TGI LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""TGI supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from TGI"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
headers = {
"Content-Type": "application/json",
}
request = {
"model": model_name,
"messages": [
{
"role": "system",
"content": system,
},
{
"role": "user",
"content": prompt,
}
],
"max_tokens": self.max_output,
"temperature": effective_temperature,
"stream": True,
}
try:
url = f"{self.base_url}/chat/completions"
async with self.session.post(
url,
headers=headers,
json=request,
) as response:
if response.status != 200:
raise RuntimeError("Bad status: " + str(response.status))
# Parse SSE stream
async for line in response.content:
line = line.decode('utf-8').strip()
if not line:
continue
if line.startswith('data: '):
data = line[6:] # Remove 'data: ' prefix
if data == '[DONE]':
break
try:
import json
chunk_data = json.loads(data)
# Extract text from chunk
if 'choices' in chunk_data and len(chunk_data['choices']) > 0:
choice = chunk_data['choices'][0]
if 'delta' in choice and 'content' in choice['delta']:
content = choice['delta']['content']
if content:
yield LlmChunk(
text=content,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
except json.JSONDecodeError:
logger.warning(f"Failed to parse chunk: {data}")
continue
# Send final chunk
yield LlmChunk(
text="",
in_token=None,
out_token=None,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except Exception as e:
logger.error(f"TGI streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -12,7 +12,7 @@ import logging
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
@ -113,6 +113,89 @@ class Processor(LlmService):
logger.error(f"vLLM LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""vLLM supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from vLLM"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
headers = {
"Content-Type": "application/json",
}
request = {
"model": model_name,
"prompt": system + "\n\n" + prompt,
"max_tokens": self.max_output,
"temperature": effective_temperature,
"stream": True,
}
try:
url = f"{self.base_url}/completions"
async with self.session.post(
url,
headers=headers,
json=request,
) as response:
if response.status != 200:
raise RuntimeError("Bad status: " + str(response.status))
# Parse SSE stream
async for line in response.content:
line = line.decode('utf-8').strip()
if not line:
continue
if line.startswith('data: '):
data = line[6:] # Remove 'data: ' prefix
if data == '[DONE]':
break
try:
import json
chunk_data = json.loads(data)
# Extract text from chunk
if 'choices' in chunk_data and len(chunk_data['choices']) > 0:
choice = chunk_data['choices'][0]
if 'text' in choice and choice['text']:
yield LlmChunk(
text=choice['text'],
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
except json.JSONDecodeError:
logger.warning(f"Failed to parse chunk: {data}")
continue
# Send final chunk
yield LlmChunk(
text="",
in_token=None,
out_token=None,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except Exception as e:
logger.error(f"vLLM streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):

View file

@ -101,6 +101,9 @@ class Processor(FlowProcessor):
kind = v.id
# Check if streaming is requested
streaming = getattr(v, 'streaming', False)
try:
logger.debug(f"Prompt terms: {v.terms}")
@ -109,16 +112,68 @@ class Processor(FlowProcessor):
k: json.loads(v)
for k, v in v.terms.items()
}
logger.debug(f"Handling prompt kind {kind}...")
logger.debug(f"Handling prompt kind {kind}... (streaming={streaming})")
# If streaming, we need to handle it differently
if streaming:
# For streaming, we need to intercept LLM responses
# and forward them as they arrive
async def llm_streaming(system, prompt):
logger.debug(f"System prompt: {system}")
logger.debug(f"User prompt: {prompt}")
# Use the text completion client with recipient handler
client = flow("text-completion-request")
async def forward_chunks(resp):
if resp.error:
raise RuntimeError(resp.error.message)
is_final = getattr(resp, 'end_of_stream', False)
# Always send a message if there's content OR if it's the final message
if resp.response or is_final:
# Forward each chunk immediately
r = PromptResponse(
text=resp.response if resp.response else "",
object=None,
error=None,
end_of_stream=is_final,
)
await flow("response").send(r, properties={"id": id})
# Return True when end_of_stream
return is_final
await client.request(
TextCompletionRequest(
system=system, prompt=prompt, streaming=True
),
recipient=forward_chunks,
timeout=600
)
# Return empty string since we already sent all chunks
return ""
try:
await self.manager.invoke(kind, input, llm_streaming)
except Exception as e:
logger.error(f"Prompt streaming exception: {e}", exc_info=True)
raise e
return
# Non-streaming path (original behavior)
async def llm(system, prompt):
logger.debug(f"System prompt: {system}")
logger.debug(f"User prompt: {prompt}")
resp = await flow("text-completion-request").text_completion(
system = system, prompt = prompt,
system = system, prompt = prompt, streaming = False,
)
try:
@ -143,6 +198,7 @@ class Processor(FlowProcessor):
text=resp,
object=None,
error=None,
end_of_stream=True,
)
await flow("response").send(r, properties={"id": id})
@ -158,6 +214,7 @@ class Processor(FlowProcessor):
text=None,
object=json.dumps(resp),
error=None,
end_of_stream=True,
)
await flow("response").send(r, properties={"id": id})
@ -175,27 +232,13 @@ class Processor(FlowProcessor):
type = "llm-error",
message = str(e),
),
response=None,
text=None,
object=None,
end_of_stream=True,
)
await flow("response").send(r, properties={"id": id})
except Exception as e:
logger.error(f"Prompt service exception: {e}", exc_info=True)
logger.debug("Sending error response...")
r = PromptResponse(
error=Error(
type = "llm-error",
message = str(e),
),
response=None,
)
await self.send(r, properties={"id": id})
@staticmethod
def add_args(parser):