Feature/streaming llm phase 1 (#566)

* Tidy up duplicate tech specs in doc directory

* Streaming LLM text-completion service tech spec.

* text-completion and prompt interfaces

* streaming change applied to all LLMs, so far tested with VertexAI

* Skip Pinecone unit tests, upstream module issue is affecting things, tests are passing again

* Added agent streaming, not working and has broken tests
This commit is contained in:
cybermaggedon 2025-11-26 09:59:10 +00:00 committed by GitHub
parent 943a9d83b0
commit 310a2deb06
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
44 changed files with 2684 additions and 937 deletions

View file

@ -0,0 +1,339 @@
"""
Streaming parser for ReAct responses.
This parser handles text chunks from LLM streaming responses and parses them
into ReAct format (Thought/Action/Args or Thought/Final Answer). It maintains
state across chunk boundaries to handle cases where delimiters or JSON are split.
Key challenges:
- Delimiters may be split across chunks: "Tho" + "ught:" or "Final An" + "swer:"
- JSON arguments may be split: '{"loc' + 'ation": "NYC"}'
- Need to emit thought/answer chunks as they arrive for streaming
"""
import json
import logging
import re
from enum import Enum
from typing import Optional, Callable, Any
from . types import Action, Final
logger = logging.getLogger(__name__)
class ParserState(Enum):
"""States for the streaming ReAct parser state machine"""
INITIAL = "initial" # Waiting for first content
THOUGHT = "thought" # Accumulating thought content
ACTION = "action" # Found "Action:", collecting action name
ARGS = "args" # Found "Args:", collecting JSON arguments
FINAL_ANSWER = "final_answer" # Found "Final Answer:", collecting answer
COMPLETE = "complete" # Parsing complete, object ready
class StreamingReActParser:
"""
Stateful parser for streaming ReAct responses.
Expected format:
Thought: [reasoning about what to do next]
Action: [tool_name]
Args: {
"param": "value"
}
OR
Thought: [reasoning about the final answer]
Final Answer: [the answer]
Usage:
parser = StreamingReActParser(
on_thought_chunk=lambda chunk: print(f"Thought: {chunk}"),
on_answer_chunk=lambda chunk: print(f"Answer: {chunk}"),
)
for chunk in llm_stream:
parser.feed(chunk)
if parser.is_complete():
result = parser.get_result()
break
"""
# Delimiters we're looking for
THOUGHT_DELIMITER = "Thought:"
ACTION_DELIMITER = "Action:"
ARGS_DELIMITER = "Args:"
FINAL_ANSWER_DELIMITER = "Final Answer:"
# Maximum buffer size for delimiter detection (longest delimiter + safety margin)
MAX_DELIMITER_BUFFER = 20
def __init__(
self,
on_thought_chunk: Optional[Callable[[str], Any]] = None,
on_answer_chunk: Optional[Callable[[str], Any]] = None,
):
"""
Initialize streaming parser.
Args:
on_thought_chunk: Callback for thought text chunks as they arrive
on_answer_chunk: Callback for final answer text chunks as they arrive
"""
self.on_thought_chunk = on_thought_chunk
self.on_answer_chunk = on_answer_chunk
# Parser state
self.state = ParserState.INITIAL
# Buffers for accumulating content
self.line_buffer = "" # For detecting delimiters across chunk boundaries
self.thought_buffer = "" # Accumulated thought text
self.action_buffer = "" # Action name
self.args_buffer = "" # JSON arguments text
self.answer_buffer = "" # Final answer text
# JSON parsing state for Args
self.brace_count = 0
self.args_started = False
# Result object (Action or Final)
self.result = None
def feed(self, chunk: str) -> None:
"""
Feed a text chunk to the parser.
Args:
chunk: Text chunk from LLM stream
"""
if self.state == ParserState.COMPLETE:
return # Already complete, ignore further chunks
# Add chunk to line buffer for delimiter detection
self.line_buffer += chunk
# Remove markdown code blocks if present
self.line_buffer = re.sub(r'^```[^\n]*\n', '', self.line_buffer)
self.line_buffer = re.sub(r'\n```$', '', self.line_buffer)
# Process based on current state
while self.line_buffer and self.state != ParserState.COMPLETE:
if self.state == ParserState.INITIAL:
self._process_initial()
elif self.state == ParserState.THOUGHT:
self._process_thought()
elif self.state == ParserState.ACTION:
self._process_action()
elif self.state == ParserState.ARGS:
self._process_args()
elif self.state == ParserState.FINAL_ANSWER:
self._process_final_answer()
def _process_initial(self) -> None:
"""Process INITIAL state - looking for 'Thought:' delimiter"""
idx = self.line_buffer.find(self.THOUGHT_DELIMITER)
if idx >= 0:
# Found thought delimiter
# Discard any content before it
self.line_buffer = self.line_buffer[idx + len(self.THOUGHT_DELIMITER):]
self.state = ParserState.THOUGHT
elif len(self.line_buffer) >= self.MAX_DELIMITER_BUFFER:
# Buffer getting too large, probably junk before thought
# Keep only the tail that might contain partial delimiter
self.line_buffer = self.line_buffer[-self.MAX_DELIMITER_BUFFER:]
def _process_thought(self) -> None:
"""Process THOUGHT state - accumulating thought content"""
# Check for Action or Final Answer delimiter
action_idx = self.line_buffer.find(self.ACTION_DELIMITER)
final_idx = self.line_buffer.find(self.FINAL_ANSWER_DELIMITER)
# Find which delimiter comes first (if any)
next_delimiter_idx = -1
next_state = None
if action_idx >= 0 and (final_idx < 0 or action_idx < final_idx):
next_delimiter_idx = action_idx
next_state = ParserState.ACTION
delimiter_len = len(self.ACTION_DELIMITER)
elif final_idx >= 0:
next_delimiter_idx = final_idx
next_state = ParserState.FINAL_ANSWER
delimiter_len = len(self.FINAL_ANSWER_DELIMITER)
if next_delimiter_idx >= 0:
# Found next delimiter
thought_chunk = self.line_buffer[:next_delimiter_idx].strip()
if thought_chunk:
self.thought_buffer += thought_chunk
if self.on_thought_chunk:
self.on_thought_chunk(thought_chunk)
self.line_buffer = self.line_buffer[next_delimiter_idx + delimiter_len:]
self.state = next_state
else:
# No delimiter found yet
# Keep tail in buffer (might contain partial delimiter)
# Emit the rest as thought chunk
if len(self.line_buffer) > self.MAX_DELIMITER_BUFFER:
emittable = self.line_buffer[:-self.MAX_DELIMITER_BUFFER]
self.thought_buffer += emittable
if self.on_thought_chunk:
self.on_thought_chunk(emittable)
self.line_buffer = self.line_buffer[-self.MAX_DELIMITER_BUFFER:]
def _process_action(self) -> None:
"""Process ACTION state - collecting action name"""
# Action name is on one line (or at least until newline or Args:)
newline_idx = self.line_buffer.find('\n')
args_idx = self.line_buffer.find(self.ARGS_DELIMITER)
# Find which comes first
if args_idx >= 0 and (newline_idx < 0 or args_idx < newline_idx):
# Args delimiter found first
self.action_buffer = self.line_buffer[:args_idx].strip().strip('"')
self.line_buffer = self.line_buffer[args_idx + len(self.ARGS_DELIMITER):]
self.state = ParserState.ARGS
elif newline_idx >= 0:
# Newline found, action name complete
self.action_buffer = self.line_buffer[:newline_idx].strip().strip('"')
self.line_buffer = self.line_buffer[newline_idx + 1:]
# Stay in ACTION state or move to ARGS if we find delimiter
# Actually, check if next line has Args:
if self.line_buffer.lstrip().startswith(self.ARGS_DELIMITER):
args_start = self.line_buffer.find(self.ARGS_DELIMITER)
self.line_buffer = self.line_buffer[args_start + len(self.ARGS_DELIMITER):]
self.state = ParserState.ARGS
else:
# Not enough content yet, keep buffering
# But if buffer is getting large, action name is probably complete
if len(self.line_buffer) > 100:
self.action_buffer = self.line_buffer.strip().strip('"')
self.line_buffer = ""
# Assume Args comes next, but we need more content
self.state = ParserState.ARGS
def _process_args(self) -> None:
"""Process ARGS state - collecting JSON arguments"""
# Process character by character to track brace matching
i = 0
while i < len(self.line_buffer):
char = self.line_buffer[i]
self.args_buffer += char
if char == '{':
self.brace_count += 1
self.args_started = True
elif char == '}':
self.brace_count -= 1
# Check if JSON is complete
if self.args_started and self.brace_count == 0:
# JSON complete, try to parse
try:
args_dict = json.loads(self.args_buffer.strip())
# Success! Create Action result
self.result = Action(
thought=self.thought_buffer.strip(),
name=self.action_buffer,
arguments=args_dict,
observation=""
)
self.state = ParserState.COMPLETE
self.line_buffer = "" # Clear buffer
return
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON args: {self.args_buffer}")
raise ValueError(f"Invalid JSON in Args: {e}")
i += 1
# Consumed entire buffer, clear it and wait for more chunks
self.line_buffer = ""
def _process_final_answer(self) -> None:
"""Process FINAL_ANSWER state - collecting final answer"""
# For final answer, we consume everything until we decide we're done
# In streaming mode, we can't know when answer is complete until stream ends
# So we emit chunks and accumulate
# Check if this might be JSON
is_json = self.answer_buffer.strip().startswith('{') or \
self.line_buffer.strip().startswith('{')
if is_json:
# Handle JSON final answer
self.answer_buffer += self.line_buffer
# Count braces to detect completion
brace_count = self.answer_buffer.count('{') - self.answer_buffer.count('}')
if brace_count == 0 and '{' in self.answer_buffer:
# JSON might be complete
# Note: We can't be 100% sure without trying to parse
# But in streaming mode, we'll finish when stream ends
pass
# Emit chunk
if self.on_answer_chunk:
self.on_answer_chunk(self.line_buffer)
self.line_buffer = ""
else:
# Regular text answer - emit everything
if self.line_buffer:
self.answer_buffer += self.line_buffer
if self.on_answer_chunk:
self.on_answer_chunk(self.line_buffer)
self.line_buffer = ""
def finalize(self) -> None:
"""
Call this when the stream is complete to finalize parsing.
This handles any remaining buffered content.
"""
if self.state == ParserState.COMPLETE:
return
# Flush any remaining thought chunks
if self.state == ParserState.THOUGHT and self.line_buffer:
self.thought_buffer += self.line_buffer
if self.on_thought_chunk:
self.on_thought_chunk(self.line_buffer)
self.line_buffer = ""
# Finalize final answer
if self.state == ParserState.FINAL_ANSWER:
# Flush any remaining answer content
if self.line_buffer:
self.answer_buffer += self.line_buffer
if self.on_answer_chunk:
self.on_answer_chunk(self.line_buffer)
self.line_buffer = ""
# Create Final result
self.result = Final(
thought=self.thought_buffer.strip(),
final=self.answer_buffer.strip()
)
self.state = ParserState.COMPLETE
# If we're in other states, something went wrong
if self.state not in [ParserState.COMPLETE, ParserState.FINAL_ANSWER]:
if self.thought_buffer:
raise ValueError(
f"Stream ended in {self.state.value} state with incomplete parsing. "
f"Thought: {self.thought_buffer[:100]}..."
)
else:
raise ValueError(f"Stream ended in {self.state.value} state with no content")
def is_complete(self) -> bool:
"""Check if parsing is complete"""
return self.state == ParserState.COMPLETE
def get_result(self) -> Optional[Action | Final]:
"""Get the parsed result (Action or Final)"""
return self.result