Streaming LLM part 2 (#567)

* Updates for agent API with streaming support

* Added tg-dump-queues tool to dump Pulsar queues to a log

* Updated tg-invoke-agent, incremental output

* Queue dumper CLI - might be useful for debugging

* Updating for tests
This commit is contained in:
cybermaggedon 2025-11-26 15:16:17 +00:00 committed by GitHub
parent 310a2deb06
commit b1cc724f7d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 609 additions and 51 deletions

View file

@@ -220,32 +220,72 @@ class AgentManager:
logger.info(f"prompt: {variables}")
logger.info(f"DEBUG: streaming={streaming}, think={think is not None}")
# Streaming path - use StreamingReActParser
if streaming and think:
logger.info("DEBUG: Entering streaming path")
from .streaming_parser import StreamingReActParser
# Create parser with streaming callbacks
# Thought chunks go to think(), answer chunks go to answer()
logger.info("DEBUG: Creating StreamingReActParser")
# Collect chunks to send via async callbacks
thought_chunks = []
answer_chunks = []
# Create parser with synchronous callbacks that just collect chunks
parser = StreamingReActParser(
on_thought_chunk=lambda chunk: asyncio.create_task(think(chunk)),
on_answer_chunk=lambda chunk: asyncio.create_task(answer(chunk) if answer else think(chunk)),
on_thought_chunk=lambda chunk: thought_chunks.append(chunk),
on_answer_chunk=lambda chunk: answer_chunks.append(chunk),
)
logger.info("DEBUG: StreamingReActParser created")
# Create async chunk callback that feeds parser
# Create async chunk callback that feeds parser and sends collected chunks
async def on_chunk(text):
parser.feed(text)
logger.info(f"DEBUG: on_chunk called with {len(text)} chars")
# Track what we had before
prev_thought_count = len(thought_chunks)
prev_answer_count = len(answer_chunks)
# Feed the parser (synchronous)
logger.info(f"DEBUG: About to call parser.feed")
parser.feed(text)
logger.info(f"DEBUG: parser.feed returned")
# Send any new thought chunks
for i in range(prev_thought_count, len(thought_chunks)):
logger.info(f"DEBUG: Sending thought chunk {i}")
await think(thought_chunks[i])
# Send any new answer chunks
for i in range(prev_answer_count, len(answer_chunks)):
logger.info(f"DEBUG: Sending answer chunk {i}")
if answer:
await answer(answer_chunks[i])
else:
await think(answer_chunks[i])
logger.info("DEBUG: Getting prompt-request client from context")
client = context("prompt-request")
logger.info(f"DEBUG: Got client: {client}")
logger.info("DEBUG: About to call agent_react with streaming=True")
# Get streaming response
response_text = await context("prompt-request").agent_react(
response_text = await client.agent_react(
variables=variables,
streaming=True,
chunk_callback=on_chunk
)
logger.info(f"DEBUG: agent_react returned, got {len(response_text) if response_text else 0} chars")
# Finalize parser
logger.info("DEBUG: Finalizing parser")
parser.finalize()
logger.info("DEBUG: Parser finalized")
# Get result
logger.info("DEBUG: Getting result from parser")
result = parser.get_result()
if result is None:
raise RuntimeError("Parser failed to produce a result")
@@ -254,11 +294,18 @@ class AgentManager:
return result
else:
logger.info("DEBUG: Entering NON-streaming path")
# Non-streaming path - get complete text and parse
response_text = await context("prompt-request").agent_react(
logger.info("DEBUG: Getting prompt-request client from context")
client = context("prompt-request")
logger.info(f"DEBUG: Got client: {client}")
logger.info("DEBUG: About to call agent_react with streaming=False")
response_text = await client.agent_react(
variables=variables,
streaming=False
)
logger.info(f"DEBUG: agent_react returned, got response")
logger.debug(f"Response text:\n{response_text}")

View file

@@ -118,7 +118,11 @@ class StreamingReActParser:
self.line_buffer = re.sub(r'\n```$', '', self.line_buffer)
# Process based on current state
# Track previous state to detect if we're making progress
while self.line_buffer and self.state != ParserState.COMPLETE:
prev_buffer_len = len(self.line_buffer)
prev_state = self.state
if self.state == ParserState.INITIAL:
self._process_initial()
elif self.state == ParserState.THOUGHT:
@@ -130,14 +134,19 @@
elif self.state == ParserState.FINAL_ANSWER:
self._process_final_answer()
# If no progress was made (buffer unchanged AND state unchanged), break
# to avoid infinite loop. We'll process more when the next chunk arrives.
if len(self.line_buffer) == prev_buffer_len and self.state == prev_state:
break
def _process_initial(self) -> None:
"""Process INITIAL state - looking for 'Thought:' delimiter"""
idx = self.line_buffer.find(self.THOUGHT_DELIMITER)
if idx >= 0:
# Found thought delimiter
# Discard any content before it
self.line_buffer = self.line_buffer[idx + len(self.THOUGHT_DELIMITER):]
# Discard any content before it and strip leading whitespace after delimiter
self.line_buffer = self.line_buffer[idx + len(self.THOUGHT_DELIMITER):].lstrip()
self.state = ParserState.THOUGHT
elif len(self.line_buffer) >= self.MAX_DELIMITER_BUFFER:
# Buffer getting too large, probably junk before thought
@@ -171,7 +180,7 @@
if self.on_thought_chunk:
self.on_thought_chunk(thought_chunk)
self.line_buffer = self.line_buffer[next_delimiter_idx + delimiter_len:]
self.line_buffer = self.line_buffer[next_delimiter_idx + delimiter_len:].lstrip()
self.state = next_state
else:
# No delimiter found yet
@@ -194,7 +203,7 @@
if args_idx >= 0 and (newline_idx < 0 or args_idx < newline_idx):
# Args delimiter found first
self.action_buffer = self.line_buffer[:args_idx].strip().strip('"')
self.line_buffer = self.line_buffer[args_idx + len(self.ARGS_DELIMITER):]
self.line_buffer = self.line_buffer[args_idx + len(self.ARGS_DELIMITER):].lstrip()
self.state = ParserState.ARGS
elif newline_idx >= 0:
# Newline found, action name complete
@@ -204,7 +213,7 @@
# Actually, check if next line has Args:
if self.line_buffer.lstrip().startswith(self.ARGS_DELIMITER):
args_start = self.line_buffer.find(self.ARGS_DELIMITER)
self.line_buffer = self.line_buffer[args_start + len(self.ARGS_DELIMITER):]
self.line_buffer = self.line_buffer[args_start + len(self.ARGS_DELIMITER):].lstrip()
self.state = ParserState.ARGS
else:
# Not enough content yet, keep buffering