Feature/streaming llm phase 1 (#566)

* Tidy up duplicate tech specs in doc directory * Streaming LLM text-completion service tech spec. * text-completion and prompt interfaces * streaming change applied to all LLMs, so far tested with VertexAI * Skip Pinecone unit tests, upstream module issue is affecting things, tests are passing again * Added agent streaming, not working and has broken tests
2026-04-29 02:23:44 +02:00 · 2025-11-26 09:59:10 +00:00 · 2025-11-26 09:59:10 +00:00 · 310a2deb06
commit 310a2deb06
parent 943a9d83b0
44 changed files with 2684 additions and 937 deletions
--- a/trustgraph-flow/trustgraph/agent/react/agent_manager.py
+++ b/trustgraph-flow/trustgraph/agent/react/agent_manager.py
@ -2,6 +2,7 @@
 import logging
 import json
 import re
+import asyncio

 from . types import Action, Final

@ -169,7 +170,7 @@ class AgentManager:
        
        raise ValueError(f"Could not parse response: {text}")

-    async def reason(self, question, history, context):
+    async def reason(self, question, history, context, streaming=False, think=None, observe=None, answer=None):

        logger.debug(f"calling reason: {question}")

@ -219,25 +220,62 @@ class AgentManager:

        logger.info(f"prompt: {variables}")

-        # Get text response from prompt service
-        response_text = await context("prompt-request").agent_react(variables)
+        # Streaming path - use StreamingReActParser
+        if streaming and think:
+            from .streaming_parser import StreamingReActParser

-        logger.debug(f"Response text:\n{response_text}")
+            # Create parser with streaming callbacks
+            # Thought chunks go to think(), answer chunks go to answer()
+            parser = StreamingReActParser(
+                on_thought_chunk=lambda chunk: asyncio.create_task(think(chunk)),
+                on_answer_chunk=lambda chunk: asyncio.create_task(answer(chunk) if answer else think(chunk)),
+            )

-        logger.info(f"response: {response_text}")
+            # Create async chunk callback that feeds parser
+            async def on_chunk(text):
+                parser.feed(text)
+
+            # Get streaming response
+            response_text = await context("prompt-request").agent_react(
+                variables=variables,
+                streaming=True,
+                chunk_callback=on_chunk
+            )
+
+            # Finalize parser
+            parser.finalize()
+
+            # Get result
+            result = parser.get_result()
+            if result is None:
+                raise RuntimeError("Parser failed to produce a result")

-        # Parse the text response
-        try:
-            result = self.parse_react_response(response_text)
            logger.info(f"Parsed result: {result}")
            return result
-        except ValueError as e:
-            logger.error(f"Failed to parse response: {e}")
-            # Try to provide a helpful error message
-            logger.error(f"Response was: {response_text}")
-            raise RuntimeError(f"Failed to parse agent response: {e}")

-    async def react(self, question, history, think, observe, context):
+        else:
+            # Non-streaming path - get complete text and parse
+            response_text = await context("prompt-request").agent_react(
+                variables=variables,
+                streaming=False
+            )
+
+            logger.debug(f"Response text:\n{response_text}")
+
+            logger.info(f"response: {response_text}")
+
+            # Parse the text response
+            try:
+                result = self.parse_react_response(response_text)
+                logger.info(f"Parsed result: {result}")
+                return result
+            except ValueError as e:
+                logger.error(f"Failed to parse response: {e}")
+                # Try to provide a helpful error message
+                logger.error(f"Response was: {response_text}")
+                raise RuntimeError(f"Failed to parse agent response: {e}")
+
+    async def react(self, question, history, think, observe, context, streaming=False, answer=None):

        logger.info(f"question: {question}")

@ -245,17 +283,27 @@ class AgentManager:
            question = question,
            history = history,
            context = context,
+            streaming = streaming,
+            think = think,
+            observe = observe,
+            answer = answer,
        )
        logger.info(f"act: {act}")

        if isinstance(act, Final):

-            await think(act.thought)
+            # In non-streaming mode, send complete thought
+            # In streaming mode, thoughts were already sent as chunks
+            if not streaming:
+                await think(act.thought)
            return act

        else:

-            await think(act.thought)
+            # In non-streaming mode, send complete thought
+            # In streaming mode, thoughts were already sent as chunks
+            if not streaming:
+                await think(act.thought)

            logger.debug(f"ACTION: {act.name}")

--- a/trustgraph-flow/trustgraph/agent/react/service.py
+++ b/trustgraph-flow/trustgraph/agent/react/service.py
@ -191,6 +191,9 @@ class Processor(AgentService):

        try:

+            # Check if streaming is enabled
+            streaming = getattr(request, 'streaming', False)
+
            if request.history:
                history = [
                    Action(
@ -215,12 +218,27 @@ class Processor(AgentService):

                logger.debug(f"Think: {x}")

-                r = AgentResponse(
-                    answer=None,
-                    error=None,
-                    thought=x,
-                    observation=None,
-                )
+                if streaming:
+                    # Streaming format
+                    r = AgentResponse(
+                        chunk_type="thought",
+                        content=x,
+                        end_of_message=True,
+                        end_of_dialog=False,
+                        # Legacy fields for backward compatibility
+                        answer=None,
+                        error=None,
+                        thought=x,
+                        observation=None,
+                    )
+                else:
+                    # Legacy format
+                    r = AgentResponse(
+                        answer=None,
+                        error=None,
+                        thought=x,
+                        observation=None,
+                    )

                await respond(r)

@ -228,12 +246,55 @@ class Processor(AgentService):

                logger.debug(f"Observe: {x}")

-                r = AgentResponse(
-                    answer=None,
-                    error=None,
-                    thought=None,
-                    observation=x,
-                )
+                if streaming:
+                    # Streaming format
+                    r = AgentResponse(
+                        chunk_type="observation",
+                        content=x,
+                        end_of_message=True,
+                        end_of_dialog=False,
+                        # Legacy fields for backward compatibility
+                        answer=None,
+                        error=None,
+                        thought=None,
+                        observation=x,
+                    )
+                else:
+                    # Legacy format
+                    r = AgentResponse(
+                        answer=None,
+                        error=None,
+                        thought=None,
+                        observation=x,
+                    )
+
+                await respond(r)
+
+            async def answer(x):
+
+                logger.debug(f"Answer: {x}")
+
+                if streaming:
+                    # Streaming format
+                    r = AgentResponse(
+                        chunk_type="answer",
+                        content=x,
+                        end_of_message=False,  # More chunks may follow
+                        end_of_dialog=False,
+                        # Legacy fields for backward compatibility
+                        answer=None,
+                        error=None,
+                        thought=None,
+                        observation=None,
+                    )
+                else:
+                    # Legacy format - shouldn't be called in non-streaming mode
+                    r = AgentResponse(
+                        answer=x,
+                        error=None,
+                        thought=None,
+                        observation=None,
+                    )

                await respond(r)

@ -273,7 +334,9 @@ class Processor(AgentService):
                history = history,
                think = think,
                observe = observe,
+                answer = answer,
                context = UserAwareContext(flow, request.user),
+                streaming = streaming,
            )

            logger.debug(f"Action: {act}")
@ -287,11 +350,26 @@ class Processor(AgentService):
                else:
                    f = json.dumps(act.final)

-                r = AgentResponse(
-                    answer=act.final,
-                    error=None,
-                    thought=None,
-                )
+                if streaming:
+                    # Streaming format - send end-of-dialog marker
+                    # Answer chunks were already sent via think() callback during parsing
+                    r = AgentResponse(
+                        chunk_type="answer",
+                        content="",  # Empty content, just marking end of dialog
+                        end_of_message=True,
+                        end_of_dialog=True,
+                        # Legacy fields for backward compatibility
+                        answer=act.final,
+                        error=None,
+                        thought=None,
+                    )
+                else:
+                    # Legacy format - send complete answer
+                    r = AgentResponse(
+                        answer=act.final,
+                        error=None,
+                        thought=None,
+                    )

                await respond(r)

@ -321,7 +399,9 @@ class Processor(AgentService):
                        observation=h.observation
                    )
                    for h in history
-                ]
+                ],
+                user=request.user,
+                streaming=streaming,
            )

            await next(r)
@ -336,14 +416,32 @@ class Processor(AgentService):

            logger.debug("Send error response...")

-            r = AgentResponse(
-                error=Error(
-                    type = "agent-error",
-                    message = str(e),
-                ),
-                response=None,
+            error_obj = Error(
+                type = "agent-error",
+                message = str(e),
            )

+            # Check if streaming was enabled (may not be set if error occurred early)
+            streaming = getattr(request, 'streaming', False) if 'request' in locals() else False
+
+            if streaming:
+                # Streaming format
+                r = AgentResponse(
+                    chunk_type="error",
+                    content=str(e),
+                    end_of_message=True,
+                    end_of_dialog=True,
+                    # Legacy fields for backward compatibility
+                    error=error_obj,
+                    response=None,
+                )
+            else:
+                # Legacy format
+                r = AgentResponse(
+                    error=error_obj,
+                    response=None,
+                )
+
            await respond(r)

    @staticmethod
--- a/trustgraph-flow/trustgraph/agent/react/streaming_parser.py
+++ b/trustgraph-flow/trustgraph/agent/react/streaming_parser.py
@ -0,0 +1,339 @@
+"""
+Streaming parser for ReAct responses.
+
+This parser handles text chunks from LLM streaming responses and parses them
+into ReAct format (Thought/Action/Args or Thought/Final Answer). It maintains
+state across chunk boundaries to handle cases where delimiters or JSON are split.
+
+Key challenges:
+- Delimiters may be split across chunks: "Tho" + "ught:" or "Final An" + "swer:"
+- JSON arguments may be split: '{"loc' + 'ation": "NYC"}'
+- Need to emit thought/answer chunks as they arrive for streaming
+"""
+
+import json
+import logging
+import re
+from enum import Enum
+from typing import Optional, Callable, Any
+from . types import Action, Final
+
+logger = logging.getLogger(__name__)
+
+
+class ParserState(Enum):
+    """States for the streaming ReAct parser state machine"""
+    INITIAL = "initial"              # Waiting for first content
+    THOUGHT = "thought"              # Accumulating thought content
+    ACTION = "action"                # Found "Action:", collecting action name
+    ARGS = "args"                    # Found "Args:", collecting JSON arguments
+    FINAL_ANSWER = "final_answer"    # Found "Final Answer:", collecting answer
+    COMPLETE = "complete"            # Parsing complete, object ready
+
+
+class StreamingReActParser:
+    """
+    Stateful parser for streaming ReAct responses.
+
+    Expected format:
+        Thought: [reasoning about what to do next]
+        Action: [tool_name]
+        Args: {
+            "param": "value"
+        }
+
+    OR
+        Thought: [reasoning about the final answer]
+        Final Answer: [the answer]
+
+    Usage:
+        parser = StreamingReActParser(
+            on_thought_chunk=lambda chunk: print(f"Thought: {chunk}"),
+            on_answer_chunk=lambda chunk: print(f"Answer: {chunk}"),
+        )
+
+        for chunk in llm_stream:
+            parser.feed(chunk)
+            if parser.is_complete():
+                result = parser.get_result()
+                break
+    """
+
+    # Delimiters we're looking for
+    THOUGHT_DELIMITER = "Thought:"
+    ACTION_DELIMITER = "Action:"
+    ARGS_DELIMITER = "Args:"
+    FINAL_ANSWER_DELIMITER = "Final Answer:"
+
+    # Maximum buffer size for delimiter detection (longest delimiter + safety margin)
+    MAX_DELIMITER_BUFFER = 20
+
+    def __init__(
+        self,
+        on_thought_chunk: Optional[Callable[[str], Any]] = None,
+        on_answer_chunk: Optional[Callable[[str], Any]] = None,
+    ):
+        """
+        Initialize streaming parser.
+
+        Args:
+            on_thought_chunk: Callback for thought text chunks as they arrive
+            on_answer_chunk: Callback for final answer text chunks as they arrive
+        """
+        self.on_thought_chunk = on_thought_chunk
+        self.on_answer_chunk = on_answer_chunk
+
+        # Parser state
+        self.state = ParserState.INITIAL
+
+        # Buffers for accumulating content
+        self.line_buffer = ""           # For detecting delimiters across chunk boundaries
+        self.thought_buffer = ""        # Accumulated thought text
+        self.action_buffer = ""         # Action name
+        self.args_buffer = ""           # JSON arguments text
+        self.answer_buffer = ""         # Final answer text
+
+        # JSON parsing state for Args
+        self.brace_count = 0
+        self.args_started = False
+
+        # Result object (Action or Final)
+        self.result = None
+
+    def feed(self, chunk: str) -> None:
+        """
+        Feed a text chunk to the parser.
+
+        Args:
+            chunk: Text chunk from LLM stream
+        """
+        if self.state == ParserState.COMPLETE:
+            return  # Already complete, ignore further chunks
+
+        # Add chunk to line buffer for delimiter detection
+        self.line_buffer += chunk
+
+        # Remove markdown code blocks if present
+        self.line_buffer = re.sub(r'^```[^\n]*\n', '', self.line_buffer)
+        self.line_buffer = re.sub(r'\n```$', '', self.line_buffer)
+
+        # Process based on current state
+        while self.line_buffer and self.state != ParserState.COMPLETE:
+            if self.state == ParserState.INITIAL:
+                self._process_initial()
+            elif self.state == ParserState.THOUGHT:
+                self._process_thought()
+            elif self.state == ParserState.ACTION:
+                self._process_action()
+            elif self.state == ParserState.ARGS:
+                self._process_args()
+            elif self.state == ParserState.FINAL_ANSWER:
+                self._process_final_answer()
+
+    def _process_initial(self) -> None:
+        """Process INITIAL state - looking for 'Thought:' delimiter"""
+        idx = self.line_buffer.find(self.THOUGHT_DELIMITER)
+
+        if idx >= 0:
+            # Found thought delimiter
+            # Discard any content before it
+            self.line_buffer = self.line_buffer[idx + len(self.THOUGHT_DELIMITER):]
+            self.state = ParserState.THOUGHT
+        elif len(self.line_buffer) >= self.MAX_DELIMITER_BUFFER:
+            # Buffer getting too large, probably junk before thought
+            # Keep only the tail that might contain partial delimiter
+            self.line_buffer = self.line_buffer[-self.MAX_DELIMITER_BUFFER:]
+
+    def _process_thought(self) -> None:
+        """Process THOUGHT state - accumulating thought content"""
+        # Check for Action or Final Answer delimiter
+        action_idx = self.line_buffer.find(self.ACTION_DELIMITER)
+        final_idx = self.line_buffer.find(self.FINAL_ANSWER_DELIMITER)
+
+        # Find which delimiter comes first (if any)
+        next_delimiter_idx = -1
+        next_state = None
+
+        if action_idx >= 0 and (final_idx < 0 or action_idx < final_idx):
+            next_delimiter_idx = action_idx
+            next_state = ParserState.ACTION
+            delimiter_len = len(self.ACTION_DELIMITER)
+        elif final_idx >= 0:
+            next_delimiter_idx = final_idx
+            next_state = ParserState.FINAL_ANSWER
+            delimiter_len = len(self.FINAL_ANSWER_DELIMITER)
+
+        if next_delimiter_idx >= 0:
+            # Found next delimiter
+            thought_chunk = self.line_buffer[:next_delimiter_idx].strip()
+            if thought_chunk:
+                self.thought_buffer += thought_chunk
+                if self.on_thought_chunk:
+                    self.on_thought_chunk(thought_chunk)
+
+            self.line_buffer = self.line_buffer[next_delimiter_idx + delimiter_len:]
+            self.state = next_state
+        else:
+            # No delimiter found yet
+            # Keep tail in buffer (might contain partial delimiter)
+            # Emit the rest as thought chunk
+            if len(self.line_buffer) > self.MAX_DELIMITER_BUFFER:
+                emittable = self.line_buffer[:-self.MAX_DELIMITER_BUFFER]
+                self.thought_buffer += emittable
+                if self.on_thought_chunk:
+                    self.on_thought_chunk(emittable)
+                self.line_buffer = self.line_buffer[-self.MAX_DELIMITER_BUFFER:]
+
+    def _process_action(self) -> None:
+        """Process ACTION state - collecting action name"""
+        # Action name is on one line (or at least until newline or Args:)
+        newline_idx = self.line_buffer.find('\n')
+        args_idx = self.line_buffer.find(self.ARGS_DELIMITER)
+
+        # Find which comes first
+        if args_idx >= 0 and (newline_idx < 0 or args_idx < newline_idx):
+            # Args delimiter found first
+            self.action_buffer = self.line_buffer[:args_idx].strip().strip('"')
+            self.line_buffer = self.line_buffer[args_idx + len(self.ARGS_DELIMITER):]
+            self.state = ParserState.ARGS
+        elif newline_idx >= 0:
+            # Newline found, action name complete
+            self.action_buffer = self.line_buffer[:newline_idx].strip().strip('"')
+            self.line_buffer = self.line_buffer[newline_idx + 1:]
+            # Stay in ACTION state or move to ARGS if we find delimiter
+            # Actually, check if next line has Args:
+            if self.line_buffer.lstrip().startswith(self.ARGS_DELIMITER):
+                args_start = self.line_buffer.find(self.ARGS_DELIMITER)
+                self.line_buffer = self.line_buffer[args_start + len(self.ARGS_DELIMITER):]
+                self.state = ParserState.ARGS
+        else:
+            # Not enough content yet, keep buffering
+            # But if buffer is getting large, action name is probably complete
+            if len(self.line_buffer) > 100:
+                self.action_buffer = self.line_buffer.strip().strip('"')
+                self.line_buffer = ""
+                # Assume Args comes next, but we need more content
+                self.state = ParserState.ARGS
+
+    def _process_args(self) -> None:
+        """Process ARGS state - collecting JSON arguments"""
+        # Process character by character to track brace matching
+        i = 0
+        while i < len(self.line_buffer):
+            char = self.line_buffer[i]
+            self.args_buffer += char
+
+            if char == '{':
+                self.brace_count += 1
+                self.args_started = True
+            elif char == '}':
+                self.brace_count -= 1
+
+            # Check if JSON is complete
+            if self.args_started and self.brace_count == 0:
+                # JSON complete, try to parse
+                try:
+                    args_dict = json.loads(self.args_buffer.strip())
+                    # Success! Create Action result
+                    self.result = Action(
+                        thought=self.thought_buffer.strip(),
+                        name=self.action_buffer,
+                        arguments=args_dict,
+                        observation=""
+                    )
+                    self.state = ParserState.COMPLETE
+                    self.line_buffer = ""  # Clear buffer
+                    return
+                except json.JSONDecodeError as e:
+                    logger.error(f"Failed to parse JSON args: {self.args_buffer}")
+                    raise ValueError(f"Invalid JSON in Args: {e}")
+
+            i += 1
+
+        # Consumed entire buffer, clear it and wait for more chunks
+        self.line_buffer = ""
+
+    def _process_final_answer(self) -> None:
+        """Process FINAL_ANSWER state - collecting final answer"""
+        # For final answer, we consume everything until we decide we're done
+        # In streaming mode, we can't know when answer is complete until stream ends
+        # So we emit chunks and accumulate
+
+        # Check if this might be JSON
+        is_json = self.answer_buffer.strip().startswith('{') or \
+                  self.line_buffer.strip().startswith('{')
+
+        if is_json:
+            # Handle JSON final answer
+            self.answer_buffer += self.line_buffer
+
+            # Count braces to detect completion
+            brace_count = self.answer_buffer.count('{') - self.answer_buffer.count('}')
+
+            if brace_count == 0 and '{' in self.answer_buffer:
+                # JSON might be complete
+                # Note: We can't be 100% sure without trying to parse
+                # But in streaming mode, we'll finish when stream ends
+                pass
+
+            # Emit chunk
+            if self.on_answer_chunk:
+                self.on_answer_chunk(self.line_buffer)
+
+            self.line_buffer = ""
+        else:
+            # Regular text answer - emit everything
+            if self.line_buffer:
+                self.answer_buffer += self.line_buffer
+                if self.on_answer_chunk:
+                    self.on_answer_chunk(self.line_buffer)
+                self.line_buffer = ""
+
+    def finalize(self) -> None:
+        """
+        Call this when the stream is complete to finalize parsing.
+        This handles any remaining buffered content.
+        """
+        if self.state == ParserState.COMPLETE:
+            return
+
+        # Flush any remaining thought chunks
+        if self.state == ParserState.THOUGHT and self.line_buffer:
+            self.thought_buffer += self.line_buffer
+            if self.on_thought_chunk:
+                self.on_thought_chunk(self.line_buffer)
+            self.line_buffer = ""
+
+        # Finalize final answer
+        if self.state == ParserState.FINAL_ANSWER:
+            # Flush any remaining answer content
+            if self.line_buffer:
+                self.answer_buffer += self.line_buffer
+                if self.on_answer_chunk:
+                    self.on_answer_chunk(self.line_buffer)
+                self.line_buffer = ""
+
+            # Create Final result
+            self.result = Final(
+                thought=self.thought_buffer.strip(),
+                final=self.answer_buffer.strip()
+            )
+            self.state = ParserState.COMPLETE
+
+        # If we're in other states, something went wrong
+        if self.state not in [ParserState.COMPLETE, ParserState.FINAL_ANSWER]:
+            if self.thought_buffer:
+                raise ValueError(
+                    f"Stream ended in {self.state.value} state with incomplete parsing. "
+                    f"Thought: {self.thought_buffer[:100]}..."
+                )
+            else:
+                raise ValueError(f"Stream ended in {self.state.value} state with no content")
+
+    def is_complete(self) -> bool:
+        """Check if parsing is complete"""
+        return self.state == ParserState.COMPLETE
+
+    def get_result(self) -> Optional[Action | Final]:
+        """Get the parsed result (Action or Final)"""
+        return self.result
--- a/trustgraph-flow/trustgraph/librarian/blob_store.py
+++ b/trustgraph-flow/trustgraph/librarian/blob_store.py
@ -34,9 +34,9 @@ class BlobStore:
    def ensure_bucket(self):

        # Make the bucket if it doesn't exist.
-        found = self.minio.bucket_exists(self.bucket_name)
+        found = self.minio.bucket_exists(bucket_name=self.bucket_name)
        if not found:
-            self.minio.make_bucket(self.bucket_name)
+            self.minio.make_bucket(bucket_name=self.bucket_name)
            logger.info(f"Created bucket {self.bucket_name}")
        else:
            logger.debug(f"Bucket {self.bucket_name} already exists")
--- a/trustgraph-flow/trustgraph/model/text_completion/azure/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/azure/llm.py
@ -11,7 +11,7 @@ import os
 import logging

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 # Module logger
 logger = logging.getLogger(__name__)
@ -55,7 +55,7 @@ class Processor(LlmService):
        self.max_output = max_output
        self.default_model = model

-    def build_prompt(self, system, content, temperature=None):
+    def build_prompt(self, system, content, temperature=None, stream=False):
        # Use provided temperature or fall back to default
        effective_temperature = temperature if temperature is not None else self.temperature

@ -73,6 +73,9 @@ class Processor(LlmService):
            "top_p": 1
        }

+        if stream:
+            data["stream"] = True
+
        body = json.dumps(data)

        return body
@ -157,6 +160,84 @@ class Processor(LlmService):

        logger.debug("Azure LLM processing complete")

+    def supports_streaming(self):
+        """Azure serverless endpoints support streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from Azure serverless endpoint"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        try:
+            body = self.build_prompt(system, prompt, effective_temperature, stream=True)
+
+            url = self.endpoint
+            api_key = self.token
+
+            headers = {
+                'Content-Type': 'application/json',
+                'Authorization': f'Bearer {api_key}'
+            }
+
+            response = requests.post(url, data=body, headers=headers, stream=True)
+
+            if response.status_code == 429:
+                raise TooManyRequests()
+
+            if response.status_code != 200:
+                raise RuntimeError("LLM failure")
+
+            # Parse SSE stream
+            for line in response.iter_lines():
+                if line:
+                    line = line.decode('utf-8').strip()
+                    if line.startswith('data: '):
+                        data = line[6:]  # Remove 'data: ' prefix
+
+                        if data == '[DONE]':
+                            break
+
+                        try:
+                            chunk_data = json.loads(data)
+
+                            if 'choices' in chunk_data and len(chunk_data['choices']) > 0:
+                                delta = chunk_data['choices'][0].get('delta', {})
+                                content = delta.get('content')
+                                if content:
+                                    yield LlmChunk(
+                                        text=content,
+                                        in_token=None,
+                                        out_token=None,
+                                        model=model_name,
+                                        is_final=False
+                                    )
+                        except json.JSONDecodeError:
+                            logger.warning(f"Failed to parse chunk: {data}")
+                            continue
+
+            # Send final chunk
+            yield LlmChunk(
+                text="",
+                in_token=None,
+                out_token=None,
+                model=model_name,
+                is_final=True
+            )
+
+            logger.debug("Streaming complete")
+
+        except TooManyRequests:
+            logger.warning("Rate limit exceeded during streaming")
+            raise TooManyRequests()
+
+        except Exception as e:
+            logger.error(f"Azure streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/azure_openai/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/azure_openai/llm.py
@ -14,7 +14,7 @@ import logging
 logger = logging.getLogger(__name__)

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 default_ident = "text-completion"

@ -125,6 +125,75 @@ class Processor(LlmService):

        logger.debug("Azure OpenAI LLM processing complete")

+    def supports_streaming(self):
+        """Azure OpenAI supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """
+        Stream content generation from Azure OpenAI.
+        Yields LlmChunk objects with is_final=True on the last chunk.
+        """
+        # Use provided model or fall back to default
+        model_name = model or self.default_model
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        prompt = system + "\n\n" + prompt
+
+        try:
+            response = self.openai.chat.completions.create(
+                model=model_name,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": prompt
+                            }
+                        ]
+                    }
+                ],
+                temperature=effective_temperature,
+                max_tokens=self.max_output,
+                top_p=1,
+                stream=True  # Enable streaming
+            )
+
+            # Stream chunks
+            for chunk in response:
+                if chunk.choices and chunk.choices[0].delta.content:
+                    yield LlmChunk(
+                        text=chunk.choices[0].delta.content,
+                        in_token=None,
+                        out_token=None,
+                        model=model_name,
+                        is_final=False
+                    )
+
+            # Send final chunk
+            yield LlmChunk(
+                text="",
+                in_token=None,
+                out_token=None,
+                model=model_name,
+                is_final=True
+            )
+
+            logger.debug("Streaming complete")
+
+        except RateLimitError:
+            logger.warning("Rate limit exceeded during streaming")
+            raise TooManyRequests()
+
+        except Exception as e:
+            logger.error(f"Azure OpenAI streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/claude/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/claude/llm.py
@ -9,7 +9,7 @@ import os
 import logging

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 # Module logger
 logger = logging.getLogger(__name__)
@ -106,6 +106,65 @@ class Processor(LlmService):
            logger.error(f"Claude LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """Claude/Anthropic supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from Claude"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        try:
+            with self.claude.messages.stream(
+                model=model_name,
+                max_tokens=self.max_output,
+                temperature=effective_temperature,
+                system=system,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": prompt
+                            }
+                        ]
+                    }
+                ]
+            ) as stream:
+                for text in stream.text_stream:
+                    yield LlmChunk(
+                        text=text,
+                        in_token=None,
+                        out_token=None,
+                        model=model_name,
+                        is_final=False
+                    )
+
+                # Get final message for token counts
+                final_message = stream.get_final_message()
+                yield LlmChunk(
+                    text="",
+                    in_token=final_message.usage.input_tokens,
+                    out_token=final_message.usage.output_tokens,
+                    model=model_name,
+                    is_final=True
+                )
+
+            logger.debug("Streaming complete")
+
+        except anthropic.RateLimitError:
+            logger.warning("Rate limit exceeded during streaming")
+            raise TooManyRequests()
+
+        except Exception as e:
+            logger.error(f"Claude streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/cohere/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/cohere/llm.py
@ -13,7 +13,7 @@ import logging
 logger = logging.getLogger(__name__)

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 default_ident = "text-completion"

@ -98,6 +98,68 @@ class Processor(LlmService):
            logger.error(f"Cohere LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """Cohere supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from Cohere"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        try:
+            stream = self.cohere.chat_stream(
+                model=model_name,
+                message=prompt,
+                preamble=system,
+                temperature=effective_temperature,
+                chat_history=[],
+                prompt_truncation='auto',
+                connectors=[]
+            )
+
+            total_input_tokens = 0
+            total_output_tokens = 0
+
+            for event in stream:
+                if event.event_type == "text-generation":
+                    if hasattr(event, 'text') and event.text:
+                        yield LlmChunk(
+                            text=event.text,
+                            in_token=None,
+                            out_token=None,
+                            model=model_name,
+                            is_final=False
+                        )
+                elif event.event_type == "stream-end":
+                    # Extract token counts from final event
+                    if hasattr(event, 'response') and hasattr(event.response, 'meta'):
+                        if hasattr(event.response.meta, 'billed_units'):
+                            total_input_tokens = int(event.response.meta.billed_units.input_tokens)
+                            total_output_tokens = int(event.response.meta.billed_units.output_tokens)
+
+            # Send final chunk with token counts
+            yield LlmChunk(
+                text="",
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
+                model=model_name,
+                is_final=True
+            )
+
+            logger.debug("Streaming complete")
+
+        except cohere.TooManyRequestsError:
+            logger.warning("Rate limit exceeded during streaming")
+            raise TooManyRequests()
+
+        except Exception as e:
+            logger.error(f"Cohere streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py
@ -23,7 +23,7 @@ import logging
 logger = logging.getLogger(__name__)

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 default_ident = "text-completion"

@ -159,6 +159,67 @@ class Processor(LlmService):
            logger.error(f"GoogleAIStudio LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """Google AI Studio supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from Google AI Studio"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        generation_config = self._get_or_create_config(model_name, effective_temperature)
+        generation_config.system_instruction = system
+
+        try:
+            response = self.client.models.generate_content_stream(
+                model=model_name,
+                config=generation_config,
+                contents=prompt,
+            )
+
+            total_input_tokens = 0
+            total_output_tokens = 0
+
+            for chunk in response:
+                if hasattr(chunk, 'text') and chunk.text:
+                    yield LlmChunk(
+                        text=chunk.text,
+                        in_token=None,
+                        out_token=None,
+                        model=model_name,
+                        is_final=False
+                    )
+
+                # Accumulate token counts if available
+                if hasattr(chunk, 'usage_metadata'):
+                    if hasattr(chunk.usage_metadata, 'prompt_token_count'):
+                        total_input_tokens = int(chunk.usage_metadata.prompt_token_count)
+                    if hasattr(chunk.usage_metadata, 'candidates_token_count'):
+                        total_output_tokens = int(chunk.usage_metadata.candidates_token_count)
+
+            # Send final chunk with token counts
+            yield LlmChunk(
+                text="",
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
+                model=model_name,
+                is_final=True
+            )
+
+            logger.debug("Streaming complete")
+
+        except ResourceExhausted:
+            logger.warning("Rate limit exceeded during streaming")
+            raise TooManyRequests()
+
+        except Exception as e:
+            logger.error(f"GoogleAIStudio streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
@ -12,7 +12,7 @@ import logging
 logger = logging.getLogger(__name__)

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 default_ident = "text-completion"

@ -102,6 +102,57 @@ class Processor(LlmService):
            logger.error(f"Llamafile LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """LlamaFile supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from LlamaFile"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        prompt = system + "\n\n" + prompt
+
+        try:
+            response = self.openai.chat.completions.create(
+                model=model_name,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=effective_temperature,
+                max_tokens=self.max_output,
+                top_p=1,
+                frequency_penalty=0,
+                presence_penalty=0,
+                response_format={"type": "text"},
+                stream=True
+            )
+
+            for chunk in response:
+                if chunk.choices and chunk.choices[0].delta.content:
+                    yield LlmChunk(
+                        text=chunk.choices[0].delta.content,
+                        in_token=None,
+                        out_token=None,
+                        model=model_name,
+                        is_final=False
+                    )
+
+            yield LlmChunk(
+                text="",
+                in_token=None,
+                out_token=None,
+                model=model_name,
+                is_final=True
+            )
+
+            logger.debug("Streaming complete")
+
+        except Exception as e:
+            logger.error(f"LlamaFile streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
@ -12,7 +12,7 @@ import logging
 logger = logging.getLogger(__name__)

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 default_ident = "text-completion"

@ -106,6 +106,57 @@ class Processor(LlmService):
            logger.error(f"LMStudio LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """LM Studio supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from LM Studio"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        prompt = system + "\n\n" + prompt
+
+        try:
+            response = self.openai.chat.completions.create(
+                model=model_name,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=effective_temperature,
+                max_tokens=self.max_output,
+                top_p=1,
+                frequency_penalty=0,
+                presence_penalty=0,
+                response_format={"type": "text"},
+                stream=True
+            )
+
+            for chunk in response:
+                if chunk.choices and chunk.choices[0].delta.content:
+                    yield LlmChunk(
+                        text=chunk.choices[0].delta.content,
+                        in_token=None,
+                        out_token=None,
+                        model=model_name,
+                        is_final=False
+                    )
+
+            yield LlmChunk(
+                text="",
+                in_token=None,
+                out_token=None,
+                model=model_name,
+                is_final=True
+            )
+
+            logger.debug("Streaming complete")
+
+        except Exception as e:
+            logger.error(f"LMStudio streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
@ -12,7 +12,7 @@ import logging
 logger = logging.getLogger(__name__)

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 default_ident = "text-completion"

@ -120,6 +120,67 @@ class Processor(LlmService):
            logger.error(f"Mistral LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """Mistral supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from Mistral"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        prompt = system + "\n\n" + prompt
+
+        try:
+            stream = self.mistral.chat.stream(
+                model=model_name,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": prompt
+                            }
+                        ]
+                    }
+                ],
+                temperature=effective_temperature,
+                max_tokens=self.max_output,
+                top_p=1,
+                frequency_penalty=0,
+                presence_penalty=0,
+                response_format={"type": "text"}
+            )
+
+            for chunk in stream:
+                if chunk.data.choices and chunk.data.choices[0].delta.content:
+                    yield LlmChunk(
+                        text=chunk.data.choices[0].delta.content,
+                        in_token=None,
+                        out_token=None,
+                        model=model_name,
+                        is_final=False
+                    )
+
+            # Send final chunk
+            yield LlmChunk(
+                text="",
+                in_token=None,
+                out_token=None,
+                model=model_name,
+                is_final=True
+            )
+
+            logger.debug("Streaming complete")
+
+        except Exception as e:
+            logger.error(f"Mistral streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py
@ -12,7 +12,7 @@ import logging
 logger = logging.getLogger(__name__)

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 default_ident = "text-completion"

@ -79,6 +79,62 @@ class Processor(LlmService):
            logger.error(f"Ollama LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """Ollama supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from Ollama"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        prompt = system + "\n\n" + prompt
+
+        try:
+            stream = self.llm.generate(
+                model_name,
+                prompt,
+                options={'temperature': effective_temperature},
+                stream=True
+            )
+
+            total_input_tokens = 0
+            total_output_tokens = 0
+
+            for chunk in stream:
+                if 'response' in chunk and chunk['response']:
+                    yield LlmChunk(
+                        text=chunk['response'],
+                        in_token=None,
+                        out_token=None,
+                        model=model_name,
+                        is_final=False
+                    )
+
+                # Accumulate token counts if available
+                if 'prompt_eval_count' in chunk:
+                    total_input_tokens = int(chunk['prompt_eval_count'])
+                if 'eval_count' in chunk:
+                    total_output_tokens = int(chunk['eval_count'])
+
+            # Send final chunk with token counts
+            yield LlmChunk(
+                text="",
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
+                model=model_name,
+                is_final=True
+            )
+
+            logger.debug("Streaming complete")
+
+        except Exception as e:
+            logger.error(f"Ollama streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
@ -9,7 +9,7 @@ import os
 import logging

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 # Module logger
 logger = logging.getLogger(__name__)
@ -118,6 +118,75 @@ class Processor(LlmService):
            logger.error(f"OpenAI LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """OpenAI supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """
+        Stream content generation from OpenAI.
+        Yields LlmChunk objects with is_final=True on the last chunk.
+        """
+        # Use provided model or fall back to default
+        model_name = model or self.default_model
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        prompt = system + "\n\n" + prompt
+
+        try:
+            response = self.openai.chat.completions.create(
+                model=model_name,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": prompt
+                            }
+                        ]
+                    }
+                ],
+                temperature=effective_temperature,
+                max_tokens=self.max_output,
+                stream=True  # Enable streaming
+            )
+
+            # Stream chunks
+            for chunk in response:
+                if chunk.choices and chunk.choices[0].delta.content:
+                    yield LlmChunk(
+                        text=chunk.choices[0].delta.content,
+                        in_token=None,
+                        out_token=None,
+                        model=model_name,
+                        is_final=False
+                    )
+
+            # Note: OpenAI doesn't provide token counts in streaming mode
+            # Send final chunk without token counts
+            yield LlmChunk(
+                text="",
+                in_token=None,
+                out_token=None,
+                model=model_name,
+                is_final=True
+            )
+
+            logger.debug("Streaming complete")
+
+        except RateLimitError:
+            logger.warning("Hit rate limit during streaming")
+            raise TooManyRequests()
+
+        except Exception as e:
+            logger.error(f"OpenAI streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py
@ -12,7 +12,7 @@ import logging
 logger = logging.getLogger(__name__)

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 default_ident = "text-completion"

@ -121,6 +121,100 @@ class Processor(LlmService):
            logger.error(f"TGI LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """TGI supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from TGI"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        headers = {
+            "Content-Type": "application/json",
+        }
+
+        request = {
+            "model": model_name,
+            "messages": [
+                {
+                    "role": "system",
+                    "content": system,
+                },
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            "max_tokens": self.max_output,
+            "temperature": effective_temperature,
+            "stream": True,
+        }
+
+        try:
+            url = f"{self.base_url}/chat/completions"
+
+            async with self.session.post(
+                    url,
+                    headers=headers,
+                    json=request,
+            ) as response:
+
+                if response.status != 200:
+                    raise RuntimeError("Bad status: " + str(response.status))
+
+                # Parse SSE stream
+                async for line in response.content:
+                    line = line.decode('utf-8').strip()
+
+                    if not line:
+                        continue
+
+                    if line.startswith('data: '):
+                        data = line[6:]  # Remove 'data: ' prefix
+
+                        if data == '[DONE]':
+                            break
+
+                        try:
+                            import json
+                            chunk_data = json.loads(data)
+
+                            # Extract text from chunk
+                            if 'choices' in chunk_data and len(chunk_data['choices']) > 0:
+                                choice = chunk_data['choices'][0]
+                                if 'delta' in choice and 'content' in choice['delta']:
+                                    content = choice['delta']['content']
+                                    if content:
+                                        yield LlmChunk(
+                                            text=content,
+                                            in_token=None,
+                                            out_token=None,
+                                            model=model_name,
+                                            is_final=False
+                                        )
+                        except json.JSONDecodeError:
+                            logger.warning(f"Failed to parse chunk: {data}")
+                            continue
+
+                # Send final chunk
+                yield LlmChunk(
+                    text="",
+                    in_token=None,
+                    out_token=None,
+                    model=model_name,
+                    is_final=True
+                )
+
+                logger.debug("Streaming complete")
+
+        except Exception as e:
+            logger.error(f"TGI streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py
@ -12,7 +12,7 @@ import logging
 logger = logging.getLogger(__name__)

 from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk

 default_ident = "text-completion"

@ -113,6 +113,89 @@ class Processor(LlmService):
            logger.error(f"vLLM LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

+    def supports_streaming(self):
+        """vLLM supports streaming"""
+        return True
+
+    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
+        """Stream content generation from vLLM"""
+        model_name = model or self.default_model
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        logger.debug(f"Using model (streaming): {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
+
+        headers = {
+            "Content-Type": "application/json",
+        }
+
+        request = {
+            "model": model_name,
+            "prompt": system + "\n\n" + prompt,
+            "max_tokens": self.max_output,
+            "temperature": effective_temperature,
+            "stream": True,
+        }
+
+        try:
+            url = f"{self.base_url}/completions"
+
+            async with self.session.post(
+                    url,
+                    headers=headers,
+                    json=request,
+            ) as response:
+
+                if response.status != 200:
+                    raise RuntimeError("Bad status: " + str(response.status))
+
+                # Parse SSE stream
+                async for line in response.content:
+                    line = line.decode('utf-8').strip()
+
+                    if not line:
+                        continue
+
+                    if line.startswith('data: '):
+                        data = line[6:]  # Remove 'data: ' prefix
+
+                        if data == '[DONE]':
+                            break
+
+                        try:
+                            import json
+                            chunk_data = json.loads(data)
+
+                            # Extract text from chunk
+                            if 'choices' in chunk_data and len(chunk_data['choices']) > 0:
+                                choice = chunk_data['choices'][0]
+                                if 'text' in choice and choice['text']:
+                                    yield LlmChunk(
+                                        text=choice['text'],
+                                        in_token=None,
+                                        out_token=None,
+                                        model=model_name,
+                                        is_final=False
+                                    )
+                        except json.JSONDecodeError:
+                            logger.warning(f"Failed to parse chunk: {data}")
+                            continue
+
+                # Send final chunk
+                yield LlmChunk(
+                    text="",
+                    in_token=None,
+                    out_token=None,
+                    model=model_name,
+                    is_final=True
+                )
+
+                logger.debug("Streaming complete")
+
+        except Exception as e:
+            logger.error(f"vLLM streaming exception ({type(e).__name__}): {e}", exc_info=True)
+            raise e
+
    @staticmethod
    def add_args(parser):

--- a/trustgraph-flow/trustgraph/prompt/template/service.py
+++ b/trustgraph-flow/trustgraph/prompt/template/service.py
@ -101,6 +101,9 @@ class Processor(FlowProcessor):

        kind = v.id

+        # Check if streaming is requested
+        streaming = getattr(v, 'streaming', False)
+
        try:

            logger.debug(f"Prompt terms: {v.terms}")
@ -109,16 +112,68 @@ class Processor(FlowProcessor):
                k: json.loads(v)
                for k, v in v.terms.items()
            }
-            
-            logger.debug(f"Handling prompt kind {kind}...")

+            logger.debug(f"Handling prompt kind {kind}... (streaming={streaming})")
+
+            # If streaming, we need to handle it differently
+            if streaming:
+                # For streaming, we need to intercept LLM responses
+                # and forward them as they arrive
+
+                async def llm_streaming(system, prompt):
+                    logger.debug(f"System prompt: {system}")
+                    logger.debug(f"User prompt: {prompt}")
+
+                    # Use the text completion client with recipient handler
+                    client = flow("text-completion-request")
+
+                    async def forward_chunks(resp):
+                        if resp.error:
+                            raise RuntimeError(resp.error.message)
+
+                        is_final = getattr(resp, 'end_of_stream', False)
+
+                        # Always send a message if there's content OR if it's the final message
+                        if resp.response or is_final:
+                            # Forward each chunk immediately
+                            r = PromptResponse(
+                                text=resp.response if resp.response else "",
+                                object=None,
+                                error=None,
+                                end_of_stream=is_final,
+                            )
+                            await flow("response").send(r, properties={"id": id})
+
+                        # Return True when end_of_stream
+                        return is_final
+
+                    await client.request(
+                        TextCompletionRequest(
+                            system=system, prompt=prompt, streaming=True
+                        ),
+                        recipient=forward_chunks,
+                        timeout=600
+                    )
+
+                    # Return empty string since we already sent all chunks
+                    return ""
+
+                try:
+                    await self.manager.invoke(kind, input, llm_streaming)
+                except Exception as e:
+                    logger.error(f"Prompt streaming exception: {e}", exc_info=True)
+                    raise e
+
+                return
+
+            # Non-streaming path (original behavior)
            async def llm(system, prompt):

                logger.debug(f"System prompt: {system}")
                logger.debug(f"User prompt: {prompt}")

                resp = await flow("text-completion-request").text_completion(
-                    system = system, prompt = prompt,
+                    system = system, prompt = prompt, streaming = False,
                )

                try:
@ -143,6 +198,7 @@ class Processor(FlowProcessor):
                    text=resp,
                    object=None,
                    error=None,
+                    end_of_stream=True,
                )

                await flow("response").send(r, properties={"id": id})
@ -158,6 +214,7 @@ class Processor(FlowProcessor):
                    text=None,
                    object=json.dumps(resp),
                    error=None,
+                    end_of_stream=True,
                )

                await flow("response").send(r, properties={"id": id})
@ -175,27 +232,13 @@ class Processor(FlowProcessor):
                    type = "llm-error",
                    message = str(e),
                ),
-                response=None,
+                text=None,
+                object=None,
+                end_of_stream=True,
            )

            await flow("response").send(r, properties={"id": id})

-        except Exception as e:
-
-            logger.error(f"Prompt service exception: {e}", exc_info=True)
-
-            logger.debug("Sending error response...")
-
-            r = PromptResponse(
-                error=Error(
-                    type = "llm-error",
-                    message = str(e),
-                ),
-                response=None,
-            )
-
-            await self.send(r, properties={"id": id})
-
    @staticmethod
    def add_args(parser):