Feature/streaming llm phase 1 (#566)

* Tidy up duplicate tech specs in doc directory

* Streaming LLM text-completion service tech spec.

* Text-completion and prompt interfaces

* Streaming change applied to all LLMs; so far tested with VertexAI

* Skip Pinecone unit tests while an upstream module issue is affecting them; tests are passing again

* Added agent streaming; not yet working, and some tests are broken
Author: cybermaggedon, 2025-11-26 09:59:10 +00:00, committed by GitHub
parent 943a9d83b0
commit 310a2deb06
44 changed files with 2684 additions and 937 deletions
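
For context before the diff: the changed module now imports LlmChunk from the
shared base package, but the chunk type's definition is not part of this hunk.
The following is a minimal sketch inferred from how the streaming code below
constructs chunks; the real definition in the base module may differ.

    # Hypothetical sketch of LlmChunk, inferred from its usage in the diff
    # below; the actual definition lives in the package's base module.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class LlmChunk:
        text: str                  # incremental text delta; "" on the final chunk
        in_token: Optional[int]    # input token count, set only on the final chunk
        out_token: Optional[int]   # output token count, set only on the final chunk
        model: str                 # name of the model that produced the chunk
        is_final: bool             # True for the terminating usage-only chunk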


@@ -9,7 +9,7 @@ import os
import logging

from .... exceptions import TooManyRequests
-from .... base import LlmService, LlmResult
+from .... base import LlmService, LlmResult, LlmChunk
# Module logger
logger = logging.getLogger(__name__)
@@ -106,6 +106,65 @@ class Processor(LlmService):
            logger.error(f"Claude LLM exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

    def supports_streaming(self):
        """Claude/Anthropic supports streaming"""
        return True

    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
        """Stream content generation from Claude"""

        model_name = model or self.default_model
        effective_temperature = temperature if temperature is not None else self.temperature

        logger.debug(f"Using model (streaming): {model_name}")
        logger.debug(f"Using temperature: {effective_temperature}")

        try:

            with self.claude.messages.stream(
                model=model_name,
                max_tokens=self.max_output,
                temperature=effective_temperature,
                system=system,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            }
                        ]
                    }
                ]
            ) as stream:

                for text in stream.text_stream:
                    yield LlmChunk(
                        text=text,
                        in_token=None,
                        out_token=None,
                        model=model_name,
                        is_final=False
                    )

                # Get final message for token counts
                final_message = stream.get_final_message()

                yield LlmChunk(
                    text="",
                    in_token=final_message.usage.input_tokens,
                    out_token=final_message.usage.output_tokens,
                    model=model_name,
                    is_final=True
                )

            logger.debug("Streaming complete")

        except anthropic.RateLimitError:
            logger.warning("Rate limit exceeded during streaming")
            raise TooManyRequests()

        except Exception as e:
            logger.error(f"Claude streaming exception ({type(e).__name__}): {e}", exc_info=True)
            raise e

    @staticmethod
    def add_args(parser):
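
A minimal sketch of how a caller might consume the new streaming interface,
assuming an already-constructed Processor instance (construction and
configuration are outside the scope of this hunk):

    # Hypothetical consumer of generate_content_stream(); "processor" is an
    # assumed, already-configured Processor instance.
    import asyncio

    async def run(processor):
        out_tokens = None
        async for chunk in processor.generate_content_stream(
            system="You are a helpful assistant.",
            prompt="Explain streaming LLM responses in one sentence.",
        ):
            if chunk.is_final:
                # The final chunk carries usage counts instead of text
                out_tokens = chunk.out_token
            else:
                print(chunk.text, end="", flush=True)
        print(f"\n[output tokens: {out_tokens}]")

    # asyncio.run(run(processor))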