feat: merge geekan:dev

2026-07-05 16:02:14 +02:00 · 2024-02-02 16:47:52 +08:00 · 2024-02-02 16:47:52 +08:00 · dadd09bfb5
commit dadd09bfb5
parent 1b0dfbc846 0118712ff8
105 changed files with 5201 additions and 350 deletions
--- a/metagpt/provider/base_llm.py
+++ b/metagpt/provider/base_llm.py
@ -39,8 +39,26 @@ class BaseLLM(ABC):
    def __init__(self, config: LLMConfig):
        pass

-    def _user_msg(self, msg: str) -> dict[str, str]:
-        return {"role": "user", "content": msg}
+    def _user_msg(self, msg: str, images: Optional[Union[str, list[str]]] = None) -> dict[str, Union[str, dict]]:
+        """Build a user-role chat message, attaching images when any are supplied."""
+        if not images:
+            # No images (None, empty string or empty list): plain-text message.
+            return {"role": "user", "content": msg}
+        # Image input (e.g. for gpt-4v): delegate to the multi-part message builder.
+        return self._user_msg_with_imgs(msg, images)
+
+    def _user_msg_with_imgs(
+        self, msg: str, images: Optional[Union[str, list[str]]]
+    ) -> dict[str, Union[str, list[dict]]]:
+        """Build a user-role message whose content combines text with images.
+
+        Args:
+            msg: The text part of the prompt.
+            images: A single image or a list of images. Each item is an http(s)
+                URL, a ready-made ``data:`` URL, or a bare base64 payload.
+                ``None`` produces a multi-part message with only the text part.
+
+        Returns:
+            ``{"role": "user", "content": [...]}`` where the first part is the
+            text and every following part is an ``image_url`` entry.
+        """
+        if images is None:
+            # The annotation allows None; treat it as "no images" instead of crashing.
+            images = []
+        elif isinstance(images, str):
+            images = [images]
+        content: list[dict] = [{"type": "text", "text": msg}]
+        # Multiple images are supported: one image_url part is appended per item.
+        for image in images:
+            # URLs and data URLs pass through unchanged; anything else is taken as base64.
+            # ":" is not in the base64 alphabet, so these prefixes cannot match a payload.
+            if image.startswith(("http://", "https://", "data:")):
+                url = image
+            else:
+                # NOTE(review): bare base64 is always labelled image/jpeg - confirm callers only pass JPEG.
+                url = f"data:image/jpeg;base64,{image}"
+            # The chat completions content part documents image_url as an object with a "url" key.
+            content.append({"type": "image_url", "image_url": {"url": url}})
+        return {"role": "user", "content": content}

    def _assistant_msg(self, msg: str) -> dict[str, str]:
        return {"role": "assistant", "content": msg}
@ -59,6 +77,7 @@ class BaseLLM(ABC):
        msg: str,
        system_msgs: Optional[list[str]] = None,
        format_msgs: Optional[list[dict[str, str]]] = None,
+        images: Optional[Union[str, list[str]]] = None,
        timeout=3,
        stream=True,
    ) -> str:
@ -70,7 +89,7 @@ class BaseLLM(ABC):
            message = []
        if format_msgs:
            message.extend(format_msgs)
-        message.append(self._user_msg(msg))
+        message.append(self._user_msg(msg, images=images))
        logger.debug(message)
        rsp = await self.acompletion_text(message, stream=stream, timeout=timeout)
        return rsp
--- a/metagpt/provider/openai_api.py
+++ b/metagpt/provider/openai_api.py
@ -29,6 +29,7 @@ from metagpt.provider.base_llm import BaseLLM
 from metagpt.provider.constant import GENERAL_FUNCTION_SCHEMA, GENERAL_TOOL_CHOICE
 from metagpt.provider.llm_provider_registry import register_provider
 from metagpt.schema import Message
+from metagpt.utils.common import decode_image
 from metagpt.utils.cost_manager import CostManager, Costs
 from metagpt.utils.exceptions import handle_exception
 from metagpt.utils.token_counter import (
@ -101,7 +102,7 @@ class OpenAILLM(BaseLLM):
            "messages": messages,
            "max_tokens": self._get_max_tokens(messages),
            "n": 1,
-            "stop": None,
+            # "stop": None,  # default it's None and gpt4-v can't have this one
            "temperature": 0.3,
            "model": self.model,
            "timeout": max(self.config.timeout, timeout),
@ -240,3 +241,24 @@ class OpenAILLM(BaseLLM):
    async def aspeech_to_text(self, **kwargs):
        """speech to text"""
        return await self.aclient.audio.transcriptions.create(**kwargs)
+
+    async def gen_image(
+        self,
+        prompt: str,
+        size: str = "1024x1024",
+        quality: str = "standard",
+        model: str = None,
+        resp_format: str = "url",
+    ) -> list["Image"]:
+        """Generate an image from a text prompt via the client's images endpoint.
+
+        Args:
+            prompt: Text description of the desired image.
+            size: Image size string forwarded to the API.
+            quality: Quality setting forwarded to the API.
+            model: Model name to use; falls back to ``self.model`` when empty.
+            resp_format: ``"url"`` or ``"b64_json"``, selecting how the API returns the image.
+
+        Returns:
+            One ``decode_image`` result per item in the response (a single image is requested).
+
+        Raises:
+            ValueError: If ``resp_format`` is not ``"url"`` or ``"b64_json"``.
+        """
+        if resp_format not in ("url", "b64_json"):
+            # Raise explicitly: an assert would be stripped when running under `python -O`.
+            raise ValueError(f"resp_format must be 'url' or 'b64_json', got {resp_format!r}")
+        # NOTE(review): self.model is presumably the chat model - confirm it is valid for image generation.
+        res = await self.aclient.images.generate(
+            model=model or self.model,
+            prompt=prompt,
+            size=size,
+            quality=quality,
+            n=1,
+            response_format=resp_format,
+        )
+        use_url = resp_format == "url"
+        # decode_image is handed either the image URL or the base64 payload, matching resp_format.
+        return [decode_image(item.url if use_url else item.b64_json) for item in res.data]