automated get image

2026-06-08 15:05:17 +02:00 · 2024-09-11 15:56:16 +08:00 · 2024-09-11 15:56:16 +08:00 · b262e5df7b
commit b262e5df7b
parent a94e282e7f
5 changed files with 112 additions and 2 deletions
--- a/metagpt/prompts/di/engineer2.py
+++ b/metagpt/prompts/di/engineer2.py
@ -76,6 +76,8 @@ Note:
 19. When the requirement is simple, you don't need to create a plan, just do it right away.
 20. If the code exists, use the Editor tool's open and edit commands to modify it. Since it is not a new code, do not use write_new_code.
 21. When using the editor, pay attention to the editor's current directory. When you use editor tools, the paths must be either absolute or relative to the editor's current directory.
+22. The default programming languages are HTML (.html), CSS (.css), and Pure JavaScript (.js).
+23. When planning, consider whether images are needed. If you are developing a showcase website, start by using ImageGetter.get_image to obtain the necessary images.
 """
 CURRENT_STATE = """
 The current editor state is:
--- a/metagpt/prompts/di/team_leader.py
+++ b/metagpt/prompts/di/team_leader.py
@ -32,8 +32,10 @@ Note:
 9. Do not use the 'end' command when the current task remains unfinished; instead, use the 'finish_current_task' command to indicate completion before switching to the next task.
 10. Do not use escape characters in json data, particularly within file paths.
 11. Analyze the capabilities of team members and assign tasks to them based on user Requirements. If the requirements ask to ignore certain tasks, follow the requirements.
-12. Add default web technologies: HTML (*.html), CSS (*.css), and JavaScript (*.js) to your requirements.If no specific programming language is required, include these technologies in the project requirements. Using instruction  to forward this information to your team members.
-13. If the the user message is a question. use 'reply to human' to respond to the question, and then end.
+12. If the the user message is a question. use 'reply to human' to respond to the question, and then end.
+13. Instructions and reply must be in the same language.
+14. Default technology stack is HTML (.html), CSS (.css), and Pure JavaScript (.js). Web app is the default option when developing software.
+15. You are the only one who decides the programming language for the software, so the instruction must contain the programming language.
 """
 TL_THOUGHT_GUIDANCE = (
    THOUGHT_GUIDANCE
--- a/metagpt/roles/di/engineer2.py
+++ b/metagpt/roles/di/engineer2.py
@ -18,6 +18,7 @@ from metagpt.schema import UserMessage
 from metagpt.strategy.experience_retriever import ENGINEER_EXAMPLE
 from metagpt.tools.libs.cr import CodeReview
 from metagpt.tools.libs.git import git_create_pull
+from metagpt.tools.libs.image_getter import ImageGetter
 from metagpt.tools.libs.terminal import Terminal
 from metagpt.tools.tool_registry import register_tool
 from metagpt.utils.common import CodeParser, awrite
@ -42,6 +43,7 @@ class Engineer2(RoleZero):
        "SearchEnhancedQA",
        "Engineer2",
        "CodeReview",
+        "ImageGetter",
    ]
    # SWE Agent parameter
    run_eval: bool = False
@ -84,10 +86,12 @@ class Engineer2(RoleZero):
            )
        else:
            # Default tool map
+            image_getter = ImageGetter()
            self.tool_execution_map.update(
                {
                    "git_create_pull": git_create_pull,
                    "Engineer2.write_new_code": self.write_new_code,
+                    "ImageGetter.get_image": image_getter.get_image,
                    "CodeReview.review": cr.review,
                    "CodeReview.fix": cr.fix,
                    "Terminal.run_command": self.terminal.run_command,
--- a/metagpt/strategy/experience_retriever.py
+++ b/metagpt/strategy/experience_retriever.py
@ -1026,6 +1026,21 @@ Thought: Now that the changes have been pushed to the remote repository, due to
        }
 ]
 ```
+
+## example 11
+The requirements is a product website contain some goods including cap, dress and tshit. 
+I think the website should conatin the picture of the goods,but user did not provide, so i will get the image first.
+```json
+[
+    {
+        "command_name": "ImageGetter.get_image",
+        "args": {
+            "search_term": "cap",
+            "save_file_path": "/tmp/workspace/images/cap.png",
+        }
+    }
+]
+```
 """

 WEB_SCRAPING_EXAMPLE = """
--- a/metagpt/tools/libs/image_getter.py
+++ b/metagpt/tools/libs/image_getter.py
@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import base64
+import os
+import re
+from pathlib import Path
+from typing import Optional
+
+from playwright.async_api import Browser as Browser_
+from playwright.async_api import BrowserContext, Page, Playwright, async_playwright
+from pydantic import BaseModel, ConfigDict, Field
+
+from metagpt.tools.tool_registry import register_tool
+from metagpt.utils.proxy_env import get_proxy_from_env
+from metagpt.utils.report import BrowserReporter
+
+
+@register_tool(include_functions=["get_image"])
+class ImageGetter(BaseModel):
+    """
+    A tool to get images.
+
+    Opens an Unsplash search page in a Playwright-driven Chromium browser and saves
+    the first image matching the result selector to a local file. The browser is
+    started lazily on the first `get_image` call and reused by later calls.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    # Playwright handles; each stays None until `start` creates it.
+    playwright: Optional[Playwright] = Field(default=None, exclude=True)
+    browser_instance: Optional[Browser_] = Field(default=None, exclude=True)
+    browser_ctx: Optional[BrowserContext] = Field(default=None, exclude=True)
+    page: Optional[Page] = Field(default=None, exclude=True)
+    # Launch options forwarded to `chromium.launch`.
+    headless: bool = Field(default=True)
+    proxy: Optional[dict] = Field(default_factory=get_proxy_from_env)
+    reporter: BrowserReporter = Field(default_factory=BrowserReporter)
+
+    async def start(self) -> None:
+        """Starts Playwright and launches a browser"""
+        # Every handle is checked on its own so that a start which failed half-way
+        # (e.g. the browser launched but no page was opened) is completed by the next call.
+        if self.playwright is None:
+            self.playwright = await async_playwright().start()
+        if self.browser_instance is None:
+            self.browser_instance = await self.playwright.chromium.launch(headless=self.headless, proxy=self.proxy)
+        if self.browser_ctx is None:
+            self.browser_ctx = await self.browser_instance.new_context()
+        if self.page is None:
+            self.page = await self.browser_ctx.new_page()
+
+    async def stop(self) -> None:
+        """Closes the browser and stops Playwright, if they were started"""
+        if self.browser_instance is not None:
+            await self.browser_instance.close()
+        if self.playwright is not None:
+            await self.playwright.stop()
+        # Reset the handles so that a later `start` launches a fresh browser.
+        self.page = None
+        self.browser_ctx = None
+        self.browser_instance = None
+        self.playwright = None
+
+    async def get_image(self, search_term: str, save_file_path: str) -> str:
+        """
+        Get an image related to the search term.
+
+        Args:
+            search_term (str): The term to search for the image.
+            save_file_path (str): The file path where the image will be saved.
+
+        Returns:
+            str: A message telling whether an image was found and, if so, where it was saved.
+        """
+        # Imported here so that this method brings its own extra dependencies into scope.
+        from urllib.parse import quote
+
+        from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+
+        # Selector of the result image; shared by the wait and the in-page script
+        # so that the two cannot drift apart.
+        image_selector = ".zNNw1 > div > img:nth-of-type(2)"
+        not_found_message = f"{search_term} not found. Please broaden the search term."
+
+        # Search image. The term is percent-encoded so that characters such as
+        # "/", "?" or "#" cannot change the structure of the URL.
+        encoded_term = quote(str(search_term), safe="")
+        url = f"https://unsplash.com/s/photos/{encoded_term}/"
+        if self.page is None:
+            await self.start()
+        await self.page.goto(url, wait_until="domcontentloaded")
+        # Wait for the element
+        try:
+            await self.page.wait_for_selector(image_selector)
+        except (PlaywrightTimeoutError, TimeoutError):
+            # Playwright raises its own TimeoutError, which does not derive from the
+            # builtin one; the builtin is still caught for backward compatibility.
+            return not_found_message
+
+        # Download the image inside the page and hand it back as a base64 data URL.
+        image_base64 = await self.page.evaluate(
+            """async (selector) => {
+            var img = document.querySelector(selector);
+            if (img && img.src) {
+                const response = await fetch(img.src);
+                if (response.ok) {
+                    const blob = await response.blob();
+                    return await new Promise(resolve => {
+                        const reader = new FileReader();
+                        reader.onloadend = () => resolve(reader.result);
+                        reader.readAsDataURL(blob);
+                    });
+                }
+            }
+            return null;
+        }""",
+            image_selector,
+        )
+        if not image_base64:
+            return not_found_message
+
+        # Strip the data-URL header (whatever MIME type was reported) and decode before
+        # touching the file system, so a decoding failure leaves no empty file behind.
+        imgstr = re.sub(r"^data:[^,]*;base64,", "", image_base64)
+        image_data = base64.b64decode(imgstr)
+        file_path = Path(save_file_path)
+        os.makedirs(file_path.parent, exist_ok=True)
+        file_path.write_bytes(image_data)
+        return f"{search_term} found. The image is saved in {save_file_path}."