Merge branch 'automated-get-image' into 'mgx_ops'

engineer 自动下载图片 See merge request pub/MetaGPT!374
2026-07-02 16:01:04 +02:00 · 2024-09-12 10:14:29 +00:00 · 2024-09-12 10:14:29 +00:00 · cf4a449aef
commit cf4a449aef
parent 49b5c7faf3 a939efcd7a
4 changed files with 106 additions and 1 deletions
--- a/metagpt/prompts/di/engineer2.py
+++ b/metagpt/prompts/di/engineer2.py
@ -77,6 +77,7 @@ Note:
 20. If the code exists, use the Editor tool's open and edit commands to modify it. Since it is not a new code, do not use write_new_code.
 21. When using the editor, pay attention to the editor's current directory. When you use editor tools, the paths must be either absolute or relative to the editor's current directory.
 22. The default programming languages are HTML (.html), CSS (.css), and Pure JavaScript (.js).
+23. When planning, consider whether images are needed. If you are developing a showcase website, start by using ImageGetter.get_image to obtain the necessary images.
 """
 CURRENT_STATE = """
 The current editor state is:
--- a/metagpt/roles/di/engineer2.py
+++ b/metagpt/roles/di/engineer2.py
@ -18,6 +18,7 @@ from metagpt.schema import UserMessage
 from metagpt.strategy.experience_retriever import ENGINEER_EXAMPLE
 from metagpt.tools.libs.cr import CodeReview
 from metagpt.tools.libs.git import git_create_pull
+from metagpt.tools.libs.image_getter import ImageGetter
 from metagpt.tools.libs.terminal import Terminal
 from metagpt.tools.tool_registry import register_tool
 from metagpt.utils.common import CodeParser, awrite
@ -42,6 +43,7 @@ class Engineer2(RoleZero):
        "SearchEnhancedQA",
        "Engineer2",
        "CodeReview",
+        "ImageGetter",
    ]
    # SWE Agent parameter
    run_eval: bool = False
@ -84,10 +86,12 @@ class Engineer2(RoleZero):
            )
        else:
            # Default tool map
+            image_getter = ImageGetter()
            self.tool_execution_map.update(
                {
                    "git_create_pull": git_create_pull,
                    "Engineer2.write_new_code": self.write_new_code,
+                    "ImageGetter.get_image": image_getter.get_image,
                    "CodeReview.review": cr.review,
                    "CodeReview.fix": cr.fix,
                    "Terminal.run_command": self.terminal.run_command,
--- a/metagpt/strategy/experience_retriever.py
+++ b/metagpt/strategy/experience_retriever.py
@ -845,7 +845,7 @@ Consider this example only after you have obtained the content of system design
 Suppose the system design and project schedule prescribes three files index.html, style.css, script.js, to follow the design and schedule, I will create a plan consisting of three tasks, each corresponding to the creation of one of the required files: `index.html`, `style.css`, and `script.js`. 

 Here's the plan:
-
+[Optional] 0. **Task 0**: Obtain images before coding.
 1. **Task 1**: Create `index.html` - This file will contain the HTML structure necessary for the game's UI.
 2. **Task 2**: Create `style.css` - This file will define the CSS styles to make the game visually appealing and responsive.
 3. **Task 3**: Create `script.js` - This file will contain the Pure JavaScript code for the game logic and UI interactions.
@ -1026,6 +1026,21 @@ Thought: Now that the changes have been pushed to the remote repository, due to
        }
 ]
 ```
+
+## example 12
+The requirement is to create a product website featuring goods such as caps, dresses, and T-shirts. 
+I believe pictures would improve the site, so I will get the images first.
+```json
+[
+    {
+        "command_name": "ImageGetter.get_image",
+        "args": {
+            "search_term": "cap",
+            "image_save_path": "/tmp/workspace/images/cap.png"
+        }
+    }
+]
+```
 """

 WEB_SCRAPING_EXAMPLE = """
--- a/metagpt/tools/libs/image_getter.py
+++ b/metagpt/tools/libs/image_getter.py
@ -0,0 +1,85 @@
+from __future__ import annotations
+
+from typing import Optional
+
+from playwright.async_api import Browser as Browser_
+from playwright.async_api import BrowserContext, Page, Playwright, async_playwright
+from pydantic import BaseModel, ConfigDict, Field
+
+from metagpt.tools.tool_registry import register_tool
+from metagpt.utils.common import decode_image
+from metagpt.utils.proxy_env import get_proxy_from_env
+from metagpt.utils.report import BrowserReporter
+
+# JavaScript template evaluated inside the browser page to download one image.
+# It is a str.format() template: `{img_element_selector}` is substituted with a
+# CSS selector before evaluation, and every literal JS brace is doubled
+# (`{{` / `}}`) so that str.format() leaves it intact.
+# The script looks up the <img> matched by the selector, fetches its `src`,
+# and resolves to the image as a base64 data URL (FileReader.readAsDataURL).
+# It resolves to null when the element is missing, has no `src`, or the fetch
+# response is not OK.
+DOWNLOAD_PICTURE_JAVASCRIPT = """
+async () => {{
+    var img = document.querySelector('{img_element_selector}');
+    if (img && img.src) {{
+        const response = await fetch(img.src);
+        if (response.ok) {{
+            const blob = await response.blob();
+            return await new Promise(resolve => {{
+                const reader = new FileReader();
+                reader.onloadend = () => resolve(reader.result);
+                reader.readAsDataURL(blob);
+            }});
+        }}
+    }}
+    return null;
+}}
+"""
+
+
+@register_tool(include_functions=["get_image"])
+class ImageGetter(BaseModel):
+    """
+    A tool to get images.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    # Playwright handles. They are created lazily by start() and excluded from
+    # model serialization because they are live runtime objects.
+    playwright: Optional[Playwright] = Field(default=None, exclude=True)
+    browser_instance: Optional[Browser_] = Field(default=None, exclude=True)
+    browser_ctx: Optional[BrowserContext] = Field(default=None, exclude=True)
+    page: Optional[Page] = Field(default=None, exclude=True)
+    # Browser launch options passed to chromium.launch().
+    headless: bool = Field(default=True)
+    proxy: Optional[dict] = Field(default_factory=get_proxy_from_env)
+    reporter: BrowserReporter = Field(default_factory=BrowserReporter)
+    # Search URL template; `{search_term}` is filled in by get_image().
+    url: str = "https://unsplash.com/s/photos/{search_term}/"
+    # CSS selector of the <img> element that get_image() downloads.
+    img_element_selector: str = ".zNNw1 > div > img:nth-of-type(2)"
+
+    async def start(self) -> None:
+        """Starts Playwright and launches a browser.
+
+        Each handle is created only if it is still missing, so a call that
+        failed part-way (e.g. the browser launch raised after Playwright had
+        started) can be retried instead of leaving `page` permanently unset.
+        """
+        if self.page is not None:
+            return
+        if self.playwright is None:
+            self.playwright = await async_playwright().start()
+        if self.browser_instance is None:
+            self.browser_instance = await self.playwright.chromium.launch(
+                headless=self.headless, proxy=self.proxy
+            )
+        if self.browser_ctx is None:
+            self.browser_ctx = await self.browser_instance.new_context()
+        self.page = await self.browser_ctx.new_page()
+
+    async def get_image(self, search_term: str, image_save_path: str) -> str:
+        """
+        Get an image related to the search term.
+
+        Args:
+            search_term (str): The term to search for the image. The search term must be in English. Using any other language may lead to a mismatch.
+            image_save_path (str): The file path where the image will be saved.
+
+        Returns:
+            str: A message stating whether an image was found and, if so, where it was saved.
+        """
+        # Imported locally so that this method brings its own dependencies into scope.
+        from pathlib import Path
+        from urllib.parse import quote
+
+        from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+
+        not_found_message = f"{search_term} not found. Please broaden the search term."
+
+        # Search for images from https://unsplash.com/s/photos/
+        if self.page is None:
+            await self.start()
+        # Percent-encode the term so that spaces, '/', '?' or '#' cannot change the URL structure.
+        search_url = self.url.format(search_term=quote(search_term, safe=""))
+        await self.page.goto(search_url, wait_until="domcontentloaded")
+        # Wait until the image element is loaded
+        try:
+            await self.page.wait_for_selector(self.img_element_selector)
+        except (PlaywrightTimeoutError, TimeoutError):
+            # Playwright raises its own TimeoutError, which is not a subclass of the
+            # builtin one; the builtin is still caught for backward compatibility.
+            return not_found_message
+        # Get the base64 code of the first retrieved image
+        image_base64 = await self.page.evaluate(
+            DOWNLOAD_PICTURE_JAVASCRIPT.format(img_element_selector=self.img_element_selector)
+        )
+        if not image_base64:
+            return not_found_message
+        image = decode_image(image_base64)
+        # Make sure the destination directory exists before writing the file.
+        Path(image_save_path).parent.mkdir(parents=True, exist_ok=True)
+        image.save(image_save_path)
+        return f"{search_term} found. The image is saved in {image_save_path}."