From b262e5df7b27177bcad7c366ceb5ebe8f107ae32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=84=E4=BC=9F=E9=9F=AC?=
Date: Wed, 11 Sep 2024 15:56:16 +0800
Subject: [PATCH] automated get image

---
Review notes: get_image now catches Playwright's TimeoutError (the builtin one
never matched), percent-encodes the search term, and decodes the image before
writing it. The example-11 JSON no longer has a trailing comma. The blob id on
the image_getter.py index line predates these edits.

 metagpt/prompts/di/engineer2.py          |  2 +
 metagpt/prompts/di/team_leader.py        |  6 +-
 metagpt/roles/di/engineer2.py            |  4 ++
 metagpt/strategy/experience_retriever.py | 15 ++++
 metagpt/tools/libs/image_getter.py       | 94 ++++++++++++++++++++++++
 5 files changed, 119 insertions(+), 2 deletions(-)
 create mode 100644 metagpt/tools/libs/image_getter.py

diff --git a/metagpt/prompts/di/engineer2.py b/metagpt/prompts/di/engineer2.py
index 70e45acb5..140c1a929 100644
--- a/metagpt/prompts/di/engineer2.py
+++ b/metagpt/prompts/di/engineer2.py
@@ -76,6 +76,8 @@ Note:
 19. When the requirement is simple, you don't need to create a plan, just do it right away.
 20. If the code exists, use the Editor tool's open and edit commands to modify it. Since it is not a new code, do not use write_new_code.
 21. When using the editor, pay attention to the editor's current directory. When you use editor tools, the paths must be either absolute or relative to the editor's current directory.
+22. The default programming languages are HTML (.html), CSS (.css), and Pure JavaScript (.js).
+23. When planning, consider whether images are needed. If you are developing a showcase website, start by using ImageGetter.get_image to obtain the necessary images.
 """
 CURRENT_STATE = """
 The current editor state is:
diff --git a/metagpt/prompts/di/team_leader.py b/metagpt/prompts/di/team_leader.py
index 8d85a8cf3..d5bf6cb72 100644
--- a/metagpt/prompts/di/team_leader.py
+++ b/metagpt/prompts/di/team_leader.py
@@ -32,8 +32,10 @@ Note:
 9. Do not use the 'end' command when the current task remains unfinished; instead, use the 'finish_current_task' command to indicate completion before switching to the next task.
 10. Do not use escape characters in json data, particularly within file paths.
 11. Analyze the capabilities of team members and assign tasks to them based on user Requirements. If the requirements ask to ignore certain tasks, follow the requirements.
-12. Add default web technologies: HTML (*.html), CSS (*.css), and JavaScript (*.js) to your requirements.If no specific programming language is required, include these technologies in the project requirements. Using instruction to forward this information to your team members.
-13. If the the user message is a question. use 'reply to human' to respond to the question, and then end.
+12. If the the user message is a question. use 'reply to human' to respond to the question, and then end.
+13. Instructions and reply must be in the same language.
+14. Default technology stack is HTML (.html), CSS (.css), and Pure JavaScript (.js). Web app is the default option when developing software.
+15. You are the only one who decides the programming language for the software, so the instruction must contain the programming language.
 """
 TL_THOUGHT_GUIDANCE = (
     THOUGHT_GUIDANCE
diff --git a/metagpt/roles/di/engineer2.py b/metagpt/roles/di/engineer2.py
index 1a224623b..e235c5a43 100644
--- a/metagpt/roles/di/engineer2.py
+++ b/metagpt/roles/di/engineer2.py
@@ -18,6 +18,7 @@ from metagpt.schema import UserMessage
 from metagpt.strategy.experience_retriever import ENGINEER_EXAMPLE
 from metagpt.tools.libs.cr import CodeReview
 from metagpt.tools.libs.git import git_create_pull
+from metagpt.tools.libs.image_getter import ImageGetter
 from metagpt.tools.libs.terminal import Terminal
 from metagpt.tools.tool_registry import register_tool
 from metagpt.utils.common import CodeParser, awrite
@@ -42,6 +43,7 @@ class Engineer2(RoleZero):
         "SearchEnhancedQA",
         "Engineer2",
         "CodeReview",
+        "ImageGetter",
     ]
     # SWE Agent parameter
     run_eval: bool = False
@@ -84,10 +86,12 @@ class Engineer2(RoleZero):
             )
         else:
             # Default tool map
+            image_getter = ImageGetter()
             self.tool_execution_map.update(
                 {
                     "git_create_pull": git_create_pull,
                     "Engineer2.write_new_code": self.write_new_code,
+                    "ImageGetter.get_image": image_getter.get_image,
                     "CodeReview.review": cr.review,
                     "CodeReview.fix": cr.fix,
                     "Terminal.run_command": self.terminal.run_command,
diff --git a/metagpt/strategy/experience_retriever.py b/metagpt/strategy/experience_retriever.py
index 416e16279..d5601a6a1 100644
--- a/metagpt/strategy/experience_retriever.py
+++ b/metagpt/strategy/experience_retriever.py
@@ -1026,6 +1026,21 @@ Thought: Now that the changes have been pushed to the remote repository, due to
     }
 ]
 ```
+
+## example 11
+The requirements is a product website contain some goods including cap, dress and tshit.
+I think the website should conatin the picture of the goods,but user did not provide, so i will get the image first.
+```json
+[
+    {
+        "command_name": "ImageGetter.get_image",
+        "args": {
+            "search_term": "cap",
+            "save_file_path": "/tmp/workspace/images/cap.png"
+        }
+    }
+]
+```
 """
 
 WEB_SCRAPING_EXAMPLE = """
diff --git a/metagpt/tools/libs/image_getter.py b/metagpt/tools/libs/image_getter.py
new file mode 100644
index 000000000..bda939ea9
--- /dev/null
+++ b/metagpt/tools/libs/image_getter.py
@@ -0,0 +1,94 @@
+from __future__ import annotations
+
+import base64
+import os
+import re
+from pathlib import Path
+from typing import Optional
+from urllib.parse import quote
+
+from playwright.async_api import Browser as Browser_
+from playwright.async_api import BrowserContext, Page, Playwright, async_playwright
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+from pydantic import BaseModel, ConfigDict, Field
+
+from metagpt.tools.tool_registry import register_tool
+from metagpt.utils.proxy_env import get_proxy_from_env
+from metagpt.utils.report import BrowserReporter
+
+
+@register_tool(include_functions=["get_image"])
+class ImageGetter(BaseModel):
+    """
+    A tool to get images.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    playwright: Optional[Playwright] = Field(default=None, exclude=True)
+    browser_instance: Optional[Browser_] = Field(default=None, exclude=True)
+    browser_ctx: Optional[BrowserContext] = Field(default=None, exclude=True)
+    page: Optional[Page] = Field(default=None, exclude=True)
+    headless: bool = Field(default=True)
+    proxy: Optional[dict] = Field(default_factory=get_proxy_from_env)
+    reporter: BrowserReporter = Field(default_factory=BrowserReporter)
+
+    async def start(self) -> None:
+        """Starts Playwright and launches a browser"""
+        if self.playwright is None:
+            self.playwright = playwright = await async_playwright().start()
+            browser = self.browser_instance = await playwright.chromium.launch(headless=self.headless, proxy=self.proxy)
+            browser_ctx = self.browser_ctx = await browser.new_context()
+            self.page = await browser_ctx.new_page()
+
+    async def get_image(self, search_term: str, save_file_path: str) -> str:
+        """
+        Get an image related to the search term.
+
+        Args:
+            search_term (str): The term to search for the image.
+            save_file_path (str): The file path where the image will be saved.
+
+        Returns:
+            str: A message stating whether an image was found and, if so, where it was saved.
+        """
+        # Search image. Percent-encode the term so spaces, "/", "?" or "#" cannot change the URL structure.
+        encoded_term = quote(search_term, safe="")
+        url = f"https://unsplash.com/s/photos/{encoded_term}/"
+        if self.page is None:
+            await self.start()
+        await self.page.goto(url, wait_until="domcontentloaded")
+        # Wait for the element
+        try:
+            await self.page.wait_for_selector(".zNNw1 > div > img:nth-of-type(2)")
+        except PlaywrightTimeoutError:
+            # Playwright's TimeoutError derives from Playwright's Error, not the builtin TimeoutError.
+            return f"{search_term} not found. Please broaden the search term."
+
+        image_base64 = await self.page.evaluate(
+            """async () => {
+            var img = document.querySelector('.zNNw1 > div > img:nth-of-type(2)');
+            if (img && img.src) {
+                const response = await fetch(img.src);
+                if (response.ok) {
+                    const blob = await response.blob();
+                    return await new Promise(resolve => {
+                        const reader = new FileReader();
+                        reader.onloadend = () => resolve(reader.result);
+                        reader.readAsDataURL(blob);
+                    });
+                }
+            }
+            return null;
+        }"""
+        )
+        if not image_base64:
+            return f"{search_term} not found. Please broaden the search term."
+
+        # Decode before touching the filesystem so a malformed payload never leaves an empty file behind.
+        imgstr = re.sub("data:image/.*?;base64,", "", image_base64)
+        image_data = base64.b64decode(imgstr)
+        file_path = Path(save_file_path)
+        os.makedirs(file_path.parent, exist_ok=True)
+        file_path.write_bytes(image_data)
+        return f"{search_term} found. The image is saved in {save_file_path}."