From b262e5df7b27177bcad7c366ceb5ebe8f107ae32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=BC=9F=E9=9F=AC?= Date: Wed, 11 Sep 2024 15:56:16 +0800 Subject: [PATCH 1/5] automated get image --- metagpt/prompts/di/engineer2.py | 2 + metagpt/prompts/di/team_leader.py | 6 +- metagpt/roles/di/engineer2.py | 4 ++ metagpt/strategy/experience_retriever.py | 15 ++++ metagpt/tools/libs/image_getter.py | 87 ++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 metagpt/tools/libs/image_getter.py diff --git a/metagpt/prompts/di/engineer2.py b/metagpt/prompts/di/engineer2.py index 70e45acb5..140c1a929 100644 --- a/metagpt/prompts/di/engineer2.py +++ b/metagpt/prompts/di/engineer2.py @@ -76,6 +76,8 @@ Note: 19. When the requirement is simple, you don't need to create a plan, just do it right away. 20. If the code exists, use the Editor tool's open and edit commands to modify it. Since it is not a new code, do not use write_new_code. 21. When using the editor, pay attention to the editor's current directory. When you use editor tools, the paths must be either absolute or relative to the editor's current directory. +22. The default programming languages are HTML (.html), CSS (.css), and Pure JavaScript (.js). +23. When planning, consider whether images are needed. If you are developing a showcase website, start by using ImageGetter.get_image to obtain the necessary images. """ CURRENT_STATE = """ The current editor state is: diff --git a/metagpt/prompts/di/team_leader.py b/metagpt/prompts/di/team_leader.py index 8d85a8cf3..d5bf6cb72 100644 --- a/metagpt/prompts/di/team_leader.py +++ b/metagpt/prompts/di/team_leader.py @@ -32,8 +32,10 @@ Note: 9. Do not use the 'end' command when the current task remains unfinished; instead, use the 'finish_current_task' command to indicate completion before switching to the next task. 10. Do not use escape characters in json data, particularly within file paths. 11. Analyze the capabilities of team members and assign tasks to them based on user Requirements. If the requirements ask to ignore certain tasks, follow the requirements. -12. Add default web technologies: HTML (*.html), CSS (*.css), and JavaScript (*.js) to your requirements.If no specific programming language is required, include these technologies in the project requirements. Using instruction to forward this information to your team members. -13. If the the user message is a question. use 'reply to human' to respond to the question, and then end. +12. If the the user message is a question. use 'reply to human' to respond to the question, and then end. +13. Instructions and reply must be in the same language. +14. Default technology stack is HTML (.html), CSS (.css), and Pure JavaScript (.js). Web app is the default option when developing software. +15. You are the only one who decides the programming language for the software, so the instruction must contain the programming language. """ TL_THOUGHT_GUIDANCE = ( THOUGHT_GUIDANCE diff --git a/metagpt/roles/di/engineer2.py b/metagpt/roles/di/engineer2.py index 1a224623b..e235c5a43 100644 --- a/metagpt/roles/di/engineer2.py +++ b/metagpt/roles/di/engineer2.py @@ -18,6 +18,7 @@ from metagpt.schema import UserMessage from metagpt.strategy.experience_retriever import ENGINEER_EXAMPLE from metagpt.tools.libs.cr import CodeReview from metagpt.tools.libs.git import git_create_pull +from metagpt.tools.libs.image_getter import ImageGetter from metagpt.tools.libs.terminal import Terminal from metagpt.tools.tool_registry import register_tool from metagpt.utils.common import CodeParser, awrite @@ -42,6 +43,7 @@ class Engineer2(RoleZero): "SearchEnhancedQA", "Engineer2", "CodeReview", + "ImageGetter", ] # SWE Agent parameter run_eval: bool = False @@ -84,10 +86,12 @@ class Engineer2(RoleZero): ) else: # Default tool map + image_getter = ImageGetter() self.tool_execution_map.update( { "git_create_pull": git_create_pull, "Engineer2.write_new_code": self.write_new_code, + "ImageGetter.get_image": image_getter.get_image, "CodeReview.review": cr.review, "CodeReview.fix": cr.fix, "Terminal.run_command": self.terminal.run_command, diff --git a/metagpt/strategy/experience_retriever.py b/metagpt/strategy/experience_retriever.py index 416e16279..d5601a6a1 100644 --- a/metagpt/strategy/experience_retriever.py +++ b/metagpt/strategy/experience_retriever.py @@ -1026,6 +1026,21 @@ Thought: Now that the changes have been pushed to the remote repository, due to } ] ``` + +## example 11 +The requirements is a product website contain some goods including cap, dress and tshit. +I think the website should conatin the picture of the goods,but user did not provide, so i will get the image first. +```json +[ + { + "command_name": "ImageGetter.get_image", + "args": { + "search_term": "cap", + "save_file_path": "/tmp/workspace/images/cap.png", + } + } +] +``` """ WEB_SCRAPING_EXAMPLE = """ diff --git a/metagpt/tools/libs/image_getter.py b/metagpt/tools/libs/image_getter.py new file mode 100644 index 000000000..bda939ea9 --- /dev/null +++ b/metagpt/tools/libs/image_getter.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import base64 +import os +import re +from pathlib import Path +from typing import Optional + +from playwright.async_api import Browser as Browser_ +from playwright.async_api import BrowserContext, Page, Playwright, async_playwright +from pydantic import BaseModel, ConfigDict, Field + +from metagpt.tools.tool_registry import register_tool +from metagpt.utils.proxy_env import get_proxy_from_env +from metagpt.utils.report import BrowserReporter + + +@register_tool(include_functions=["get_image"]) +class ImageGetter(BaseModel): + """ + A tool to get images. + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + playwright: Optional[Playwright] = Field(default=None, exclude=True) + browser_instance: Optional[Browser_] = Field(default=None, exclude=True) + browser_ctx: Optional[BrowserContext] = Field(default=None, exclude=True) + page: Optional[Page] = Field(default=None, exclude=True) + headless: bool = Field(default=True) + proxy: Optional[dict] = Field(default_factory=get_proxy_from_env) + reporter: BrowserReporter = Field(default_factory=BrowserReporter) + + async def start(self) -> None: + """Starts Playwright and launches a browser""" + if self.playwright is None: + self.playwright = playwright = await async_playwright().start() + browser = self.browser_instance = await playwright.chromium.launch(headless=self.headless, proxy=self.proxy) + browser_ctx = self.browser_ctx = await browser.new_context() + self.page = await browser_ctx.new_page() + + async def get_image(self, search_term, save_file_path): + """ + Get an image related to the search term. + + Args: + search_term (str): The term to search for the image. + save_file_path (str): The file path where the image will + """ + # Seach image + url = f"https://unsplash.com/s/photos/{search_term}/" + if self.page is None: + await self.start() + await self.page.goto(url, wait_until="domcontentloaded") + # Wait for the element + try: + await self.page.wait_for_selector(".zNNw1 > div > img:nth-of-type(2)") + except TimeoutError: + return f"{search_term} not found. Please broaden the search term." + + image_base64 = await self.page.evaluate( + """async () => { + var img = document.querySelector('.zNNw1 > div > img:nth-of-type(2)'); + if (img && img.src) { + const response = await fetch(img.src); + if (response.ok) { + const blob = await response.blob(); + return await new Promise(resolve => { + const reader = new FileReader(); + reader.onloadend = () => resolve(reader.result); + reader.readAsDataURL(blob); + }); + } + } + return null; + }""" + ) + if image_base64: + file_path = Path(save_file_path) + os.makedirs(file_path.parent, exist_ok=True) + with open(save_file_path, "wb") as f: + imgstr = re.sub("data:image/.*?;base64,", "", image_base64) + image_data = base64.b64decode(imgstr) + f.write(image_data) + return f"{search_term} found. The image is saved in {save_file_path}." + else: + return f"{search_term} not found. Please broaden the search term." From 2857a990ff61ff2bc9410207a76da5bfd2908791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=BC=9F=E9=9F=AC?= Date: Wed, 11 Sep 2024 16:15:51 +0800 Subject: [PATCH 2/5] fix prompt issues. --- metagpt/prompts/di/team_leader.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/metagpt/prompts/di/team_leader.py b/metagpt/prompts/di/team_leader.py index d5bf6cb72..8d85a8cf3 100644 --- a/metagpt/prompts/di/team_leader.py +++ b/metagpt/prompts/di/team_leader.py @@ -32,10 +32,8 @@ Note: 9. Do not use the 'end' command when the current task remains unfinished; instead, use the 'finish_current_task' command to indicate completion before switching to the next task. 10. Do not use escape characters in json data, particularly within file paths. 11. Analyze the capabilities of team members and assign tasks to them based on user Requirements. If the requirements ask to ignore certain tasks, follow the requirements. -12. If the the user message is a question. use 'reply to human' to respond to the question, and then end. -13. Instructions and reply must be in the same language. -14. Default technology stack is HTML (.html), CSS (.css), and Pure JavaScript (.js). Web app is the default option when developing software. -15. You are the only one who decides the programming language for the software, so the instruction must contain the programming language. +12. Add default web technologies: HTML (*.html), CSS (*.css), and JavaScript (*.js) to your requirements.If no specific programming language is required, include these technologies in the project requirements. Using instruction to forward this information to your team members. +13. If the the user message is a question. use 'reply to human' to respond to the question, and then end. """ TL_THOUGHT_GUIDANCE = ( THOUGHT_GUIDANCE From d9dacb630541de34e7bcfeeac53938bdc8021048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=BC=9F=E9=9F=AC?= Date: Wed, 11 Sep 2024 17:24:00 +0800 Subject: [PATCH 3/5] update prompt --- metagpt/strategy/experience_retriever.py | 6 +++--- metagpt/tools/libs/image_getter.py | 17 +++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/metagpt/strategy/experience_retriever.py b/metagpt/strategy/experience_retriever.py index d5601a6a1..5cb0573b2 100644 --- a/metagpt/strategy/experience_retriever.py +++ b/metagpt/strategy/experience_retriever.py @@ -1027,9 +1027,9 @@ Thought: Now that the changes have been pushed to the remote repository, due to ] ``` -## example 11 -The requirements is a product website contain some goods including cap, dress and tshit. -I think the website should conatin the picture of the goods,but user did not provide, so i will get the image first. +## example 12 +The requirement is to create a product website featuring goods such as caps, dresses, and T-shirts. +I believe pictures would improve the site, so I will get the images first. ```json [ { diff --git a/metagpt/tools/libs/image_getter.py b/metagpt/tools/libs/image_getter.py index bda939ea9..1420f9d7f 100644 --- a/metagpt/tools/libs/image_getter.py +++ b/metagpt/tools/libs/image_getter.py @@ -39,25 +39,25 @@ class ImageGetter(BaseModel): browser_ctx = self.browser_ctx = await browser.new_context() self.page = await browser_ctx.new_page() - async def get_image(self, search_term, save_file_path): + async def get_image(self, search_term, image_save_path): """ Get an image related to the search term. Args: search_term (str): The term to search for the image. - save_file_path (str): The file path where the image will + image_save_path (str): The file path where the image will be saved. """ - # Seach image + # Search for images from https://unsplash.com/s/photos/ url = f"https://unsplash.com/s/photos/{search_term}/" if self.page is None: await self.start() await self.page.goto(url, wait_until="domcontentloaded") - # Wait for the element + # Wait until the image element is loaded try: await self.page.wait_for_selector(".zNNw1 > div > img:nth-of-type(2)") except TimeoutError: return f"{search_term} not found. Please broaden the search term." - + # Get the base64 code of the first retrieved image image_base64 = await self.page.evaluate( """async () => { var img = document.querySelector('.zNNw1 > div > img:nth-of-type(2)'); @@ -76,12 +76,13 @@ class ImageGetter(BaseModel): }""" ) if image_base64: - file_path = Path(save_file_path) + # Save image + file_path = Path(image_save_path) os.makedirs(file_path.parent, exist_ok=True) - with open(save_file_path, "wb") as f: + with open(image_save_path, "wb") as f: imgstr = re.sub("data:image/.*?;base64,", "", image_base64) image_data = base64.b64decode(imgstr) f.write(image_data) - return f"{search_term} found. The image is saved in {save_file_path}." + return f"{search_term} found. The image is saved in {image_save_path}." else: return f"{search_term} not found. Please broaden the search term." From 65dc47c90d105b5ef93b59ff9db94ca92492211e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=BC=9F=E9=9F=AC?= Date: Thu, 12 Sep 2024 13:26:35 +0800 Subject: [PATCH 4/5] fix code issues --- metagpt/strategy/experience_retriever.py | 2 +- metagpt/tools/libs/image_getter.py | 61 +++++++++++------------- 2 files changed, 30 insertions(+), 33 deletions(-) diff --git a/metagpt/strategy/experience_retriever.py b/metagpt/strategy/experience_retriever.py index 5cb0573b2..e974e41a0 100644 --- a/metagpt/strategy/experience_retriever.py +++ b/metagpt/strategy/experience_retriever.py @@ -845,7 +845,7 @@ Consider this example only after you have obtained the content of system design Suppose the system design and project schedule prescribes three files index.html, style.css, script.js, to follow the design and schedule, I will create a plan consisting of three tasks, each corresponding to the creation of one of the required files: `index.html`, `style.css`, and `script.js`. Here's the plan: - +[Optional] 0. **Task 0**: Obtain images before coding. 1. **Task 1**: Create `index.html` - This file will contain the HTML structure necessary for the game's UI. 2. **Task 2**: Create `style.css` - This file will define the CSS styles to make the game visually appealing and responsive. 3. **Task 3**: Create `script.js` - This file will contain the JavaScript code for the game logic and UI interactions. diff --git a/metagpt/tools/libs/image_getter.py b/metagpt/tools/libs/image_getter.py index 1420f9d7f..a8f982fa5 100644 --- a/metagpt/tools/libs/image_getter.py +++ b/metagpt/tools/libs/image_getter.py @@ -1,9 +1,5 @@ from __future__ import annotations -import base64 -import os -import re -from pathlib import Path from typing import Optional from playwright.async_api import Browser as Browser_ @@ -11,9 +7,28 @@ from playwright.async_api import BrowserContext, Page, Playwright, async_playwri from pydantic import BaseModel, ConfigDict, Field from metagpt.tools.tool_registry import register_tool +from metagpt.utils.common import decode_image from metagpt.utils.proxy_env import get_proxy_from_env from metagpt.utils.report import BrowserReporter +DOWNLOAD_PICTURE_JAVASCRIPT = """ +async () => {{ + var img = document.querySelector('{img_element_selector}'); + if (img && img.src) {{ + const response = await fetch(img.src); + if (response.ok) {{ + const blob = await response.blob(); + return await new Promise(resolve => {{ + const reader = new FileReader(); + reader.onloadend = () => resolve(reader.result); + reader.readAsDataURL(blob); + }}); + }} + }} + return null; +}} +""" + @register_tool(include_functions=["get_image"]) class ImageGetter(BaseModel): @@ -30,6 +45,8 @@ class ImageGetter(BaseModel): headless: bool = Field(default=True) proxy: Optional[dict] = Field(default_factory=get_proxy_from_env) reporter: BrowserReporter = Field(default_factory=BrowserReporter) + url: str = "https://unsplash.com/s/photos/{search_term}/" + img_element_selector: str = ".zNNw1 > div > img:nth-of-type(2)" async def start(self) -> None: """Starts Playwright and launches a browser""" @@ -44,45 +61,25 @@ class ImageGetter(BaseModel): Get an image related to the search term. Args: - search_term (str): The term to search for the image. + search_term (str): The term to search for the image. The search term must be in English; using any other language may lead to a mismatch. image_save_path (str): The file path where the image will be saved. """ # Search for images from https://unsplash.com/s/photos/ - url = f"https://unsplash.com/s/photos/{search_term}/" + if self.page is None: await self.start() - await self.page.goto(url, wait_until="domcontentloaded") + await self.page.goto(self.url.format(search_term=search_term), wait_until="domcontentloaded") # Wait until the image element is loaded try: - await self.page.wait_for_selector(".zNNw1 > div > img:nth-of-type(2)") + await self.page.wait_for_selector(self.img_element_selector) except TimeoutError: return f"{search_term} not found. Please broaden the search term." # Get the base64 code of the first retrieved image image_base64 = await self.page.evaluate( - """async () => { - var img = document.querySelector('.zNNw1 > div > img:nth-of-type(2)'); - if (img && img.src) { - const response = await fetch(img.src); - if (response.ok) { - const blob = await response.blob(); - return await new Promise(resolve => { - const reader = new FileReader(); - reader.onloadend = () => resolve(reader.result); - reader.readAsDataURL(blob); - }); - } - } - return null; - }""" + DOWNLOAD_PICTURE_JAVASCRIPT.format(img_element_selector=self.img_element_selector) ) if image_base64: - # Save image - file_path = Path(image_save_path) - os.makedirs(file_path.parent, exist_ok=True) - with open(image_save_path, "wb") as f: - imgstr = re.sub("data:image/.*?;base64,", "", image_base64) - image_data = base64.b64decode(imgstr) - f.write(image_data) + image = decode_image(image_base64) + image.save(image_save_path) return f"{search_term} found. The image is saved in {image_save_path}." - else: - return f"{search_term} not found. Please broaden the search term." + return f"{search_term} not found. Please broaden the search term." From d7abceb67f0df74ff5ba723cb5ffd8b0e7f3522d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=BC=9F=E9=9F=AC?= Date: Thu, 12 Sep 2024 13:46:01 +0800 Subject: [PATCH 5/5] update comment --- metagpt/tools/libs/image_getter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagpt/tools/libs/image_getter.py b/metagpt/tools/libs/image_getter.py index a8f982fa5..ecbaaf510 100644 --- a/metagpt/tools/libs/image_getter.py +++ b/metagpt/tools/libs/image_getter.py @@ -61,7 +61,7 @@ class ImageGetter(BaseModel): Get an image related to the search term. Args: - search_term (str): The term to search for the image. The search term must be in English; using any other language may lead to a mismatch. + search_term (str): The term to search for the image. The search term must be in English. Using any other language may lead to a mismatch. image_save_path (str): The file path where the image will be saved. """ # Search for images from https://unsplash.com/s/photos/