diff --git a/examples/andriod_assistant/actions/manual_record.py b/examples/andriod_assistant/actions/manual_record.py index 5deafa680..b56ed569d 100644 --- a/examples/andriod_assistant/actions/manual_record.py +++ b/examples/andriod_assistant/actions/manual_record.py @@ -6,6 +6,7 @@ from pathlib import Path import cv2 +from examples.andriod_assistant.utils.const import ADB_EXEC_FAIL from examples.andriod_assistant.utils.schema import ( ActionOp, AndroidActionOutput, @@ -15,9 +16,13 @@ from examples.andriod_assistant.utils.schema import ( from examples.andriod_assistant.utils.utils import draw_bbox_multi, traverse_xml_tree from metagpt.actions.action import Action from metagpt.config2 import config -from metagpt.const import ADB_EXEC_FAIL from metagpt.environment.android_env.android_env import AndroidEnv -from metagpt.environment.api.env_api import EnvAPIAbstract +from metagpt.environment.android_env.env_space import ( + EnvAction, + EnvActionType, + EnvObsParams, + EnvObsType, +) from metagpt.logs import logger @@ -53,19 +58,13 @@ class ManualRecord(Action): step = 0 while True: step += 1 - screenshot_path: Path = await env.observe( - EnvAPIAbstract( - api_name="get_screenshot", - # kwargs={"ss_name": f"{demo_name}_{step}", "local_save_dir": self.screenshot_before_path} - kwargs={"ss_name": f"{step}", "local_save_dir": self.screenshot_before_path}, + screenshot_path: Path = env.observe( + EnvObsParams( + obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{step}", local_save_dir=self.screenshot_before_path ) ) - xml_path: Path = await env.observe( - EnvAPIAbstract( - api_name="get_xml", - # kwargs={"xml_name": f"{demo_name}_{step}", "local_save_dir": self.xml_path} - kwargs={"xml_name": f"{step}", "local_save_dir": self.xml_path}, - ) + xml_path: Path = env.observe( + EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{step}", local_save_dir=self.xml_path) ) if not screenshot_path.exists() or not xml_path.exists(): return AndroidActionOutput(action_state=RunState.FAIL) @@ -103,8 +102,8 @@ class ManualRecord(Action): user_input = "xxx" logger.info( - "Choose one of the following actions you want to perform on the current screen:\ntap, text, long_press," - "swipe, stop", + "Choose one of the following actions you want to perform on the current screen:\n" + "tap, text, long_press, swipe, stop", "blue", ) @@ -126,10 +125,8 @@ class ManualRecord(Action): user_input = input() tl, br = elem_list[int(user_input) - 1].bbox x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2 - ret = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y})) - if ret == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) - record_file.write(f"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n") + action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y)) + log_str = f"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n" elif user_input.lower() == ActionOp.TEXT.value: logger.info( f"Which element do you want to input the text string? Choose a numeric tag from 1 to " @@ -143,8 +140,8 @@ class ManualRecord(Action): user_input = "" while not user_input: user_input = input() - await env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": user_input})) - record_file.write(f'text({input_area}:sep:"{user_input}"):::{elem_list[int(input_area) - 1].uid}\n') + action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=user_input) + log_str = f"text({input_area}:sep:'{user_input}'):::{elem_list[int(input_area) - 1].uid}\n" elif user_input.lower() == ActionOp.LONG_PRESS.value: logger.info( f"Which element do you want to long press? Choose a numeric tag from 1 to {len(elem_list)}:", "blue" @@ -154,14 +151,12 @@ class ManualRecord(Action): user_input = input() tl, br = elem_list[int(user_input) - 1].bbox x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2 - ret = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y})) - if ret == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) - record_file.write(f"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n") + action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y)) + log_str = f"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n" elif user_input.lower() == ActionOp.SWIPE.value: logger.info( - "What is the direction of your swipe? Choose one from the following options:\nup, down, left," - " right", + "What is the direction of your swipe? Choose one from the following options:\n" + "up, down, left, right", "blue", ) user_input = "" @@ -178,16 +173,20 @@ class ManualRecord(Action): user_input = input() tl, br = elem_list[int(user_input) - 1].bbox x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2 - ret = await env.step( - EnvAPIAbstract(api_name="user_swipe", kwargs={"x": x, "y": y, "orient": swipe_dir}) - ) - if ret == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) - record_file.write(f"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\n") + + action = EnvAction(action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=swipe_dir) + log_str = f"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\n" elif user_input.lower() == ActionOp.STOP.value: record_file.write("stop\n") record_file.close() break else: break + + obs, _, _, _, info = env.step(action) + action_res = info["res"] + if action_res == ADB_EXEC_FAIL: + return AndroidActionOutput(action_state=RunState.FAIL) + record_file.write(log_str) + time.sleep(3) diff --git a/examples/andriod_assistant/actions/screenshot_parse.py b/examples/andriod_assistant/actions/screenshot_parse.py index f3dd7da6c..62360f0a7 100644 --- a/examples/andriod_assistant/actions/screenshot_parse.py +++ b/examples/andriod_assistant/actions/screenshot_parse.py @@ -10,6 +10,7 @@ from examples.andriod_assistant.prompts.assistant_prompt import ( screenshot_parse_template, screenshot_parse_with_grid_template, ) +from examples.andriod_assistant.utils.const import ADB_EXEC_FAIL from examples.andriod_assistant.utils.schema import ( AndroidActionOutput, AndroidElement, @@ -34,9 +35,13 @@ from examples.andriod_assistant.utils.utils import ( ) from metagpt.actions.action import Action from metagpt.config2 import config -from metagpt.const import ADB_EXEC_FAIL from metagpt.environment.android_env.android_env import AndroidEnv -from metagpt.environment.api.env_api import EnvAPIAbstract +from metagpt.environment.android_env.env_space import ( + EnvAction, + EnvActionType, + EnvObsParams, + EnvObsType, +) from metagpt.utils.common import encode_image @@ -91,14 +96,11 @@ class ScreenshotParse(Action): for path in [task_dir, docs_dir]: if not path.exists(): path.mkdir(parents=True, exist_ok=True) - - screenshot_path: Path = await env.observe( - EnvAPIAbstract( - api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir} - ) + screenshot_path: Path = env.observe( + EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir) ) - xml_path: Path = await env.observe( - EnvAPIAbstract(api_name="get_xml", kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir}) + xml_path: Path = env.observe( + EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir) ) width, height = env.device_shape if not screenshot_path.exists() or not xml_path.exists(): @@ -150,41 +152,26 @@ class ScreenshotParse(Action): if isinstance(op_param, TapOp): x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y})) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y)) elif isinstance(op_param, TextOp): - res = await env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str})) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str) elif isinstance(op_param, LongPressOp): x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y})) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y)) elif isinstance(op_param, SwipeOp_3): x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = await env.step( - EnvAPIAbstract( - api_name="user_swipe", - kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist}, - ) + action = EnvAction( + action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist ) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) elif isinstance(op_param, GridOp): grid_on = True elif isinstance(op_param, TapGridOp) or isinstance(op_param, LongPressGridOp): x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols) if isinstance(op_param, TapGridOp): - res = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y})) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y)) else: # LongPressGridOp - res = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y})) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y)) elif isinstance(op_param, SwipeGridOp): start_x, start_y = area_to_xy( op_param.start_area, op_param.start_subarea, env.width, env.height, env.rows, env.cols @@ -192,11 +179,14 @@ class ScreenshotParse(Action): end_x, end_y = area_to_xy( op_param.end_area, op_param.end_subarea, env.width, env.height, env.rows, env.cols ) - res = await env.step( - EnvAPIAbstract(api_name="user_swipe_to", kwargs={"start": (start_x, start_y), "end": (end_x, end_y)}) + action = EnvAction( + action_type=EnvActionType.USER_SWIPE_TO, coord=(start_x, start_y), tgt_coord=(end_x, end_y) ) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + + obs, _, _, _, info = env.step(action) + action_res = info["res"] + if action_res == ADB_EXEC_FAIL: + return AndroidActionOutput(action_state=RunState.FAIL) if op_param.act_name != "grid": grid_on = True diff --git a/examples/andriod_assistant/actions/self_learn_and_reflect.py b/examples/andriod_assistant/actions/self_learn_and_reflect.py index 780985947..ae7b7f038 100644 --- a/examples/andriod_assistant/actions/self_learn_and_reflect.py +++ b/examples/andriod_assistant/actions/self_learn_and_reflect.py @@ -15,6 +15,7 @@ from examples.andriod_assistant.prompts.assistant_prompt import ( from examples.andriod_assistant.prompts.assistant_prompt import ( screenshot_parse_self_explore_template, ) +from examples.andriod_assistant.utils.const import ADB_EXEC_FAIL from examples.andriod_assistant.utils.schema import ( ActionOp, AndroidActionOutput, @@ -39,9 +40,13 @@ from examples.andriod_assistant.utils.utils import ( ) from metagpt.actions.action import Action from metagpt.config2 import config -from metagpt.const import ADB_EXEC_FAIL from metagpt.environment.android_env.android_env import AndroidEnv -from metagpt.environment.api.env_api import EnvAPIAbstract +from metagpt.environment.android_env.env_space import ( + EnvAction, + EnvActionType, + EnvObsParams, + EnvObsType, +) from metagpt.logs import logger from metagpt.utils.common import encode_image @@ -71,13 +76,11 @@ class SelfLearnAndReflect(Action): async def run_self_learn( self, round_count: int, task_desc: str, last_act: str, task_dir: Path, env: AndroidEnv ) -> AndroidActionOutput: - screenshot_path: Path = await env.observe( - EnvAPIAbstract( - api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir} - ) + screenshot_path: Path = env.observe( + EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir) ) - xml_path: Path = await env.observe( - EnvAPIAbstract(api_name="get_xml", kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir}) + xml_path: Path = env.observe( + EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir) ) if not screenshot_path.exists() or not xml_path.exists(): return AndroidActionOutput(action_state=RunState.FAIL) @@ -116,7 +119,7 @@ class SelfLearnAndReflect(Action): context = self_explore_template.format(task_description=task_desc, last_act=last_act) node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64]) - print(f"fill result:{node}") + logger.debug(f"fill result:{node}") if "error" in node.content: return AndroidActionOutput(action_state=RunState.FAIL) prompt = node.compile(context=context, schema="json", mode="auto") @@ -132,31 +135,25 @@ class SelfLearnAndReflect(Action): if isinstance(op_param, TapOp): self.ui_area = op_param.area x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y})) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y)) elif isinstance(op_param, TextOp): - res = await env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str})) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str) elif isinstance(op_param, LongPressOp): self.ui_area = op_param.area x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y})) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y)) elif isinstance(op_param, SwipeOp_3): self.ui_area = op_param.area self.swipe_orient = op_param.swipe_orient x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = await env.step( - EnvAPIAbstract( - api_name="user_swipe", - kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist}, - ) + action = EnvAction( + action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist ) - if res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + + obs, _, _, _, info = env.step(action) + action_res = info["res"] + if action_res == ADB_EXEC_FAIL: + return AndroidActionOutput(action_state=RunState.FAIL) self.elem_list = elem_list self.act_name = op_param.act_name @@ -165,10 +162,8 @@ class SelfLearnAndReflect(Action): async def run_reflect( self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv ) -> AndroidActionOutput: - screenshot_path: Path = await env.observe( - EnvAPIAbstract( - api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_after", "local_save_dir": task_dir} - ) + screenshot_path: Path = env.observe( + EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_after", local_save_dir=task_dir) ) if not screenshot_path.exists(): return AndroidActionOutput(action_state=RunState.FAIL) @@ -226,8 +221,9 @@ class SelfLearnAndReflect(Action): self.useless_list.append(resource_id) last_act = "NONE" if op_param.decision == Decision.BACK.value: - res = await env.step(EnvAPIAbstract(api_name="system_back")) - if res == ADB_EXEC_FAIL: + action = EnvAction(action_type=EnvActionType.SYSTEM_BACK) + obs, _, _, _, info = env.step(action) + if info["res"] == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) doc = op_param.documentation doc_path = docs_dir.joinpath(f"{resource_id}.txt") diff --git a/examples/andriod_assistant/requirements.txt b/examples/andriod_assistant/requirements.txt index ec86cd815..155863613 100644 --- a/examples/andriod_assistant/requirements.txt +++ b/examples/andriod_assistant/requirements.txt @@ -1 +1,2 @@ pyshine==0.0.9 +opencv-python==4.6.0.66 \ No newline at end of file diff --git a/examples/andriod_assistant/roles/android_assistant.py b/examples/andriod_assistant/roles/android_assistant.py index cf97b5fcd..eccea298e 100644 --- a/examples/andriod_assistant/roles/android_assistant.py +++ b/examples/andriod_assistant/roles/android_assistant.py @@ -77,7 +77,7 @@ class AndroidAssistant(Role): async def react(self) -> Message: self.round_count += 1 result = await super().react() - print(f"react result {result}") + logger.debug(f"react result {result}") return result async def _act(self) -> Message: diff --git a/examples/andriod_assistant/tests/test.py b/examples/andriod_assistant/tests/test.py index c223665c4..ee60d654b 100644 --- a/examples/andriod_assistant/tests/test.py +++ b/examples/andriod_assistant/tests/test.py @@ -2,22 +2,20 @@ # -*- coding: utf-8 -*- # @Desc : test case (imgs from appagent's) - import ast import asyncio import re from pathlib import Path -from actions.parse_record_an import RECORD_PARSE_NODE -from prompts.operation_prompt import ( +from examples.andriod_assistant.actions.parse_record_an import RECORD_PARSE_NODE +from examples.andriod_assistant.prompts.operation_prompt import ( long_press_doc_template, refine_doc_suffix, swipe_doc_template, tap_doc_template, text_doc_template, ) -from utils.schema import ActionOp, SwipeOp - +from examples.andriod_assistant.utils.schema import ActionOp, SwipeOp from metagpt.actions.action import Action from metagpt.config2 import config from metagpt.logs import logger @@ -62,7 +60,7 @@ async def manual_test(): prompt_template = swipe_doc_template context = prompt_template.format(swipe_dir=swipe_dir, ui_element=swipe_area) else: - print("Error occurs") + logger.error("Error occurs") task_desc_path = TASK_DESC_PATH task_desc = open(task_desc_path, "r").read() @@ -108,4 +106,3 @@ if __name__ == "__main__": loop = asyncio.get_event_loop() loop.run_until_complete(manual_test()) loop.close() - print("OK") diff --git a/examples/andriod_assistant/tests/test_for_an.py b/examples/andriod_assistant/tests/test_for_an.py index 7dddaabf5..a5aa1fb09 100644 --- a/examples/andriod_assistant/tests/test_for_an.py +++ b/examples/andriod_assistant/tests/test_for_an.py @@ -1,15 +1,17 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # @Desc : test on android emulator action. After Modify Role Test, this script is discarded. + import asyncio import time from pathlib import Path -from actions.manual_record import ManualRecord -from actions.parse_record import ParseRecord -from actions.screenshot_parse import ScreenshotParse -from actions.self_learn_and_reflect import SelfLearnAndReflect - +from examples.andriod_assistant.actions.manual_record import ManualRecord +from examples.andriod_assistant.actions.parse_record import ParseRecord +from examples.andriod_assistant.actions.screenshot_parse import ScreenshotParse +from examples.andriod_assistant.actions.self_learn_and_reflect import ( + SelfLearnAndReflect, +) from metagpt.environment.android_env.android_env import AndroidEnv TASK_PATH = Path("apps/Contacts") @@ -77,4 +79,3 @@ if __name__ == "__main__": loop.run_until_complete(asyncio.gather(*test_action_list)) loop.close() - print("Finish") diff --git a/examples/andriod_assistant/utils/const.py b/examples/andriod_assistant/utils/const.py new file mode 100644 index 000000000..5c373a6f9 --- /dev/null +++ b/examples/andriod_assistant/utils/const.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @Desc : + +ADB_EXEC_FAIL = "FAILED"