diff --git a/examples/andriod_assistant/actions/screenshot_parse.py b/examples/andriod_assistant/actions/screenshot_parse.py index 3c5691a9b..40082bc04 100644 --- a/examples/andriod_assistant/actions/screenshot_parse.py +++ b/examples/andriod_assistant/actions/screenshot_parse.py @@ -19,13 +19,14 @@ from examples.andriod_assistant.utils.schema import ( OpLogItem, RunState, SwipeGridOp, - SwipeOp, + SwipeOp_3, TapGridOp, TapOp, TextOp, ) from examples.andriod_assistant.utils.utils import ( area_to_xy, + draw_grid, draw_bbox_multi, elem_bbox_to_xy, screenshot_parse_extract, @@ -47,9 +48,9 @@ class ScreenshotParse(Action): return "" ui_doc = """ -You also have access to the following documentations that describes the functionalities of UI -elements you can interact on the screen. These docs are crucial for you to determine the target of your -next action. You should always prioritize these documented elements for interaction:""" + You also have access to the following documentations that describes the functionalities of UI + elements you can interact on the screen. These docs are crucial for you to determine the target of your + next action. You should always prioritize these documented elements for interaction:""" for i, elem in enumerate(elem_list): doc_path = docs_idr.joinpath(f"{elem.uid}.txt") if not doc_path.exists(): @@ -78,23 +79,32 @@ next action. You should always prioritize these documented elements for interact return ui_doc async def run( - self, - round_count: int, - task_desc: str, - last_act: str, - task_dir: Path, - docs_dir: Path, - grid_on: bool, - env: AndroidEnv, + self, + round_count: int, + task_desc: str, + last_act: str, + task_dir: Path, + docs_dir: Path, + grid_on: bool, + env: AndroidEnv, ): - screenshot_path: Path = env.step( + for path in [task_dir, docs_dir]: + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + + screenshot_path: Path = env.observe( EnvAPIAbstract( - api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir} + api_name="get_screenshot", + kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir} ) ) - xml_path: Path = env.step( - EnvAPIAbstract(api_name="get_xml", kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir}) + xml_path: Path = env.observe( + EnvAPIAbstract( + api_name="get_xml", + kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir} + ) ) + width, height = env.device_shape if not screenshot_path.exists() or not xml_path.exists(): return AndroidActionOutput(action_state=RunState.FAIL) @@ -111,18 +121,23 @@ next action. You should always prioritize these documented elements for interact bbox = e.bbox center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2 dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5 - if dist <= config.get_other("min_dist"): + # TODO Modify config to default 30. It should be modified back config after single action test + # if dist <= config.get_other("min_dist"): + if dist <= 30: close = True break if not close: elem_list.append(elem) - screenshot_labeled_path = task_dir.joinpath(f"{task_dir}_{round_count}_labeled.png") + screenshot_labeled_path = task_dir.joinpath(f"{round_count}_labeled.png") draw_bbox_multi(screenshot_path, screenshot_labeled_path, elem_list) img_base64 = encode_image(screenshot_labeled_path) parse_template = screenshot_parse_with_grid_template if grid_on else screenshot_parse_template + if grid_on: + rows, cols = draw_grid(screenshot_path, task_dir / f"{round_count}_grid.png") + ui_doc = self._makeup_ui_document(elem_list, docs_dir) context = parse_template.format(ui_document=ui_doc, task_description=task_desc, last_act=last_act) node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64]) @@ -131,7 +146,7 @@ next action. You should always prioritize these documented elements for interact return AndroidActionOutput(action_state=RunState.FAIL) prompt = node.compile(context=context, schema="json", mode="auto") - OpLogItem(step=round_count, prompt=prompt, image=screenshot_labeled_path, response=node.content) + OpLogItem(step=round_count, prompt=prompt, image=str(screenshot_labeled_path), response=node.content) op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on) if op_param.param_state == RunState.FINISH: @@ -141,23 +156,24 @@ next action. You should always prioritize these documented elements for interact if isinstance(op_param, TapOp): x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = env.step(EnvAPIAbstract("system_tap", kwargs={"x": x, "y": y})) + res = env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y})) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) elif isinstance(op_param, TextOp): - res = env.step(EnvAPIAbstract("user_input", kwargs={"input_txt": op_param.input_str})) + res = env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str})) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) elif isinstance(op_param, LongPressOp): x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = env.step(EnvAPIAbstract("user_longpress", kwargs={"x": x, "y": y})) + res = env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y})) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) - elif isinstance(op_param, SwipeOp): + elif isinstance(op_param, SwipeOp_3): x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) res = env.step( EnvAPIAbstract( - "user_swipe", kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist} + api_name="user_swipe", + kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist} ) ) if res == ADB_EXEC_FAIL: @@ -167,18 +183,19 @@ next action. You should always prioritize these documented elements for interact elif isinstance(op_param, TapGridOp) or isinstance(op_param, LongPressGridOp): x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols) if isinstance(op_param, TapGridOp): - res = env.step(EnvAPIAbstract("system_tap", kwargs={"x": x, "y": y})) + res = env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y})) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) else: # LongPressGridOp - res = env.step(EnvAPIAbstract("user_longpress", kwargs={"x": x, "y": y})) + res = env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y})) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) elif isinstance(op_param, SwipeGridOp): - start_x, start_y = area_to_xy(op_param.start_area, op_param.start_subarea) - end_x, end_y = area_to_xy(op_param.end_area, op_param.end_subarea) - res = env.step(EnvAPIAbstract("user_swipe_to", kwargs={"start": (start_x, start_y), "end": (end_x, end_y)})) + start_x, start_y = area_to_xy(op_param.start_area, op_param.start_subarea, width, height, rows, cols) + end_x, end_y = area_to_xy(op_param.end_area, op_param.end_subarea, width, height, rows, cols) + res = env.step( + EnvAPIAbstract(api_name="user_swipe_to", kwargs={"start": (start_x, start_y), "end": (end_x, end_y)})) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) diff --git a/examples/andriod_assistant/test_for_an.py b/examples/andriod_assistant/test_for_an.py index d526192df..8f6fb9b91 100644 --- a/examples/andriod_assistant/test_for_an.py +++ b/examples/andriod_assistant/test_for_an.py @@ -7,6 +7,7 @@ from pathlib import Path from actions.manual_record import ManualRecord from actions.parse_record import ParseRecord from actions.self_learn_and_reflect import SelfLearnAndReflect +from actions.screenshot_parse import ScreenshotParse from metagpt.environment.android_env.android_env import AndroidEnv TASK_PATH = Path("apps/Contacts") @@ -29,64 +30,49 @@ test_env_manual_learn_android = AndroidEnv( test_manual_record = ManualRecord() test_manual_parse = ParseRecord() -# 虚拟机效果实现 -# 不同 Action Node 结果符合预期(Action Node) +test_env_screenshot_parse_android = AndroidEnv( + device_id="emulator-5554", + xml_dir=Path("/sdcard"), + screenshot_dir=Path("/sdcard/Pictures/Screenshots"), +) +test_screenshot_parse = ScreenshotParse() if __name__ == "__main__": loop = asyncio.get_event_loop() - loop.run_until_complete( + + test_action_list = [ + test_self_learning.run( + round_count=20, + task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ", + last_act="", + task_dir=TASK_PATH / "demos" / f"self_learning_{DEMO_NAME}", + docs_dir=SELF_EXPLORE_DOC_PATH, + env=test_env_self_learn_android + ), + test_manual_record.run( + demo_name=DEMO_NAME, + task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}", + task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ", + env=test_env_manual_learn_android + ), test_manual_parse.run( app_name="Contacts", - demo_name="1708753998.5757847", - task_dir=TASK_PATH / "demos" / f"manual_record_1708753998.5757847", # 修要修改 - docs_dir=PARSE_RECORD_DOC_PATH, # 需要修改 + demo_name=DEMO_NAME, + task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}", # 修要修改 + docs_dir=PARSE_RECORD_DOC_PATH, # 需要修改 env=test_env_manual_learn_android - )) - # test_action_list = [ - # # test_self_learning.run( - # # round_count=20, - # # task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ", - # # last_act="", - # # task_dir= TASK_PATH / "demos" / f"self_learning_{DEMO_NAME}", - # # docs_dir=DOC_PATH, - # # env=test_env_self_learn_android - # # ), - # test_manual_record.run( - # demo_name=DEMO_NAME, - # task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}", - # task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ", - # env=test_env_manual_learn_android - # ), - # test_manual_parse.run( - # app_name="Contacts", - # demo_name=DEMO_NAME, - # task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}", # 修要修改 - # docs_dir=PARSE_RECORD_DOC_PATH, # 需要修改 - # env=test_env_manual_learn_android - # ) - # ] - # test_action_list = [ - # test_self_learning.run( - # round_count=20, - # task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ", - # last_act="", - # task_dir=TASK_PATH, - # docs_dir=DOC_PATH, - # env=test_env_self_learn_android - # ), - # test_manual_record.run( - # demo_name=DEMO_NAME, - # task_dir=TASK_PATH, - # env=test_env_manual_learn_android - # ), - # test_manual_parse.run( - # app_name="Contacts", - # demo_name=DEMO_NAME, - # task_dir=TASK_PATH, - # docs_dir=DOC_PATH, - # env=test_env_manual_learn_android - # ) - # ] - # loop.run_until_complete(asyncio.gather(*test_action_list)) + ), + test_screenshot_parse.run( + round_count=20, + task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ", + last_act="", + task_dir=TASK_PATH / f"act_{DEMO_NAME}", + docs_dir=PARSE_RECORD_DOC_PATH, + env=test_env_screenshot_parse_android, + grid_on=False + ) + ] + + loop.run_until_complete(asyncio.gather(*test_action_list)) loop.close() print("Finish") diff --git a/examples/andriod_assistant/utils/utils.py b/examples/andriod_assistant/utils/utils.py index d696ac4f0..b82c656a4 100644 --- a/examples/andriod_assistant/utils/utils.py +++ b/examples/andriod_assistant/utils/utils.py @@ -163,7 +163,7 @@ def area_to_xy(area: int, subarea: str, width: int, height: int, rows: int, cols return x, y -def elem_bbox_to_xy(bbox: tuple[tuple[int, int]]) -> tuple[int, int]: +def elem_bbox_to_xy(bbox: tuple[tuple[int, int], tuple[int, int]]) -> tuple[int, int]: tl, br = bbox x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2 return x, y