From 13cf80b46ae55e2350d93d617b7bbd767ad447ce Mon Sep 17 00:00:00 2001 From: Jiayi Zhang Date: Thu, 22 Feb 2024 17:57:25 +0800 Subject: [PATCH] Update AppAgent's self_learn_and_self_reflect's test --- .../actions/self_learn_and_reflect.py | 26 ++++++--- examples/andriod_assistant/test_for_an.py | 43 +++++++++------ examples/andriod_assistant/utils/schema.py | 2 +- examples/andriod_assistant/utils/utils.py | 8 +-- metagpt/actions/action_node.py | 54 +++++++++---------- 5 files changed, 77 insertions(+), 56 deletions(-) diff --git a/examples/andriod_assistant/actions/self_learn_and_reflect.py b/examples/andriod_assistant/actions/self_learn_and_reflect.py index caba53150..cf3ed91ae 100644 --- a/examples/andriod_assistant/actions/self_learn_and_reflect.py +++ b/examples/andriod_assistant/actions/self_learn_and_reflect.py @@ -61,12 +61,15 @@ class SelfLearnAndReflect(Action): self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv ) -> AndroidActionOutput: resp = await self.run_self_learn(round_count, task_desc, last_act, task_dir, env) + print(resp) resp = await self.run_reflect(round_count, task_desc, last_act, task_dir, docs_dir, env) + print(resp) return resp async def run_self_learn( self, round_count: int, task_desc: str, last_act: str, task_dir: Path, env: AndroidEnv ) -> AndroidActionOutput: + logger.info('run_self_learn') screenshot_path: Path = env.observe( EnvAPIAbstract( api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir} @@ -80,6 +83,7 @@ class SelfLearnAndReflect(Action): clickable_list = [] focusable_list = [] + # TODO Tuple Bug 从这里开始 Debug # TODO Tuple Bug traverse_xml_tree(xml_path, clickable_list, "clickable", True) traverse_xml_tree(xml_path, focusable_list, "focusable", True) @@ -98,7 +102,9 @@ class SelfLearnAndReflect(Action): bbox = e.bbox center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2 dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5 - if dist <= config.get_other("min_dist"): + # TODO Modify config to default 30. It should be modified back config after single action test + # if dist <= config.get_other("min_dist"): + if dist <= 30: close = True break if not close: @@ -113,10 +119,12 @@ class SelfLearnAndReflect(Action): context = self_explore_template.format(task_description=task_desc, last_act=last_act) node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64]) + print(f"fill result:{node}") if "error" in node.content: return AndroidActionOutput(action_state=RunState.FAIL) prompt = node.compile(context=context, schema="json", mode="auto") - OpLogItem(step=round_count, prompt=prompt, image=screenshot_before_labeled_path, response=node.content) + # Modify WindowsPath to Str + OpLogItem(step=round_count, prompt=prompt, image=str(screenshot_before_labeled_path), response=node.content) op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on=False) if op_param.param_state == RunState.FINISH: return AndroidActionOutput(action_state=RunState.FINISH) @@ -126,17 +134,17 @@ class SelfLearnAndReflect(Action): if isinstance(op_param, TapOp): self.ui_area = op_param.area x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = env.step(EnvAPIAbstract("system_tap", kwargs={"x": x, "y": y})) + res = env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y})) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) elif isinstance(op_param, TextOp): - res = env.step(EnvAPIAbstract("user_input", kwargs={"input_txt": op_param.input_str})) + res = env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str})) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) elif isinstance(op_param, LongPressOp): self.ui_area = op_param.area x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) - res = env.step(EnvAPIAbstract("user_longpress", kwargs={"x": x, "y": y})) + res = env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y})) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) elif isinstance(op_param, SwipeOp): @@ -158,6 +166,7 @@ class SelfLearnAndReflect(Action): async def run_reflect( self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv ) -> AndroidActionOutput: + logger.info("run_reflect") screenshot_path: Path = env.observe( EnvAPIAbstract( api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_after", "local_save_dir": task_dir} @@ -170,6 +179,7 @@ class SelfLearnAndReflect(Action): draw_bbox_multi(screenshot_path, screenshot_after_labeled_path, elem_list=self.elem_list) img_base64 = encode_image(screenshot_after_labeled_path) + logger.info(f"act_name: {self.act_name}") if self.act_name == ActionOp.TAP.value: action = "tapping" elif self.act_name == ActionOp.LONG_PRESS.value: @@ -194,8 +204,8 @@ class SelfLearnAndReflect(Action): ReflectLogItem( step=round_count, prompt=prompt, - image_before=self.screenshot_before_path, - image_after=screenshot_after_labeled_path, + image_before=str(self.screenshot_before_path), + image_after=str(screenshot_after_labeled_path), response=node.content, ) @@ -214,7 +224,7 @@ class SelfLearnAndReflect(Action): self.useless_list.append(resource_id) last_act = "NONE" if op_param.decision == Decision.BACK.value: - res = env.step(EnvAPIAbstract("system_back")) + res = env.step(EnvAPIAbstract(api_name="system_back")) if res == ADB_EXEC_FAIL: return AndroidActionOutput(action_state=RunState.FAIL) doc = op_param.documentation diff --git a/examples/andriod_assistant/test_for_an.py b/examples/andriod_assistant/test_for_an.py index dd3d90b6a..9ab0d4bc0 100644 --- a/examples/andriod_assistant/test_for_an.py +++ b/examples/andriod_assistant/test_for_an.py @@ -34,7 +34,7 @@ test_manual_parse = ParseRecord() if __name__ == "__main__": loop = asyncio.get_event_loop() - test_action_list = [ + loop.run_until_complete( test_self_learning.run( round_count=20, task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ", @@ -42,20 +42,31 @@ if __name__ == "__main__": task_dir=TASK_PATH, docs_dir=DOC_PATH, env=test_env_self_learn_android - ), - # test_manual_record.run( - # demo_name=DEMO_NAME, - # task_dir=TASK_PATH, - # env=test_env_manual_learn_android - # ), - # test_manual_parse.run( - # app_name="Contacts", - # demo_name=DEMO_NAME, - # task_dir=TASK_PATH, - # docs_dir=DOC_PATH, - # env=test_env_manual_learn_android - # ) - ] - loop.run_until_complete(asyncio.gather(*test_action_list)) + ) + ) + + # test_action_list = [ + # test_self_learning.run( + # round_count=20, + # task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ", + # last_act="", + # task_dir=TASK_PATH, + # docs_dir=DOC_PATH, + # env=test_env_self_learn_android + # ), + # test_manual_record.run( + # demo_name=DEMO_NAME, + # task_dir=TASK_PATH, + # env=test_env_manual_learn_android + # ), + # test_manual_parse.run( + # app_name="Contacts", + # demo_name=DEMO_NAME, + # task_dir=TASK_PATH, + # docs_dir=DOC_PATH, + # env=test_env_manual_learn_android + # ) + # ] + # loop.run_until_complete(asyncio.gather(*test_action_list)) loop.close() print("Finish") diff --git a/examples/andriod_assistant/utils/schema.py b/examples/andriod_assistant/utils/schema.py index dae8c67d6..75396ac6a 100644 --- a/examples/andriod_assistant/utils/schema.py +++ b/examples/andriod_assistant/utils/schema.py @@ -38,7 +38,7 @@ class Decision(Enum): class AndroidElement(BaseModel): """UI Element""" uid: str = Field(default="") - bbox: tuple[tuple[int, int]] = Field(default={}) + bbox: tuple[tuple[int, int], tuple[int, int]] = Field(default={}) attrib: str = Field(default="") diff --git a/examples/andriod_assistant/utils/utils.py b/examples/andriod_assistant/utils/utils.py index f828e7355..bddb75f99 100644 --- a/examples/andriod_assistant/utils/utils.py +++ b/examples/andriod_assistant/utils/utils.py @@ -55,7 +55,9 @@ def traverse_xml_tree(xml_path: Path, elem_list: list[AndroidElement], attrib: s bbox = e.bbox center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2 dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5 - if dist <= config.get_other("min_dist"): + # TODO Modify config to default 30. It should be modified back config after single action test + # if dist <= config.get_other("min_dist"): + if dist <= 30: close = True break if not close: @@ -67,7 +69,7 @@ def traverse_xml_tree(xml_path: Path, elem_list: list[AndroidElement], attrib: s def draw_bbox_multi(img_path: Path, output_path: Path, elem_list: list[AndroidElement], record_mode: bool = False, dark_mode: bool = False): - imgcv = cv2.imread(img_path) + imgcv = cv2.imread(str(img_path)) count = 1 for elem in elem_list: try: @@ -97,7 +99,7 @@ def draw_bbox_multi(img_path: Path, output_path: Path, elem_list: list[AndroidEl except Exception as e: logger.error(f"ERROR: An exception occurs while labeling the image\n{e}") count += 1 - cv2.imwrite(output_path, imgcv) + cv2.imwrite(str(output_path), imgcv) return imgcv diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py index 6334fefc5..6c23c4c70 100644 --- a/metagpt/actions/action_node.py +++ b/metagpt/actions/action_node.py @@ -39,7 +39,6 @@ TAG = "CONTENT" LANGUAGE_CONSTRAINT = "Language: Please use the same language as Human INPUT." FORMAT_CONSTRAINT = f"Format: output wrapped inside [{TAG}][/{TAG}] like format example, nothing else." - SIMPLE_TEMPLATE = """ ## context {context} @@ -141,14 +140,14 @@ class ActionNode: instruct_content: BaseModel def __init__( - self, - key: str, - expected_type: Type, - instruction: str, - example: Any, - content: str = "", - children: dict[str, "ActionNode"] = None, - schema: str = "", + self, + key: str, + expected_type: Type, + instruction: str, + example: Any, + content: str = "", + children: dict[str, "ActionNode"] = None, + schema: str = "", ): self.key = key self.expected_type = expected_type @@ -350,14 +349,14 @@ class ActionNode: after=general_after_log(logger), ) async def _aask_v1( - self, - prompt: str, - output_class_name: str, - output_data_mapping: dict, - images: Optional[Union[str, list[str]]] = None, - system_msgs: Optional[list[str]] = None, - schema="markdown", # compatible to original format - timeout=3, + self, + prompt: str, + output_class_name: str, + output_data_mapping: dict, + images: Optional[Union[str, list[str]]] = None, + system_msgs: Optional[list[str]] = None, + schema="markdown", # compatible to original format + timeout=3, ) -> (str, BaseModel): """Use ActionOutput to wrap the output of aask""" content = await self.llm.aask(prompt, system_msgs, images=images, timeout=timeout) @@ -391,7 +390,6 @@ class ActionNode: async def simple_fill(self, schema, mode, images: Optional[Union[str, list[str]]] = None, timeout=3, exclude=None): prompt = self.compile(context=self.context, schema=schema, mode=mode, exclude=exclude) - if schema != "raw": mapping = self.get_mapping(mode, exclude=exclude) class_name = f"{self.key}_AN" @@ -408,15 +406,15 @@ class ActionNode: return self async def fill( - self, - context, - llm, - schema="json", - mode="auto", - strgy="simple", - images: Optional[Union[str, list[str]]] = None, - timeout=3, - exclude=[], + self, + context, + llm, + schema="json", + mode="auto", + strgy="simple", + images: Optional[Union[str, list[str]]] = None, + timeout=3, + exclude=[], ): logger.info("进入fill") """Fill the node(s) with mode. @@ -562,7 +560,7 @@ class ActionNode: return nodes_output async def auto_revise( - self, revise_mode: ReviseMode = ReviseMode.AUTO, template: str = REVISE_TEMPLATE + self, revise_mode: ReviseMode = ReviseMode.AUTO, template: str = REVISE_TEMPLATE ) -> dict[str, str]: """revise the value of incorrect keys""" # generate review comments