From 13cf80b46ae55e2350d93d617b7bbd767ad447ce Mon Sep 17 00:00:00 2001
From: Jiayi Zhang <didi4goooogle@gmail.com>
Date: Thu, 22 Feb 2024 17:57:25 +0800
Subject: [PATCH] Update the test for AppAgent's self_learn_and_reflect action

---
 .../actions/self_learn_and_reflect.py         | 26 ++++++---
 examples/andriod_assistant/test_for_an.py     | 43 +++++++++------
 examples/andriod_assistant/utils/schema.py    |  2 +-
 examples/andriod_assistant/utils/utils.py     |  8 +--
 metagpt/actions/action_node.py                | 54 +++++++++----------
 5 files changed, 77 insertions(+), 56 deletions(-)

diff --git a/examples/andriod_assistant/actions/self_learn_and_reflect.py b/examples/andriod_assistant/actions/self_learn_and_reflect.py
index caba53150..cf3ed91ae 100644
--- a/examples/andriod_assistant/actions/self_learn_and_reflect.py
+++ b/examples/andriod_assistant/actions/self_learn_and_reflect.py
@@ -61,12 +61,15 @@ class SelfLearnAndReflect(Action):
             self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
     ) -> AndroidActionOutput:
         resp = await self.run_self_learn(round_count, task_desc, last_act, task_dir, env)
+        print(resp)
         resp = await self.run_reflect(round_count, task_desc, last_act, task_dir, docs_dir, env)
+        print(resp)
         return resp
 
     async def run_self_learn(
             self, round_count: int, task_desc: str, last_act: str, task_dir: Path, env: AndroidEnv
     ) -> AndroidActionOutput:
+        logger.info('run_self_learn')
         screenshot_path: Path = env.observe(
             EnvAPIAbstract(
                 api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir}
@@ -80,6 +83,7 @@ class SelfLearnAndReflect(Action):
 
         clickable_list = []
         focusable_list = []
+        # TODO Tuple Bug: start debugging from here
         # TODO Tuple Bug
         traverse_xml_tree(xml_path, clickable_list, "clickable", True)
         traverse_xml_tree(xml_path, focusable_list, "focusable", True)
@@ -98,7 +102,9 @@ class SelfLearnAndReflect(Action):
                 bbox = e.bbox
                 center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
                 dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
-                if dist <= config.get_other("min_dist"):
+                # TODO: min_dist is temporarily hard-coded to 30; switch back to the config value after the single-action test
+                # if dist <= config.get_other("min_dist"):
+                if dist <= 30:
                     close = True
                     break
             if not close:
@@ -113,10 +119,12 @@ class SelfLearnAndReflect(Action):
         context = self_explore_template.format(task_description=task_desc, last_act=last_act)
 
         node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64])
+        print(f"fill result:{node}")
         if "error" in node.content:
             return AndroidActionOutput(action_state=RunState.FAIL)
         prompt = node.compile(context=context, schema="json", mode="auto")
-        OpLogItem(step=round_count, prompt=prompt, image=screenshot_before_labeled_path, response=node.content)
+        # Convert the Path (e.g. WindowsPath) to str for the image field
+        OpLogItem(step=round_count, prompt=prompt, image=str(screenshot_before_labeled_path), response=node.content)
         op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on=False)
         if op_param.param_state == RunState.FINISH:
             return AndroidActionOutput(action_state=RunState.FINISH)
@@ -126,17 +134,17 @@ class SelfLearnAndReflect(Action):
         if isinstance(op_param, TapOp):
             self.ui_area = op_param.area
             x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
-            res = env.step(EnvAPIAbstract("system_tap", kwargs={"x": x, "y": y}))
+            res = env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
             if res == ADB_EXEC_FAIL:
                 return AndroidActionOutput(action_state=RunState.FAIL)
         elif isinstance(op_param, TextOp):
-            res = env.step(EnvAPIAbstract("user_input", kwargs={"input_txt": op_param.input_str}))
+            res = env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str}))
             if res == ADB_EXEC_FAIL:
                 return AndroidActionOutput(action_state=RunState.FAIL)
         elif isinstance(op_param, LongPressOp):
             self.ui_area = op_param.area
             x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
-            res = env.step(EnvAPIAbstract("user_longpress", kwargs={"x": x, "y": y}))
+            res = env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
             if res == ADB_EXEC_FAIL:
                 return AndroidActionOutput(action_state=RunState.FAIL)
         elif isinstance(op_param, SwipeOp):
@@ -158,6 +166,7 @@ class SelfLearnAndReflect(Action):
     async def run_reflect(
             self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
     ) -> AndroidActionOutput:
+        logger.info("run_reflect")
         screenshot_path: Path = env.observe(
             EnvAPIAbstract(
                 api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_after", "local_save_dir": task_dir}
@@ -170,6 +179,7 @@ class SelfLearnAndReflect(Action):
         draw_bbox_multi(screenshot_path, screenshot_after_labeled_path, elem_list=self.elem_list)
         img_base64 = encode_image(screenshot_after_labeled_path)
 
+        logger.info(f"act_name: {self.act_name}")
         if self.act_name == ActionOp.TAP.value:
             action = "tapping"
         elif self.act_name == ActionOp.LONG_PRESS.value:
@@ -194,8 +204,8 @@ class SelfLearnAndReflect(Action):
         ReflectLogItem(
             step=round_count,
             prompt=prompt,
-            image_before=self.screenshot_before_path,
-            image_after=screenshot_after_labeled_path,
+            image_before=str(self.screenshot_before_path),
+            image_after=str(screenshot_after_labeled_path),
             response=node.content,
         )
 
@@ -214,7 +224,7 @@ class SelfLearnAndReflect(Action):
                 self.useless_list.append(resource_id)
                 last_act = "NONE"
                 if op_param.decision == Decision.BACK.value:
-                    res = env.step(EnvAPIAbstract("system_back"))
+                    res = env.step(EnvAPIAbstract(api_name="system_back"))
                     if res == ADB_EXEC_FAIL:
                         return AndroidActionOutput(action_state=RunState.FAIL)
             doc = op_param.documentation
diff --git a/examples/andriod_assistant/test_for_an.py b/examples/andriod_assistant/test_for_an.py
index dd3d90b6a..9ab0d4bc0 100644
--- a/examples/andriod_assistant/test_for_an.py
+++ b/examples/andriod_assistant/test_for_an.py
@@ -34,7 +34,7 @@ test_manual_parse = ParseRecord()
 
 if __name__ == "__main__":
     loop = asyncio.get_event_loop()
-    test_action_list = [
+    loop.run_until_complete(
         test_self_learning.run(
             round_count=20,
             task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
@@ -42,20 +42,31 @@ if __name__ == "__main__":
             task_dir=TASK_PATH,
             docs_dir=DOC_PATH,
             env=test_env_self_learn_android
-        ),
-        # test_manual_record.run(
-        #     demo_name=DEMO_NAME,
-        #     task_dir=TASK_PATH,
-        #     env=test_env_manual_learn_android
-        # ),
-        # test_manual_parse.run(
-        #     app_name="Contacts",
-        #     demo_name=DEMO_NAME,
-        #     task_dir=TASK_PATH,
-        #     docs_dir=DOC_PATH,
-        #     env=test_env_manual_learn_android
-        # )
-    ]
-    loop.run_until_complete(asyncio.gather(*test_action_list))
+        )
+    )
+
+    # test_action_list = [
+    #     test_self_learning.run(
+    #         round_count=20,
+    #         task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
+    #         last_act="",
+    #         task_dir=TASK_PATH,
+    #         docs_dir=DOC_PATH,
+    #         env=test_env_self_learn_android
+    #     ),
+    #     test_manual_record.run(
+    #         demo_name=DEMO_NAME,
+    #         task_dir=TASK_PATH,
+    #         env=test_env_manual_learn_android
+    #     ),
+    #     test_manual_parse.run(
+    #         app_name="Contacts",
+    #         demo_name=DEMO_NAME,
+    #         task_dir=TASK_PATH,
+    #         docs_dir=DOC_PATH,
+    #         env=test_env_manual_learn_android
+    #     )
+    # ]
+    # loop.run_until_complete(asyncio.gather(*test_action_list))
     loop.close()
     print("Finish")
diff --git a/examples/andriod_assistant/utils/schema.py b/examples/andriod_assistant/utils/schema.py
index dae8c67d6..75396ac6a 100644
--- a/examples/andriod_assistant/utils/schema.py
+++ b/examples/andriod_assistant/utils/schema.py
@@ -38,7 +38,7 @@ class Decision(Enum):
 class AndroidElement(BaseModel):
     """UI Element"""
     uid: str = Field(default="")
-    bbox: tuple[tuple[int, int]] = Field(default={})
+    bbox: tuple[tuple[int, int], tuple[int, int]] = Field(default={})
     attrib: str = Field(default="")
 
 
diff --git a/examples/andriod_assistant/utils/utils.py b/examples/andriod_assistant/utils/utils.py
index f828e7355..bddb75f99 100644
--- a/examples/andriod_assistant/utils/utils.py
+++ b/examples/andriod_assistant/utils/utils.py
@@ -55,7 +55,9 @@ def traverse_xml_tree(xml_path: Path, elem_list: list[AndroidElement], attrib: s
                     bbox = e.bbox
                     center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
                     dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
-                    if dist <= config.get_other("min_dist"):
+                    # TODO: min_dist is temporarily hard-coded to 30; switch back to the config value after the single-action test
+                    # if dist <= config.get_other("min_dist"):
+                    if dist <= 30:
                         close = True
                         break
                 if not close:
@@ -67,7 +69,7 @@ def traverse_xml_tree(xml_path: Path, elem_list: list[AndroidElement], attrib: s
 
 def draw_bbox_multi(img_path: Path, output_path: Path, elem_list: list[AndroidElement], record_mode: bool = False,
                     dark_mode: bool = False):
-    imgcv = cv2.imread(img_path)
+    imgcv = cv2.imread(str(img_path))
     count = 1
     for elem in elem_list:
         try:
@@ -97,7 +99,7 @@ def draw_bbox_multi(img_path: Path, output_path: Path, elem_list: list[AndroidEl
         except Exception as e:
             logger.error(f"ERROR: An exception occurs while labeling the image\n{e}")
         count += 1
-    cv2.imwrite(output_path, imgcv)
+    cv2.imwrite(str(output_path), imgcv)
     return imgcv
 
 
diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py
index 6334fefc5..6c23c4c70 100644
--- a/metagpt/actions/action_node.py
+++ b/metagpt/actions/action_node.py
@@ -39,7 +39,6 @@ TAG = "CONTENT"
 LANGUAGE_CONSTRAINT = "Language: Please use the same language as Human INPUT."
 FORMAT_CONSTRAINT = f"Format: output wrapped inside [{TAG}][/{TAG}] like format example, nothing else."
 
-
 SIMPLE_TEMPLATE = """
 ## context
 {context}
@@ -141,14 +140,14 @@ class ActionNode:
     instruct_content: BaseModel
 
     def __init__(
-        self,
-        key: str,
-        expected_type: Type,
-        instruction: str,
-        example: Any,
-        content: str = "",
-        children: dict[str, "ActionNode"] = None,
-        schema: str = "",
+            self,
+            key: str,
+            expected_type: Type,
+            instruction: str,
+            example: Any,
+            content: str = "",
+            children: dict[str, "ActionNode"] = None,
+            schema: str = "",
     ):
         self.key = key
         self.expected_type = expected_type
@@ -350,14 +349,14 @@ class ActionNode:
         after=general_after_log(logger),
     )
     async def _aask_v1(
-        self,
-        prompt: str,
-        output_class_name: str,
-        output_data_mapping: dict,
-        images: Optional[Union[str, list[str]]] = None,
-        system_msgs: Optional[list[str]] = None,
-        schema="markdown",  # compatible to original format
-        timeout=3,
+            self,
+            prompt: str,
+            output_class_name: str,
+            output_data_mapping: dict,
+            images: Optional[Union[str, list[str]]] = None,
+            system_msgs: Optional[list[str]] = None,
+            schema="markdown",  # compatible to original format
+            timeout=3,
     ) -> (str, BaseModel):
         """Use ActionOutput to wrap the output of aask"""
         content = await self.llm.aask(prompt, system_msgs, images=images, timeout=timeout)
@@ -391,7 +390,6 @@ class ActionNode:
 
     async def simple_fill(self, schema, mode, images: Optional[Union[str, list[str]]] = None, timeout=3, exclude=None):
         prompt = self.compile(context=self.context, schema=schema, mode=mode, exclude=exclude)
-
         if schema != "raw":
             mapping = self.get_mapping(mode, exclude=exclude)
             class_name = f"{self.key}_AN"
@@ -408,15 +406,15 @@ class ActionNode:
         return self
 
     async def fill(
-        self,
-        context,
-        llm,
-        schema="json",
-        mode="auto",
-        strgy="simple",
-        images: Optional[Union[str, list[str]]] = None,
-        timeout=3,
-        exclude=[],
+            self,
+            context,
+            llm,
+            schema="json",
+            mode="auto",
+            strgy="simple",
+            images: Optional[Union[str, list[str]]] = None,
+            timeout=3,
+            exclude=[],
     ):
         logger.info("进入fill")
         """Fill the node(s) with mode.
@@ -562,7 +560,7 @@ class ActionNode:
         return nodes_output
 
     async def auto_revise(
-        self, revise_mode: ReviseMode = ReviseMode.AUTO, template: str = REVISE_TEMPLATE
+            self, revise_mode: ReviseMode = ReviseMode.AUTO, template: str = REVISE_TEMPLATE
     ) -> dict[str, str]:
         """revise the value of incorrect keys"""
         # generate review comments