update andriod_assistant with obs/action space

This commit is contained in:
better629 2024-03-27 14:59:20 +08:00
parent cdb4d48191
commit 53d0b70fa0
8 changed files with 102 additions and 113 deletions

View file

@ -6,6 +6,7 @@ from pathlib import Path
import cv2
from examples.andriod_assistant.utils.const import ADB_EXEC_FAIL
from examples.andriod_assistant.utils.schema import (
ActionOp,
AndroidActionOutput,
@ -15,9 +16,13 @@ from examples.andriod_assistant.utils.schema import (
from examples.andriod_assistant.utils.utils import draw_bbox_multi, traverse_xml_tree
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.const import ADB_EXEC_FAIL
from metagpt.environment.android_env.android_env import AndroidEnv
from metagpt.environment.api.env_api import EnvAPIAbstract
from metagpt.environment.android_env.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
)
from metagpt.logs import logger
@ -53,19 +58,13 @@ class ManualRecord(Action):
step = 0
while True:
step += 1
screenshot_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_screenshot",
# kwargs={"ss_name": f"{demo_name}_{step}", "local_save_dir": self.screenshot_before_path}
kwargs={"ss_name": f"{step}", "local_save_dir": self.screenshot_before_path},
screenshot_path: Path = env.observe(
EnvObsParams(
obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{step}", local_save_dir=self.screenshot_before_path
)
)
xml_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_xml",
# kwargs={"xml_name": f"{demo_name}_{step}", "local_save_dir": self.xml_path}
kwargs={"xml_name": f"{step}", "local_save_dir": self.xml_path},
)
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{step}", local_save_dir=self.xml_path)
)
if not screenshot_path.exists() or not xml_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
@ -103,8 +102,8 @@ class ManualRecord(Action):
user_input = "xxx"
logger.info(
"Choose one of the following actions you want to perform on the current screen:\ntap, text, long_press,"
"swipe, stop",
"Choose one of the following actions you want to perform on the current screen:\n"
"tap, text, long_press, swipe, stop",
"blue",
)
@ -126,10 +125,8 @@ class ManualRecord(Action):
user_input = input()
tl, br = elem_list[int(user_input) - 1].bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
ret = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if ret == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(f"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n")
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
log_str = f"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n"
elif user_input.lower() == ActionOp.TEXT.value:
logger.info(
f"Which element do you want to input the text string? Choose a numeric tag from 1 to "
@ -143,8 +140,8 @@ class ManualRecord(Action):
user_input = ""
while not user_input:
user_input = input()
await env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": user_input}))
record_file.write(f'text({input_area}:sep:"{user_input}"):::{elem_list[int(input_area) - 1].uid}\n')
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=user_input)
log_str = f"text({input_area}:sep:'{user_input}'):::{elem_list[int(input_area) - 1].uid}\n"
elif user_input.lower() == ActionOp.LONG_PRESS.value:
logger.info(
f"Which element do you want to long press? Choose a numeric tag from 1 to {len(elem_list)}:", "blue"
@ -154,14 +151,12 @@ class ManualRecord(Action):
user_input = input()
tl, br = elem_list[int(user_input) - 1].bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
ret = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if ret == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(f"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n")
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
log_str = f"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n"
elif user_input.lower() == ActionOp.SWIPE.value:
logger.info(
"What is the direction of your swipe? Choose one from the following options:\nup, down, left,"
" right",
"What is the direction of your swipe? Choose one from the following options:\n"
"up, down, left, right",
"blue",
)
user_input = ""
@ -178,16 +173,20 @@ class ManualRecord(Action):
user_input = input()
tl, br = elem_list[int(user_input) - 1].bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
ret = await env.step(
EnvAPIAbstract(api_name="user_swipe", kwargs={"x": x, "y": y, "orient": swipe_dir})
)
if ret == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(f"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\n")
action = EnvAction(action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=swipe_dir)
log_str = f"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\n"
elif user_input.lower() == ActionOp.STOP.value:
record_file.write("stop\n")
record_file.close()
break
else:
break
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(log_str)
time.sleep(3)

View file

@ -10,6 +10,7 @@ from examples.andriod_assistant.prompts.assistant_prompt import (
screenshot_parse_template,
screenshot_parse_with_grid_template,
)
from examples.andriod_assistant.utils.const import ADB_EXEC_FAIL
from examples.andriod_assistant.utils.schema import (
AndroidActionOutput,
AndroidElement,
@ -34,9 +35,13 @@ from examples.andriod_assistant.utils.utils import (
)
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.const import ADB_EXEC_FAIL
from metagpt.environment.android_env.android_env import AndroidEnv
from metagpt.environment.api.env_api import EnvAPIAbstract
from metagpt.environment.android_env.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
)
from metagpt.utils.common import encode_image
@ -91,14 +96,11 @@ class ScreenshotParse(Action):
for path in [task_dir, docs_dir]:
if not path.exists():
path.mkdir(parents=True, exist_ok=True)
screenshot_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir}
)
screenshot_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir)
)
xml_path: Path = await env.observe(
EnvAPIAbstract(api_name="get_xml", kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir})
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
)
width, height = env.device_shape
if not screenshot_path.exists() or not xml_path.exists():
@ -150,41 +152,26 @@ class ScreenshotParse(Action):
if isinstance(op_param, TapOp):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
elif isinstance(op_param, TextOp):
res = await env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
elif isinstance(op_param, LongPressOp):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeOp_3):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(
EnvAPIAbstract(
api_name="user_swipe",
kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist},
)
action = EnvAction(
action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
)
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
elif isinstance(op_param, GridOp):
grid_on = True
elif isinstance(op_param, TapGridOp) or isinstance(op_param, LongPressGridOp):
x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols)
if isinstance(op_param, TapGridOp):
res = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
else:
# LongPressGridOp
res = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeGridOp):
start_x, start_y = area_to_xy(
op_param.start_area, op_param.start_subarea, env.width, env.height, env.rows, env.cols
@ -192,11 +179,14 @@ class ScreenshotParse(Action):
end_x, end_y = area_to_xy(
op_param.end_area, op_param.end_subarea, env.width, env.height, env.rows, env.cols
)
res = await env.step(
EnvAPIAbstract(api_name="user_swipe_to", kwargs={"start": (start_x, start_y), "end": (end_x, end_y)})
action = EnvAction(
action_type=EnvActionType.USER_SWIPE_TO, coord=(start_x, start_y), tgt_coord=(end_x, end_y)
)
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
if op_param.act_name != "grid":
grid_on = True

View file

@ -15,6 +15,7 @@ from examples.andriod_assistant.prompts.assistant_prompt import (
from examples.andriod_assistant.prompts.assistant_prompt import (
screenshot_parse_self_explore_template,
)
from examples.andriod_assistant.utils.const import ADB_EXEC_FAIL
from examples.andriod_assistant.utils.schema import (
ActionOp,
AndroidActionOutput,
@ -39,9 +40,13 @@ from examples.andriod_assistant.utils.utils import (
)
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.const import ADB_EXEC_FAIL
from metagpt.environment.android_env.android_env import AndroidEnv
from metagpt.environment.api.env_api import EnvAPIAbstract
from metagpt.environment.android_env.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
)
from metagpt.logs import logger
from metagpt.utils.common import encode_image
@ -71,13 +76,11 @@ class SelfLearnAndReflect(Action):
async def run_self_learn(
self, round_count: int, task_desc: str, last_act: str, task_dir: Path, env: AndroidEnv
) -> AndroidActionOutput:
screenshot_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir}
)
screenshot_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir)
)
xml_path: Path = await env.observe(
EnvAPIAbstract(api_name="get_xml", kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir})
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
)
if not screenshot_path.exists() or not xml_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
@ -116,7 +119,7 @@ class SelfLearnAndReflect(Action):
context = self_explore_template.format(task_description=task_desc, last_act=last_act)
node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64])
print(f"fill result:{node}")
logger.debug(f"fill result:{node}")
if "error" in node.content:
return AndroidActionOutput(action_state=RunState.FAIL)
prompt = node.compile(context=context, schema="json", mode="auto")
@ -132,31 +135,25 @@ class SelfLearnAndReflect(Action):
if isinstance(op_param, TapOp):
self.ui_area = op_param.area
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
elif isinstance(op_param, TextOp):
res = await env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
elif isinstance(op_param, LongPressOp):
self.ui_area = op_param.area
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeOp_3):
self.ui_area = op_param.area
self.swipe_orient = op_param.swipe_orient
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(
EnvAPIAbstract(
api_name="user_swipe",
kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist},
)
action = EnvAction(
action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
)
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
self.elem_list = elem_list
self.act_name = op_param.act_name
@ -165,10 +162,8 @@ class SelfLearnAndReflect(Action):
async def run_reflect(
self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
) -> AndroidActionOutput:
screenshot_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_after", "local_save_dir": task_dir}
)
screenshot_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_after", local_save_dir=task_dir)
)
if not screenshot_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
@ -226,8 +221,9 @@ class SelfLearnAndReflect(Action):
self.useless_list.append(resource_id)
last_act = "NONE"
if op_param.decision == Decision.BACK.value:
res = await env.step(EnvAPIAbstract(api_name="system_back"))
if res == ADB_EXEC_FAIL:
action = EnvAction(action_type=EnvActionType.SYSTEM_BACK)
obs, _, _, _, info = env.step(action)
if info["res"] == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
doc = op_param.documentation
doc_path = docs_dir.joinpath(f"{resource_id}.txt")

View file

@ -1 +1,2 @@
pyshine==0.0.9
opencv-python==4.6.0.66

View file

@ -77,7 +77,7 @@ class AndroidAssistant(Role):
async def react(self) -> Message:
self.round_count += 1
result = await super().react()
print(f"react result {result}")
logger.debug(f"react result {result}")
return result
async def _act(self) -> Message:

View file

@ -2,22 +2,20 @@
# -*- coding: utf-8 -*-
# @Desc : test case (imgs from appagent's)
import ast
import asyncio
import re
from pathlib import Path
from actions.parse_record_an import RECORD_PARSE_NODE
from prompts.operation_prompt import (
from examples.andriod_assistant.actions.parse_record_an import RECORD_PARSE_NODE
from examples.andriod_assistant.prompts.operation_prompt import (
long_press_doc_template,
refine_doc_suffix,
swipe_doc_template,
tap_doc_template,
text_doc_template,
)
from utils.schema import ActionOp, SwipeOp
from examples.andriod_assistant.utils.schema import ActionOp, SwipeOp
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.logs import logger
@ -62,7 +60,7 @@ async def manual_test():
prompt_template = swipe_doc_template
context = prompt_template.format(swipe_dir=swipe_dir, ui_element=swipe_area)
else:
print("Error occurs")
logger.error("Error occurs")
task_desc_path = TASK_DESC_PATH
task_desc = open(task_desc_path, "r").read()
@ -108,4 +106,3 @@ if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(manual_test())
loop.close()
print("OK")

View file

@ -1,15 +1,17 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : test on android emulator action. After Modify Role Test, this script is discarded.
import asyncio
import time
from pathlib import Path
from actions.manual_record import ManualRecord
from actions.parse_record import ParseRecord
from actions.screenshot_parse import ScreenshotParse
from actions.self_learn_and_reflect import SelfLearnAndReflect
from examples.andriod_assistant.actions.manual_record import ManualRecord
from examples.andriod_assistant.actions.parse_record import ParseRecord
from examples.andriod_assistant.actions.screenshot_parse import ScreenshotParse
from examples.andriod_assistant.actions.self_learn_and_reflect import (
SelfLearnAndReflect,
)
from metagpt.environment.android_env.android_env import AndroidEnv
TASK_PATH = Path("apps/Contacts")
@ -77,4 +79,3 @@ if __name__ == "__main__":
loop.run_until_complete(asyncio.gather(*test_action_list))
loop.close()
print("Finish")

View file

@ -0,0 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : shared constants for the android assistant example
# Sentinel string returned by the ADB execution helpers on command failure;
# callers compare step/observe results against it to detect a failed action.
ADB_EXEC_FAIL = "FAILED"