update andriod_assistant with obs/action space

This commit is contained in:
better629 2024-03-27 14:59:20 +08:00
parent cdb4d48191
commit 53d0b70fa0
8 changed files with 102 additions and 113 deletions

View file

@ -6,6 +6,7 @@ from pathlib import Path
import cv2
from examples.andriod_assistant.utils.const import ADB_EXEC_FAIL
from examples.andriod_assistant.utils.schema import (
ActionOp,
AndroidActionOutput,
@ -15,9 +16,13 @@ from examples.andriod_assistant.utils.schema import (
from examples.andriod_assistant.utils.utils import draw_bbox_multi, traverse_xml_tree
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.const import ADB_EXEC_FAIL
from metagpt.environment.android_env.android_env import AndroidEnv
from metagpt.environment.api.env_api import EnvAPIAbstract
from metagpt.environment.android_env.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
)
from metagpt.logs import logger
@ -53,19 +58,13 @@ class ManualRecord(Action):
step = 0
while True:
step += 1
screenshot_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_screenshot",
# kwargs={"ss_name": f"{demo_name}_{step}", "local_save_dir": self.screenshot_before_path}
kwargs={"ss_name": f"{step}", "local_save_dir": self.screenshot_before_path},
screenshot_path: Path = env.observe(
EnvObsParams(
obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{step}", local_save_dir=self.screenshot_before_path
)
)
xml_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_xml",
# kwargs={"xml_name": f"{demo_name}_{step}", "local_save_dir": self.xml_path}
kwargs={"xml_name": f"{step}", "local_save_dir": self.xml_path},
)
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{step}", local_save_dir=self.xml_path)
)
if not screenshot_path.exists() or not xml_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
@ -103,8 +102,8 @@ class ManualRecord(Action):
user_input = "xxx"
logger.info(
"Choose one of the following actions you want to perform on the current screen:\ntap, text, long_press,"
"swipe, stop",
"Choose one of the following actions you want to perform on the current screen:\n"
"tap, text, long_press, swipe, stop",
"blue",
)
@ -126,10 +125,8 @@ class ManualRecord(Action):
user_input = input()
tl, br = elem_list[int(user_input) - 1].bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
ret = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if ret == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(f"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n")
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
log_str = f"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n"
elif user_input.lower() == ActionOp.TEXT.value:
logger.info(
f"Which element do you want to input the text string? Choose a numeric tag from 1 to "
@ -143,8 +140,8 @@ class ManualRecord(Action):
user_input = ""
while not user_input:
user_input = input()
await env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": user_input}))
record_file.write(f'text({input_area}:sep:"{user_input}"):::{elem_list[int(input_area) - 1].uid}\n')
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=user_input)
log_str = f"text({input_area}:sep:'{user_input}'):::{elem_list[int(input_area) - 1].uid}\n"
elif user_input.lower() == ActionOp.LONG_PRESS.value:
logger.info(
f"Which element do you want to long press? Choose a numeric tag from 1 to {len(elem_list)}:", "blue"
@ -154,14 +151,12 @@ class ManualRecord(Action):
user_input = input()
tl, br = elem_list[int(user_input) - 1].bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
ret = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if ret == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(f"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n")
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
log_str = f"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n"
elif user_input.lower() == ActionOp.SWIPE.value:
logger.info(
"What is the direction of your swipe? Choose one from the following options:\nup, down, left,"
" right",
"What is the direction of your swipe? Choose one from the following options:\n"
"up, down, left, right",
"blue",
)
user_input = ""
@ -178,16 +173,20 @@ class ManualRecord(Action):
user_input = input()
tl, br = elem_list[int(user_input) - 1].bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
ret = await env.step(
EnvAPIAbstract(api_name="user_swipe", kwargs={"x": x, "y": y, "orient": swipe_dir})
)
if ret == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(f"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\n")
action = EnvAction(action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=swipe_dir)
log_str = f"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\n"
elif user_input.lower() == ActionOp.STOP.value:
record_file.write("stop\n")
record_file.close()
break
else:
break
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(log_str)
time.sleep(3)

View file

@ -10,6 +10,7 @@ from examples.andriod_assistant.prompts.assistant_prompt import (
screenshot_parse_template,
screenshot_parse_with_grid_template,
)
from examples.andriod_assistant.utils.const import ADB_EXEC_FAIL
from examples.andriod_assistant.utils.schema import (
AndroidActionOutput,
AndroidElement,
@ -34,9 +35,13 @@ from examples.andriod_assistant.utils.utils import (
)
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.const import ADB_EXEC_FAIL
from metagpt.environment.android_env.android_env import AndroidEnv
from metagpt.environment.api.env_api import EnvAPIAbstract
from metagpt.environment.android_env.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
)
from metagpt.utils.common import encode_image
@ -91,14 +96,11 @@ class ScreenshotParse(Action):
for path in [task_dir, docs_dir]:
if not path.exists():
path.mkdir(parents=True, exist_ok=True)
screenshot_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir}
)
screenshot_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir)
)
xml_path: Path = await env.observe(
EnvAPIAbstract(api_name="get_xml", kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir})
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
)
width, height = env.device_shape
if not screenshot_path.exists() or not xml_path.exists():
@ -150,41 +152,26 @@ class ScreenshotParse(Action):
if isinstance(op_param, TapOp):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
elif isinstance(op_param, TextOp):
res = await env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
elif isinstance(op_param, LongPressOp):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeOp_3):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(
EnvAPIAbstract(
api_name="user_swipe",
kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist},
)
action = EnvAction(
action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
)
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
elif isinstance(op_param, GridOp):
grid_on = True
elif isinstance(op_param, TapGridOp) or isinstance(op_param, LongPressGridOp):
x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols)
if isinstance(op_param, TapGridOp):
res = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
else:
# LongPressGridOp
res = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeGridOp):
start_x, start_y = area_to_xy(
op_param.start_area, op_param.start_subarea, env.width, env.height, env.rows, env.cols
@ -192,11 +179,14 @@ class ScreenshotParse(Action):
end_x, end_y = area_to_xy(
op_param.end_area, op_param.end_subarea, env.width, env.height, env.rows, env.cols
)
res = await env.step(
EnvAPIAbstract(api_name="user_swipe_to", kwargs={"start": (start_x, start_y), "end": (end_x, end_y)})
action = EnvAction(
action_type=EnvActionType.USER_SWIPE_TO, coord=(start_x, start_y), tgt_coord=(end_x, end_y)
)
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
if op_param.act_name != "grid":
grid_on = True

View file

@ -15,6 +15,7 @@ from examples.andriod_assistant.prompts.assistant_prompt import (
from examples.andriod_assistant.prompts.assistant_prompt import (
screenshot_parse_self_explore_template,
)
from examples.andriod_assistant.utils.const import ADB_EXEC_FAIL
from examples.andriod_assistant.utils.schema import (
ActionOp,
AndroidActionOutput,
@ -39,9 +40,13 @@ from examples.andriod_assistant.utils.utils import (
)
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.const import ADB_EXEC_FAIL
from metagpt.environment.android_env.android_env import AndroidEnv
from metagpt.environment.api.env_api import EnvAPIAbstract
from metagpt.environment.android_env.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
)
from metagpt.logs import logger
from metagpt.utils.common import encode_image
@ -71,13 +76,11 @@ class SelfLearnAndReflect(Action):
async def run_self_learn(
self, round_count: int, task_desc: str, last_act: str, task_dir: Path, env: AndroidEnv
) -> AndroidActionOutput:
screenshot_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir}
)
screenshot_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir)
)
xml_path: Path = await env.observe(
EnvAPIAbstract(api_name="get_xml", kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir})
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
)
if not screenshot_path.exists() or not xml_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
@ -116,7 +119,7 @@ class SelfLearnAndReflect(Action):
context = self_explore_template.format(task_description=task_desc, last_act=last_act)
node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64])
print(f"fill result:{node}")
logger.debug(f"fill result:{node}")
if "error" in node.content:
return AndroidActionOutput(action_state=RunState.FAIL)
prompt = node.compile(context=context, schema="json", mode="auto")
@ -132,31 +135,25 @@ class SelfLearnAndReflect(Action):
if isinstance(op_param, TapOp):
self.ui_area = op_param.area
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
elif isinstance(op_param, TextOp):
res = await env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
elif isinstance(op_param, LongPressOp):
self.ui_area = op_param.area
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeOp_3):
self.ui_area = op_param.area
self.swipe_orient = op_param.swipe_orient
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = await env.step(
EnvAPIAbstract(
api_name="user_swipe",
kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist},
)
action = EnvAction(
action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
)
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
self.elem_list = elem_list
self.act_name = op_param.act_name
@ -165,10 +162,8 @@ class SelfLearnAndReflect(Action):
async def run_reflect(
self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
) -> AndroidActionOutput:
screenshot_path: Path = await env.observe(
EnvAPIAbstract(
api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_after", "local_save_dir": task_dir}
)
screenshot_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_after", local_save_dir=task_dir)
)
if not screenshot_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
@ -226,8 +221,9 @@ class SelfLearnAndReflect(Action):
self.useless_list.append(resource_id)
last_act = "NONE"
if op_param.decision == Decision.BACK.value:
res = await env.step(EnvAPIAbstract(api_name="system_back"))
if res == ADB_EXEC_FAIL:
action = EnvAction(action_type=EnvActionType.SYSTEM_BACK)
obs, _, _, _, info = env.step(action)
if info["res"] == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
doc = op_param.documentation
doc_path = docs_dir.joinpath(f"{resource_id}.txt")

View file

@ -1 +1,2 @@
pyshine==0.0.9
opencv-python==4.6.0.66

View file

@ -77,7 +77,7 @@ class AndroidAssistant(Role):
async def react(self) -> Message:
self.round_count += 1
result = await super().react()
print(f"react result {result}")
logger.debug(f"react result {result}")
return result
async def _act(self) -> Message:

View file

@ -2,22 +2,20 @@
# -*- coding: utf-8 -*-
# @Desc : test case (imgs from appagent's)
import ast
import asyncio
import re
from pathlib import Path
from actions.parse_record_an import RECORD_PARSE_NODE
from prompts.operation_prompt import (
from examples.andriod_assistant.actions.parse_record_an import RECORD_PARSE_NODE
from examples.andriod_assistant.prompts.operation_prompt import (
long_press_doc_template,
refine_doc_suffix,
swipe_doc_template,
tap_doc_template,
text_doc_template,
)
from utils.schema import ActionOp, SwipeOp
from examples.andriod_assistant.utils.schema import ActionOp, SwipeOp
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.logs import logger
@ -62,7 +60,7 @@ async def manual_test():
prompt_template = swipe_doc_template
context = prompt_template.format(swipe_dir=swipe_dir, ui_element=swipe_area)
else:
print("Error occurs")
logger.error("Error occurs")
task_desc_path = TASK_DESC_PATH
task_desc = open(task_desc_path, "r").read()
@ -108,4 +106,3 @@ if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(manual_test())
loop.close()
print("OK")

View file

@ -1,15 +1,17 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : test on android emulator action. After Modify Role Test, this script is discarded.
import asyncio
import time
from pathlib import Path
from actions.manual_record import ManualRecord
from actions.parse_record import ParseRecord
from actions.screenshot_parse import ScreenshotParse
from actions.self_learn_and_reflect import SelfLearnAndReflect
from examples.andriod_assistant.actions.manual_record import ManualRecord
from examples.andriod_assistant.actions.parse_record import ParseRecord
from examples.andriod_assistant.actions.screenshot_parse import ScreenshotParse
from examples.andriod_assistant.actions.self_learn_and_reflect import (
SelfLearnAndReflect,
)
from metagpt.environment.android_env.android_env import AndroidEnv
TASK_PATH = Path("apps/Contacts")
@ -77,4 +79,3 @@ if __name__ == "__main__":
loop.run_until_complete(asyncio.gather(*test_action_list))
loop.close()
print("Finish")

View file

@ -0,0 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : shared constants for the android assistant example
# Sentinel string returned by the ADB execution helpers on command failure;
# callers compare step/observe results against it to detect a failed action.
ADB_EXEC_FAIL = "FAILED"