Update ScreenshotParse action node; add tests for 4 action nodes to test_for_an.py

This commit is contained in:
didi 2024-02-25 13:14:50 +08:00 committed by better629
parent 13975976d5
commit 26bbdef6c6
3 changed files with 86 additions and 83 deletions

View file

@ -19,13 +19,14 @@ from examples.andriod_assistant.utils.schema import (
OpLogItem,
RunState,
SwipeGridOp,
SwipeOp,
SwipeOp_3,
TapGridOp,
TapOp,
TextOp,
)
from examples.andriod_assistant.utils.utils import (
area_to_xy,
draw_grid,
draw_bbox_multi,
elem_bbox_to_xy,
screenshot_parse_extract,
@ -47,9 +48,9 @@ class ScreenshotParse(Action):
return ""
ui_doc = """
You also have access to the following documentations that describes the functionalities of UI
elements you can interact on the screen. These docs are crucial for you to determine the target of your
next action. You should always prioritize these documented elements for interaction:"""
You also have access to the following documentations that describes the functionalities of UI
elements you can interact on the screen. These docs are crucial for you to determine the target of your
next action. You should always prioritize these documented elements for interaction:"""
for i, elem in enumerate(elem_list):
doc_path = docs_idr.joinpath(f"{elem.uid}.txt")
if not doc_path.exists():
@ -78,23 +79,32 @@ next action. You should always prioritize these documented elements for interact
return ui_doc
async def run(
self,
round_count: int,
task_desc: str,
last_act: str,
task_dir: Path,
docs_dir: Path,
grid_on: bool,
env: AndroidEnv,
self,
round_count: int,
task_desc: str,
last_act: str,
task_dir: Path,
docs_dir: Path,
grid_on: bool,
env: AndroidEnv,
):
screenshot_path: Path = env.step(
for path in [task_dir, docs_dir]:
if not path.exists():
path.mkdir(parents=True, exist_ok=True)
screenshot_path: Path = env.observe(
EnvAPIAbstract(
api_name="get_screenshot", kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir}
api_name="get_screenshot",
kwargs={"ss_name": f"{round_count}_before", "local_save_dir": task_dir}
)
)
xml_path: Path = env.step(
EnvAPIAbstract(api_name="get_xml", kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir})
xml_path: Path = env.observe(
EnvAPIAbstract(
api_name="get_xml",
kwargs={"xml_name": f"{round_count}", "local_save_dir": task_dir}
)
)
width, height = env.device_shape
if not screenshot_path.exists() or not xml_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
@ -111,18 +121,23 @@ next action. You should always prioritize these documented elements for interact
bbox = e.bbox
center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
if dist <= config.get_other("min_dist"):
# TODO: min_dist is temporarily hard-coded to 30; switch back to the config lookup after the single-action test
# if dist <= config.get_other("min_dist"):
if dist <= 30:
close = True
break
if not close:
elem_list.append(elem)
screenshot_labeled_path = task_dir.joinpath(f"{task_dir}_{round_count}_labeled.png")
screenshot_labeled_path = task_dir.joinpath(f"{round_count}_labeled.png")
draw_bbox_multi(screenshot_path, screenshot_labeled_path, elem_list)
img_base64 = encode_image(screenshot_labeled_path)
parse_template = screenshot_parse_with_grid_template if grid_on else screenshot_parse_template
if grid_on:
rows, cols = draw_grid(screenshot_path, task_dir / f"{round_count}_grid.png")
ui_doc = self._makeup_ui_document(elem_list, docs_dir)
context = parse_template.format(ui_document=ui_doc, task_description=task_desc, last_act=last_act)
node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64])
@ -131,7 +146,7 @@ next action. You should always prioritize these documented elements for interact
return AndroidActionOutput(action_state=RunState.FAIL)
prompt = node.compile(context=context, schema="json", mode="auto")
OpLogItem(step=round_count, prompt=prompt, image=screenshot_labeled_path, response=node.content)
OpLogItem(step=round_count, prompt=prompt, image=str(screenshot_labeled_path), response=node.content)
op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on)
if op_param.param_state == RunState.FINISH:
@ -141,23 +156,24 @@ next action. You should always prioritize these documented elements for interact
if isinstance(op_param, TapOp):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = env.step(EnvAPIAbstract("system_tap", kwargs={"x": x, "y": y}))
res = env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
elif isinstance(op_param, TextOp):
res = env.step(EnvAPIAbstract("user_input", kwargs={"input_txt": op_param.input_str}))
res = env.step(EnvAPIAbstract(api_name="user_input", kwargs={"input_txt": op_param.input_str}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
elif isinstance(op_param, LongPressOp):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = env.step(EnvAPIAbstract("user_longpress", kwargs={"x": x, "y": y}))
res = env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
elif isinstance(op_param, SwipeOp):
elif isinstance(op_param, SwipeOp_3):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
res = env.step(
EnvAPIAbstract(
"user_swipe", kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist}
api_name="user_swipe",
kwargs={"x": x, "y": y, "orient": op_param.swipe_orient, "dist": op_param.dist}
)
)
if res == ADB_EXEC_FAIL:
@ -167,18 +183,19 @@ next action. You should always prioritize these documented elements for interact
elif isinstance(op_param, TapGridOp) or isinstance(op_param, LongPressGridOp):
x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols)
if isinstance(op_param, TapGridOp):
res = env.step(EnvAPIAbstract("system_tap", kwargs={"x": x, "y": y}))
res = env.step(EnvAPIAbstract(api_name="system_tap", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
else:
# LongPressGridOp
res = env.step(EnvAPIAbstract("user_longpress", kwargs={"x": x, "y": y}))
res = env.step(EnvAPIAbstract(api_name="user_longpress", kwargs={"x": x, "y": y}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
elif isinstance(op_param, SwipeGridOp):
start_x, start_y = area_to_xy(op_param.start_area, op_param.start_subarea)
end_x, end_y = area_to_xy(op_param.end_area, op_param.end_subarea)
res = env.step(EnvAPIAbstract("user_swipe_to", kwargs={"start": (start_x, start_y), "end": (end_x, end_y)}))
start_x, start_y = area_to_xy(op_param.start_area, op_param.start_subarea, width, height, rows, cols)
end_x, end_y = area_to_xy(op_param.end_area, op_param.end_subarea, width, height, rows, cols)
res = env.step(
EnvAPIAbstract(api_name="user_swipe_to", kwargs={"start": (start_x, start_y), "end": (end_x, end_y)}))
if res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)

View file

@ -7,6 +7,7 @@ from pathlib import Path
from actions.manual_record import ManualRecord
from actions.parse_record import ParseRecord
from actions.self_learn_and_reflect import SelfLearnAndReflect
from actions.screenshot_parse import ScreenshotParse
from metagpt.environment.android_env.android_env import AndroidEnv
TASK_PATH = Path("apps/Contacts")
@ -29,64 +30,49 @@ test_env_manual_learn_android = AndroidEnv(
test_manual_record = ManualRecord()
test_manual_parse = ParseRecord()
# Run against the emulator (virtual device)
# Check that the different Action Nodes produce the expected results (Action Node)
test_env_screenshot_parse_android = AndroidEnv(
device_id="emulator-5554",
xml_dir=Path("/sdcard"),
screenshot_dir=Path("/sdcard/Pictures/Screenshots"),
)
test_screenshot_parse = ScreenshotParse()
if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(
test_action_list = [
test_self_learning.run(
round_count=20,
task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
last_act="",
task_dir=TASK_PATH / "demos" / f"self_learning_{DEMO_NAME}",
docs_dir=SELF_EXPLORE_DOC_PATH,
env=test_env_self_learn_android
),
test_manual_record.run(
demo_name=DEMO_NAME,
task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}",
task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
env=test_env_manual_learn_android
),
test_manual_parse.run(
app_name="Contacts",
demo_name="1708753998.5757847",
task_dir=TASK_PATH / "demos" / f"manual_record_1708753998.5757847", # 修要修改
docs_dir=PARSE_RECORD_DOC_PATH, # 需要修改
demo_name=DEMO_NAME,
task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}", # 修要修改
docs_dir=PARSE_RECORD_DOC_PATH, # 需要修改
env=test_env_manual_learn_android
))
# test_action_list = [
# # test_self_learning.run(
# # round_count=20,
# # task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
# # last_act="",
# # task_dir= TASK_PATH / "demos" / f"self_learning_{DEMO_NAME}",
# # docs_dir=DOC_PATH,
# # env=test_env_self_learn_android
# # ),
# test_manual_record.run(
# demo_name=DEMO_NAME,
# task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}",
# task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
# env=test_env_manual_learn_android
# ),
# test_manual_parse.run(
# app_name="Contacts",
# demo_name=DEMO_NAME,
# task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}", # 修要修改
# docs_dir=PARSE_RECORD_DOC_PATH, # 需要修改
# env=test_env_manual_learn_android
# )
# ]
# test_action_list = [
# test_self_learning.run(
# round_count=20,
# task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
# last_act="",
# task_dir=TASK_PATH,
# docs_dir=DOC_PATH,
# env=test_env_self_learn_android
# ),
# test_manual_record.run(
# demo_name=DEMO_NAME,
# task_dir=TASK_PATH,
# env=test_env_manual_learn_android
# ),
# test_manual_parse.run(
# app_name="Contacts",
# demo_name=DEMO_NAME,
# task_dir=TASK_PATH,
# docs_dir=DOC_PATH,
# env=test_env_manual_learn_android
# )
# ]
# loop.run_until_complete(asyncio.gather(*test_action_list))
),
test_screenshot_parse.run(
round_count=20,
task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
last_act="",
task_dir=TASK_PATH / f"act_{DEMO_NAME}",
docs_dir=PARSE_RECORD_DOC_PATH,
env=test_env_screenshot_parse_android,
grid_on=False
)
]
loop.run_until_complete(asyncio.gather(*test_action_list))
loop.close()
print("Finish")

View file

@ -163,7 +163,7 @@ def area_to_xy(area: int, subarea: str, width: int, height: int, rows: int, cols
return x, y
def elem_bbox_to_xy(bbox: tuple[tuple[int, int], tuple[int, int]]) -> tuple[int, int]:
    """Return the integer center point of a bounding box.

    Args:
        bbox: A pair of ``(x, y)`` points; the code unpacks them as
            top-left and bottom-right corners.

    Returns:
        The ``(x, y)`` midpoint of the two corners, computed with floor
        division on each axis.
    """
    tl, br = bbox
    x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
    return x, y