Merge pull request #1135 from better629/feat_android

Feat android assistant agent
This commit is contained in:
Alexander Wu 2024-04-04 21:31:41 +08:00 committed by GitHub
commit 9e26a40c89
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
42 changed files with 2345 additions and 29 deletions

View file

@ -0,0 +1,2 @@
pyshine==0.0.9
opencv-python==4.6.0.66

View file

@ -0,0 +1,71 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : the entry of android assistant including learning and acting stage
# See the usage README inside `metagpt/ext/android_assistant`
# README see `metagpt/ext/android_assistant/README.md`
import asyncio
from pathlib import Path
import typer
from metagpt.config2 import config
from metagpt.environment.android.android_env import AndroidEnv
from metagpt.ext.android_assistant.roles.android_assistant import AndroidAssistant
from metagpt.team import Team
app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
@app.command("", help="Run an Android Assistant")
def startup(
task_desc: str = typer.Argument(help="the task description you want the android assistant to learn or act on"),
n_round: int = typer.Option(default=20, help="The max round to do an app operation task."),
stage: str = typer.Option(default="learn", help="stage: learn / act"),
mode: str = typer.Option(default="auto", help="mode: auto / manual, when stage=learn"),
app_name: str = typer.Option(default="demo", help="the name of app you want to run"),
investment: float = typer.Option(default=5.0, help="Dollar amount to invest in the AI company."),
refine_doc: bool = typer.Option(
default=False, help="Refine existing operation docs based on the latest observation if True."
),
min_dist: int = typer.Option(
default=30, help="The minimum distance between elements to prevent overlapping during the labeling process."
),
android_screenshot_dir: str = typer.Option(
default="/sdcard/Pictures/Screenshots",
help="The path to store screenshots on android device. Make sure it exists.",
),
android_xml_dir: str = typer.Option(
default="/sdcard",
help="The path to store xml files for determining UI elements localtion. Make sure it exists.",
),
device_id: str = typer.Option(default="emulator-5554", help="The Android device_id"),
):
config.extra = {
"stage": stage,
"mode": mode,
"app_name": app_name,
"task_desc": task_desc,
"refine_doc": refine_doc,
"min_dist": min_dist,
"android_screenshot_dir": android_screenshot_dir,
"android_xml_dir": android_xml_dir,
"device_id": device_id,
}
team = Team(
env=AndroidEnv(
device_id=device_id,
xml_dir=Path(android_xml_dir),
screenshot_dir=Path(android_screenshot_dir),
)
)
team.hire([AndroidAssistant(output_root_dir=Path(__file__).parent)])
team.invest(investment)
team.run_project(idea=task_desc)
asyncio.run(team.run(n_round=n_round))
if __name__ == "__main__":
app()

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : entry of Stanford Town(ST/st) game
# README see `metagpt/ext/stanford_town/README.md`
import asyncio
from typing import Optional

View file

@ -450,7 +450,6 @@ class ActionNode:
self, schema, mode, images: Optional[Union[str, list[str]]] = None, timeout=USE_CONFIG_TIMEOUT, exclude=None
):
prompt = self.compile(context=self.context, schema=schema, mode=mode, exclude=exclude)
if schema != "raw":
mapping = self.get_mapping(mode, exclude=exclude)
class_name = f"{self.key}_AN"

View file

@ -75,6 +75,7 @@ class Config(CLIParams, YamlModel):
iflytek_api_key: str = ""
azure_tts_subscription_key: str = ""
azure_tts_region: str = ""
_extra: dict = dict() # extra config dict
@classmethod
def from_home(cls, path):
@ -127,6 +128,14 @@ class Config(CLIParams, YamlModel):
self.reqa_file = reqa_file
self.max_auto_summarize_code = max_auto_summarize_code
@property
def extra(self):
return self._extra
@extra.setter
def extra(self, value: dict):
self._extra = value
def get_openai_llm(self) -> Optional[LLMConfig]:
"""Get OpenAI LLMConfig by name. If no OpenAI, raise Exception"""
if self.llm.api_type == LLMType.OPENAI:
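The new `extra` property gives runtime code a scratch space for ad-hoc settings (here, the Android assistant's CLI options) without extending the YAML schema. A minimal usage sketch, assuming the default importable `config` object:

```python
from metagpt.config2 import config

# run_assistant.py packs its CLI options into the shared config object...
config.extra = {"stage": "learn", "mode": "auto", "min_dist": 30}

# ...and downstream actions read them back with plain dict access.
min_dist = config.extra.get("min_dist", 30)
print(min_dist)
```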

View file

@ -8,6 +8,8 @@ from metagpt.environment.android.android_ext_env import AndroidExtEnv
from metagpt.environment.base_env import Environment
class AndroidEnv(Environment, AndroidExtEnv):
class AndroidEnv(AndroidExtEnv, Environment):
"""in order to use actual `reset`&`observe`, inherited order: AndroidExtEnv, Environment"""
rows: int = Field(default=0, description="rows of a grid on the screenshot")
cols: int = Field(default=0, description="cols of a grid on the screenshot")

View file

@ -9,8 +9,14 @@ from typing import Any, Optional
from pydantic import Field
from metagpt.environment.android.const import ADB_EXEC_FAIL
from metagpt.environment.android.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
EnvObsValType,
)
from metagpt.environment.base_env import ExtEnv, mark_as_readable, mark_as_writeable
from metagpt.environment.base_env_space import BaseEnvAction, BaseEnvObsParams
class AndroidExtEnv(ExtEnv):
@ -20,26 +26,71 @@ class AndroidExtEnv(ExtEnv):
width: int = Field(default=720, description="device screen width")
height: int = Field(default=1080, description="device screen height")
def __init__(self, **data: Any):
super().__init__(**data)
device_id = data.get("device_id")
if device_id:
devices = self.list_devices()
if device_id not in devices:
raise RuntimeError(f"device-id: {device_id} not found")
(width, height) = self.device_shape
self.width = data.get("width", width)
self.height = data.get("height", height)
self.create_device_path(self.screenshot_dir)
self.create_device_path(self.xml_dir)
def reset(
self,
*,
seed: Optional[int] = None,
options: Optional[dict[str, Any]] = None,
) -> tuple[dict[str, Any], dict[str, Any]]:
super().reset(seed=seed, options=options)
obs = self._get_obs()
return obs, {}
def _get_obs(self) -> dict[str, EnvObsValType]:
pass
def observe(self, obs_params: Optional[BaseEnvObsParams] = None) -> Any:
pass
def observe(self, obs_params: Optional[EnvObsParams] = None) -> Any:
obs_type = obs_params.obs_type if obs_params else EnvObsType.NONE
if obs_type == EnvObsType.NONE:
pass
elif obs_type == EnvObsType.GET_SCREENSHOT:
obs = self.get_screenshot(ss_name=obs_params.ss_name, local_save_dir=obs_params.local_save_dir)
elif obs_type == EnvObsType.GET_XML:
obs = self.get_xml(xml_name=obs_params.xml_name, local_save_dir=obs_params.local_save_dir)
return obs
def step(self, action: BaseEnvAction) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
pass
def step(self, action: EnvAction) -> tuple[dict[str, Any], float, bool, bool, dict[str, Any]]:
res = self._execute_env_action(action)
def __init__(self, **data: Any):
super().__init__(**data)
if data.get("device_id"):
(width, height) = self.device_shape
self.width = data.get("width", width)
self.height = data.get("height", height)
obs = {}
ret = (obs, 1.0, False, False, {"res": res})
return ret
def _execute_env_action(self, action: EnvAction):
action_type = action.action_type
res = None
if action_type == EnvActionType.NONE:
pass
elif action_type == EnvActionType.SYSTEM_BACK:
res = self.system_back()
elif action_type == EnvActionType.SYSTEM_TAP:
res = self.system_tap(x=action.coord[0], y=action.coord[1])
elif action_type == EnvActionType.USER_INPUT:
res = self.user_input(input_txt=action.input_txt)
elif action_type == EnvActionType.USER_LONGPRESS:
res = self.user_longpress(x=action.coord[0], y=action.coord[1])
elif action_type == EnvActionType.USER_SWIPE:
res = self.user_swipe(x=action.coord[0], y=action.coord[1], orient=action.orient, dist=action.dist)
elif action_type == EnvActionType.USER_SWIPE_TO:
res = self.user_swipe_to(start=action.coord, end=action.tgt_coord)
return res
@property
def adb_prefix_si(self):
@ -57,12 +108,19 @@ class AndroidExtEnv(ExtEnv):
return f"adb -s {self.device_id} "
def execute_adb_with_cmd(self, adb_cmd: str) -> str:
adb_cmd = adb_cmd.replace("\\", "/")
res = subprocess.run(adb_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
exec_res = ADB_EXEC_FAIL
if not res.returncode:
exec_res = res.stdout.strip()
return exec_res
def create_device_path(self, folder_path: Path):
adb_cmd = f"{self.adb_prefix_shell} mkdir {folder_path} -p"
res = self.execute_adb_with_cmd(adb_cmd)
if res == ADB_EXEC_FAIL:
raise RuntimeError(f"create device path: {folder_path} failed")
@property
def device_shape(self) -> tuple[int, int]:
adb_cmd = f"{self.adb_prefix_shell} wm size"

View file

@ -0,0 +1,92 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc :
from pathlib import Path
from typing import Union
import numpy as np
import numpy.typing as npt
from gymnasium import spaces
from pydantic import ConfigDict, Field, field_validator
from metagpt.environment.base_env_space import (
BaseEnvAction,
BaseEnvActionType,
BaseEnvObsParams,
BaseEnvObsType,
)
class EnvActionType(BaseEnvActionType):
NONE = 0 # no action to run, just get observation
SYSTEM_BACK = 1
SYSTEM_TAP = 2
USER_INPUT = 3
USER_LONGPRESS = 4
USER_SWIPE = 5
USER_SWIPE_TO = 6
class EnvAction(BaseEnvAction):
model_config = ConfigDict(arbitrary_types_allowed=True)
action_type: int = Field(default=EnvActionType.NONE, description="action type")
coord: npt.NDArray[np.int64] = Field(
default_factory=lambda: np.zeros(2, dtype=np.int64), description="operation coordinate"
)
tgt_coord: npt.NDArray[np.int64] = Field(
default_factory=lambda: np.zeros(2, dtype=np.int64), description="target operation coordinate"
)
input_txt: str = Field(default="", description="user input text")
orient: str = Field(default="up", description="swipe orient")
dist: str = Field(default="medium", description="swipe dist")
@field_validator("coord", "tgt_coord", mode="before")
@classmethod
def check_coord(cls, coord) -> npt.NDArray[np.int64]:
if not isinstance(coord, np.ndarray):
return np.array(coord)
return coord
class EnvObsType(BaseEnvObsType):
NONE = 0 # get whole observation from env
GET_SCREENSHOT = 1
GET_XML = 2
class EnvObsParams(BaseEnvObsParams):
model_config = ConfigDict(arbitrary_types_allowed=True)
obs_type: int = Field(default=EnvObsType.NONE, description="observation type")
ss_name: str = Field(default="", description="screenshot file name")
xml_name: str = Field(default="", description="xml file name")
local_save_dir: Union[str, Path] = Field(default="", description="local dir to save file")
EnvObsValType = str
def get_observation_space() -> spaces.Dict:
space = spaces.Dict({"screenshot": spaces.Text(256), "xml": spaces.Text(256)})
return space
def get_action_space(device_shape: tuple[int, int]) -> spaces.Dict:
space = spaces.Dict(
{
"action_type": spaces.Discrete(len(EnvActionType)),
"coord": spaces.Box(
np.array([0, 0], dtype=np.int64), np.array([device_shape[0], device_shape[1]], dtype=np.int64)
),
"tgt_coord": spaces.Box(
np.array([0, 0], dtype=np.int64), np.array([device_shape[0], device_shape[1]], dtype=np.int64)
),
"input_txt": spaces.Text(256),
"orient": spaces.Text(16),
"dist": spaces.Text(16),
}
)
return space
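The action and observation models above can also be exercised without a device; the `coord` validator accepts plain sequences and normalizes them to `np.ndarray`. A small self-contained sketch:

```python
import numpy as np

from metagpt.environment.android.env_space import (
    EnvAction,
    EnvActionType,
    EnvObsParams,
    EnvObsType,
    get_action_space,
    get_observation_space,
)

# Plain tuples are accepted for coordinates and normalized by the validator.
swipe = EnvAction(
    action_type=EnvActionType.USER_SWIPE_TO, coord=(360, 900), tgt_coord=(360, 300)
)
assert isinstance(swipe.coord, np.ndarray)

# Observation requests are parameter objects rather than positional arguments.
obs_req = EnvObsParams(
    obs_type=EnvObsType.GET_XML, xml_name="step_1", local_save_dir="/tmp/xmls"
)

# The gymnasium spaces describe what a learning loop may emit/receive.
action_space = get_action_space(device_shape=(720, 1080))
obs_space = get_observation_space()
print(action_space["coord"], obs_space["xml"], obs_req.obs_type)
```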

View file

@ -18,11 +18,11 @@ class EnvAPIAbstract(BaseModel):
class EnvAPIRegistry(BaseModel):
"""the registry to store environment w&r api/interface"""
registry: dict[str, dict[str, Union[dict, Any, str]]] = Field(default=dict(), exclude=True)
registry: dict[str, Callable] = Field(default=dict(), exclude=True)
def get(self, api_name: str):
if api_name not in self.registry:
raise ValueError
raise KeyError(f"api_name: {api_name} not found")
return self.registry.get(api_name)
def __getitem__(self, api_name: str) -> Callable:

View file

@ -0,0 +1,118 @@
# MetaGPT Android Assistant
The MetaGPT Android Assistant is an intelligent assistance tool driven by a multi-modal large language model and built on the MetaGPT framework. It can learn users' daily usage patterns on its own and automatically complete various application operations according to user instructions, freeing users' hands entirely.
Next, we will introduce the functions of the MetaGPT Android Assistant and how to use it.
## Features
The MetaGPT Android Assistant operates in two main stages: learning and automatic execution. Below, we introduce its specific features in each of these two stages.
### Learning Stage
By learning from human demonstrations or by exploring apps based on human instructions, the MetaGPT Android Assistant can learn the functionality of apps and generate corresponding operation documents for use in the subsequent "automatic execution" stage. Approximately 20 rounds of exploration for any given task objective can significantly improve performance.
By setting `stage` to `learn`, you can ask the Android Assistant to enter the learning stage. By setting `mode` to `auto`, you can instruct the Android Assistant to learn through automatic exploration; by setting `mode` to `manual`, you can instruct it to learn from a human demonstration. In the usage section, we provide detailed explanations of the script parameters. You can experiment with the automatic exploration and manual demonstration modes on the "Messenger" app with the following commands:
```bash
cd examples/android_assistant
python run_assistant.py "Send 'When will we release this feature?' to +86 8888888" --stage "learn" --mode "auto or manual" --app-name "Messenger"
```
#### Learning Based on Human Demonstration
When the Android Assistant performs self-exploration during the learning stage, you can keep your hands free. However, when it learns from your demonstration, you need to follow the prompts in the terminal so that the Android Assistant can accurately record your operations.
A possible example is as follows:
```bash
cd examples/android_assistant
python run_assistant.py "Send 'When will we release this feature?' to +86 8888888" --stage "learn" --mode "manual" --app-name "Messenger"
```
After running this command, you will first see a screenshot of an Android screen that has been marked at various interactive locations, as shown in the figure below:
<img src="./resources/manual_example.png" width="30%">
After noting the location you want to operate on, a prompt similar to the one below will appear in the terminal. Reply to it to direct the Android Assistant to learn your demonstrated action:
```bash
| INFO | examples.android_assistant.actions.manual_record:run:96 - Which element do you want to tap? Choose a numeric tag from 1 to 11:
user_input: 8
| INFO | examples.android_assistant.actions.manual_record:run:81 - Choose one of the following actions you want to perform on the current screen:
tap, text, long_press, swipe, stop
user_input: tap
```
### Automatic Execution Stage
After the Android Assistant completes the learning stage, you can command it to complete tasks on the phone through text descriptions. With the operation documents produced in the learning stage loaded, the Android Assistant has richer prior knowledge and further enhanced execution capabilities.
You can instruct the Android Assistant to send messages in the "Messenger" app with the following command:
```bash
python run_assistant.py "Send 'When will we release this feature?' to +86 8888888" --stage "act" --mode "auto or manual" --app-name "Messenger"
```
Specifically, by selecting `auto` for `mode`, the Android assistant will employ the operational records compiled through self-exploration. Alternatively, if `manual` is chosen as the `mode`, the Android assistant will leverage the operation manuals accrued from learning via human demonstration.
## Installation
To use the Android Assistant, you first need to meet the following conditions:
1. Complete the installation of the MetaGPT environment.
2. Install [Android Debug Bridge (ADB)](https://developer.android.com/tools/adb?hl=zh-cn) on your PC, which enables interaction between your PC and Android devices.
3. Install Android Studio and within it, install the Android emulator to provide an environment for the Android Assistant to learn and execute. For information on how to install the Android emulator, refer to [Quick Installation of Android Studio & Emulator](https://docs.expo.dev/workflow/android-studio-emulator/).
4. (Optional) Connect your Android device to the USB port of your PC, which can also provide an environment for the Android Assistant to learn and execute.
Note ⚠️: When operating with the Android emulator, we use the Medium Phone device profile; first-time users are recommended to use the same model.
After completing these operations, you can enter the following command to check if ADB is installed successfully and if the Android device is connected:
```bash
adb devices
```
## Usage
The MetaGPT Android Assistant is implemented within the MetaGPT framework as a `Role` together with multiple `Action`s. You can run it by executing the `run_assistant.py` script. The specific parameters of this script are described as follows:
```text
Usage: run_assistant.py [OPTIONS] TASK_DESC
Run an Android Assistant
Arguments:
TASK_DESC the task description you want the android assistant to learn or
act on [required]
Options:
--n-round INTEGER The max round to do an app operation task.
[default: 20]
--stage TEXT stage: learn / act [default: learn]
--mode TEXT mode: auto / manual, when stage=learn
[default: auto]
--app-name TEXT the name of app you want to run [default:
demo]
--investment FLOAT Dollar amount to invest in the AI company.
[default: 5.0]
--refine-doc / --no-refine-doc Refine existing operation docs based on the
latest observation if True. [default: no-
refine-doc]
--min-dist INTEGER The minimum distance between elements to
prevent overlapping during the labeling
process. [default: 30]
--android-screenshot-dir TEXT The path to store screenshots on android
device. Make sure it exists. [default:
/sdcard/Pictures/Screenshots]
--android-xml-dir TEXT The path to store xml files for determining
UI element locations. Make sure it exists.
[default: /sdcard]
--device-id TEXT The Android device_id [default:
emulator-5554]
--help Show this message and exit.
```
## Acknowledgements
The MetaGPT Android Assistant draws on ideas and code from the [AppAgent](https://github.com/mnotgod96/AppAgent) project. We thank the developers of the AppAgent project.
### Citation
```bib
@misc{yang2023appagent,
title={AppAgent: Multimodal Agents as Smartphone Users},
author={Chi Zhang and Zhao Yang and Jiaxuan Liu and Yucheng Han and Xin Chen and Zebiao Huang and Bin Fu and Gang Yu},
year={2023},
eprint={2312.13771},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```

View file

@ -0,0 +1,113 @@
# MetaGPT Android Assistant
The MetaGPT Android Assistant is an intelligent assistance tool driven by a multi-modal large language model and built on the MetaGPT framework.
It can learn users' daily usage patterns on its own and automatically complete various application operations according to user instructions, freeing users' hands entirely.
Next, we introduce the features of the MetaGPT Android Assistant and how to use it.
## Features
The MetaGPT Android Assistant operates in two main stages: self-learning and automatic execution. Below, we introduce its specific features in each stage.
### Self-Learning Stage
By learning from human demonstrations or by exploring apps based on human instructions, the MetaGPT Android Assistant can learn the functionality of apps and generate corresponding operation documents for use in the subsequent "automatic execution" stage. Approximately 20 rounds of exploration for any given task objective can significantly improve performance.
Setting `stage` to `learn` puts the Android Assistant into the self-learning stage. Setting `mode` to `auto` makes it learn through automatic exploration, while setting `mode` to `manual` makes it learn from a human demonstration. The usage section explains the script parameters in detail.
You can experiment with the automatic exploration and manual demonstration modes on the "Messenger" app with the following commands:
```bash
cd examples/android_assistant
python run_assistant.py "Send 'When will we release this feature?' to +86 8888888" --stage "learn" --mode "auto or manual" --app-name "Messenger"
```
#### Learning Based on Human Demonstration
When the Android Assistant performs self-exploration during the self-learning stage, you can keep your hands free. However, when it learns from your demonstration, you need to follow the prompts in the terminal so that it can accurately record your operations.
A possible example is as follows:
```bash
cd examples/android_assistant
python run_assistant.py "Send 'When will we release this feature?' to +86 8888888" --stage "learn" --mode "manual" --app-name "Messenger"
```
After running this command, you will first see a screenshot of an Android screen with its interactive locations labeled, as shown below:
<img src="./resources/manual_example.png" width="30%">
After noting the location you want to operate on, a prompt similar to the one below will appear in the terminal. Reply to it to direct the Android Assistant to learn your demonstrated action:
```bash
| INFO | examples.android_assistant.actions.manual_record:run:96 - Which element do you want to tap? Choose a numeric tag from 1 to 11:
user_input: 8
| INFO | examples.android_assistant.actions.manual_record:run:81 - Choose one of the following actions you want to perform on the current screen:
tap, text, long_press, swipe, stop
user_input: tap
```
### Automatic Execution Stage
After the Android Assistant completes the self-learning stage, you can direct it to complete tasks on the phone through text descriptions. With the operation documents produced in the self-learning stage loaded, the Android Assistant has richer prior knowledge and further enhanced execution capabilities.
You can instruct the Android Assistant to send messages in the "Messenger" app with the following command:
```bash
python run_assistant.py "Send 'When will we release this feature?' to +86 8888888" --stage "act" --mode "auto or manual" --app-name "Messenger"
```
If `mode` is set to `auto`, the Android Assistant uses the operation documents accumulated during self-exploration; if `mode` is set to `manual`, it uses the operation documents accumulated from learning via human demonstration.
## Installation
To use the Android Assistant, you first need to meet the following conditions:
1. Complete the installation of the MetaGPT environment.
2. Install [Android Debug Bridge (ADB)](https://developer.android.com/tools/adb?hl=zh-cn) on your PC; ADB enables interaction between your PC and Android devices.
3. Install Android Studio and, within it, install the Android emulator to provide an environment for the Android Assistant to learn and execute. For how to install the Android emulator, refer to [Quick Installation of Android Studio & Emulator](https://dev.weixin.qq.com/docs/framework/dev/framework/env/android-simulator.html).
4. (Optional) Connect your Android device to the USB port of your PC, which can also provide an environment for the Android Assistant to learn and execute.
Note ⚠️: When operating with the Android emulator, we use the Medium Phone device profile; first-time users are recommended to use the same model.
After completing these steps, you can enter the following command to check whether ADB is installed successfully and whether the Android device is connected:
```bash
adb devices
```
## Usage
The MetaGPT Android Assistant is implemented within the MetaGPT framework as a `Role` together with multiple `Action`s. You can run it by executing the `run_assistant.py` script. The script's parameters are described as follows:
```text
Usage: run_assistant.py [OPTIONS] TASK_DESC
Run an Android Assistant
Arguments:
TASK_DESC the task description you want the android assistant to learn or
act on [required]
Options:
--n-round INTEGER The max round to do an app operation task.
[default: 20]
--stage TEXT stage: learn / act [default: learn]
--mode TEXT mode: auto / manual, when stage=learn
[default: auto]
--app-name TEXT the name of app you want to run [default:
demo]
--investment FLOAT Dollar amount to invest in the AI company.
[default: 5.0]
--refine-doc / --no-refine-doc Refine existing operation docs based on the
latest observation if True. [default: no-
refine-doc]
--min-dist INTEGER The minimum distance between elements to
prevent overlapping during the labeling
process. [default: 30]
--android-screenshot-dir TEXT The path to store screenshots on android
device. Make sure it exists. [default:
/sdcard/Pictures/Screenshots]
--android-xml-dir TEXT The path to store xml files for determining
UI element locations. Make sure it exists.
[default: /sdcard]
--device-id TEXT The Android device_id [default:
emulator-5554]
--help Show this message and exit.
```
## Acknowledgements
The MetaGPT Android Assistant draws on ideas and code from the [AppAgent](https://github.com/mnotgod96/AppAgent) project. We thank the developers of the AppAgent project.
### Citation
```bib
@misc{yang2023appagent,
title={AppAgent: Multimodal Agents as Smartphone Users},
author={Chi Zhang and Zhao Yang and Jiaxuan Liu and Yucheng Han and Xin Chen and Zebiao Huang and Bin Fu and Gang Yu},
year={2023},
eprint={2312.13771},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```

View file

@ -0,0 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc :

View file

@ -0,0 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc :

View file

@ -0,0 +1,168 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : manual record user interaction in stage=learn & mode=manual, LIKE scripts/step_recorder.py
import time
from pathlib import Path
import cv2
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.environment.android.android_env import AndroidEnv
from metagpt.environment.android.const import ADB_EXEC_FAIL
from metagpt.environment.android.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
)
from metagpt.ext.android_assistant.utils.schema import (
ActionOp,
AndroidActionOutput,
RunState,
SwipeOp,
)
from metagpt.ext.android_assistant.utils.utils import (
draw_bbox_multi,
elem_list_from_xml_tree,
)
from metagpt.logs import logger
class ManualRecord(Action):
"""do a human operation on the screen with human input"""
name: str = "ManualRecord"
useless_list: list[str] = [] # store useless elements uid
record_path: Path = ""
task_desc_path: Path = ""
screenshot_before_path: Path = ""
screenshot_after_path: Path = ""
xml_path: Path = ""
async def run(self, task_desc: str, task_dir: Path, env: AndroidEnv):
self.record_path = Path(task_dir) / "record.txt"
self.task_desc_path = Path(task_dir) / "task_desc.txt"
self.screenshot_before_path = Path(task_dir) / "raw_screenshots"
self.screenshot_after_path = Path(task_dir) / "labeled_screenshots"
self.xml_path = Path(task_dir) / "xml"
for path in [self.screenshot_before_path, self.screenshot_after_path, self.xml_path]:
path.mkdir(parents=True, exist_ok=True)
self.record_path.write_text("")
record_file = open(self.record_path, "w")
self.task_desc_path.write_text(task_desc)
step = 0
extra_config = config.extra
while True:
step += 1
screenshot_path: Path = env.observe(
EnvObsParams(
obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{step}", local_save_dir=self.screenshot_before_path
)
)
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{step}", local_save_dir=self.xml_path)
)
if not screenshot_path.exists() or not xml_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
elem_list = elem_list_from_xml_tree(xml_path, self.useless_list, extra_config.get("min_dist", 30))
screenshot_labeled_path = Path(self.screenshot_after_path).joinpath(f"{step}_labeled.png")
labeled_img = draw_bbox_multi(screenshot_path, screenshot_labeled_path, elem_list)
cv2.namedWindow("image", cv2.WINDOW_NORMAL)
cv2.imshow("image", labeled_img)
cv2.waitKey(0)
cv2.destroyAllWindows()
user_input = "xxx"
logger.info(
"Choose one of the following actions you want to perform on the current screen:\n"
"tap, text, long_press, swipe, stop"
)
while (
user_input.lower() != ActionOp.TAP.value
and user_input.lower() != ActionOp.TEXT.value
and user_input.lower() != ActionOp.LONG_PRESS.value
and user_input.lower() != ActionOp.SWIPE.value
and user_input.lower() != ActionOp.STOP.value
):
user_input = input("user_input: ")
if user_input.lower() == ActionOp.TAP.value:
logger.info(f"Which element do you want to tap? Choose a numeric tag from 1 to {len(elem_list)}:")
user_input = "xxx"
while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:
user_input = input("user_input: ")
tl, br = elem_list[int(user_input) - 1].bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
log_str = f"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n"
elif user_input.lower() == ActionOp.TEXT.value:
logger.info(
f"Which element do you want to input the text string? Choose a numeric tag from 1 to "
f"{len(elem_list)}:"
)
input_area = "xxx"
while not input_area.isnumeric() or int(input_area) > len(elem_list) or int(input_area) < 1:
input_area = input("user_input: ")
logger.info("Enter your input text below:")
user_input = ""
while not user_input:
user_input = input("user_input: ")
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=user_input)
log_str = f"text({input_area}:sep:'{user_input}'):::{elem_list[int(input_area) - 1].uid}\n"
elif user_input.lower() == ActionOp.LONG_PRESS.value:
logger.info(
f"Which element do you want to long press? Choose a numeric tag from 1 to {len(elem_list)}:"
)
user_input = "xxx"
while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:
user_input = input("user_input: ")
tl, br = elem_list[int(user_input) - 1].bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
log_str = f"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n"
elif user_input.lower() == ActionOp.SWIPE.value:
logger.info(
"What is the direction of your swipe? Choose one from the following options:\n"
"up, down, left, right"
)
user_input = ""
while (
user_input != SwipeOp.UP.value
and user_input != SwipeOp.DOWN.value
and user_input != SwipeOp.LEFT.value
and user_input != SwipeOp.RIGHT.value
):
user_input = input("user_input: ")
swipe_dir = user_input
logger.info(f"Which element do you want to swipe? Choose a numeric tag from 1 to {len(elem_list)}:")
while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:
user_input = input("user_input: ")
tl, br = elem_list[int(user_input) - 1].bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
action = EnvAction(action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=swipe_dir)
log_str = f"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\n"
elif user_input.lower() == ActionOp.STOP.value:
record_file.write("stop\n")
record_file.close()
break
else:
break
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(log_str)
time.sleep(1)
return AndroidActionOutput(action_state=RunState.SUCCESS)

View file

@ -0,0 +1,137 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : parse record to generate learned standard operations in stage=learn & mode=manual,
# LIKE scripts/document_generation.py
import ast
import re
from pathlib import Path
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.ext.android_assistant.actions.parse_record_an import RECORD_PARSE_NODE
from metagpt.ext.android_assistant.prompts.operation_prompt import (
long_press_doc_template,
refine_doc_suffix,
swipe_doc_template,
tap_doc_template,
text_doc_template,
)
from metagpt.ext.android_assistant.utils.schema import (
ActionOp,
AndroidActionOutput,
RecordLogItem,
RunState,
SwipeOp,
)
from metagpt.logs import logger
from metagpt.utils.common import encode_image
class ParseRecord(Action):
name: str = "ParseRecord"
record_path: Path = ""
task_desc_path: Path = ""
screenshot_before_path: Path = ""
screenshot_after_path: Path = ""
async def run(self, task_dir: Path, docs_dir: Path):
doc_count = 0
self.record_path = Path(task_dir) / "record.txt"
self.task_desc_path = Path(task_dir) / "task_desc.txt"
self.screenshot_before_path = Path(task_dir) / "raw_screenshots"
self.screenshot_after_path = Path(task_dir) / "labeled_screenshots"
for path in [self.screenshot_before_path, self.screenshot_after_path]:
path.mkdir(parents=True, exist_ok=True)
task_desc = self.task_desc_path.read_text()
extra_config = config.extra
with open(self.record_path, "r") as record_file:
record_step_count = len(record_file.readlines()) - 1
record_file.seek(0)
for step in range(1, record_step_count + 1):
img_before_base64 = encode_image(self.screenshot_after_path.joinpath(f"{step}_labeled.png"))
img_after_base64 = encode_image(self.screenshot_after_path.joinpath(f"{step + 1}_labeled.png"))
rec = record_file.readline().strip()
action, resource_id = rec.split(":::")
action_type = action.split("(")[0]
# Build the prompt for this recorded action
action_param = re.findall(r"\((.*?)\)", action)[0]
if action_type == ActionOp.TAP.value:
prompt_template = tap_doc_template
context = prompt_template.format(ui_element=action_param)
elif action_type == ActionOp.TEXT.value:
input_area, input_text = action_param.split(":sep:")
prompt_template = text_doc_template
context = prompt_template.format(ui_element=input_area)
elif action_type == ActionOp.LONG_PRESS.value:
prompt_template = long_press_doc_template
context = prompt_template.format(ui_element=action_param)
elif action_type == ActionOp.SWIPE.value:
swipe_area, swipe_dir = action_param.split(":sep:")
if swipe_dir == SwipeOp.UP.value or swipe_dir == SwipeOp.DOWN.value:
action_type = ActionOp.VERTICAL_SWIPE.value
elif swipe_dir == SwipeOp.LEFT.value or swipe_dir == SwipeOp.RIGHT.value:
action_type = ActionOp.HORIZONTAL_SWIPE.value
prompt_template = swipe_doc_template
context = prompt_template.format(swipe_dir=swipe_dir, ui_element=swipe_area)
else:
break
context = context.format(task_desc=task_desc)
doc_name = resource_id + ".txt"
doc_path = docs_dir.joinpath(doc_name)
if doc_path.exists():
try:
doc_content = ast.literal_eval(doc_path.read_text())
except Exception as exp:
logger.error(f"ast parse doc: {doc_path} failed, exp: {exp}")
continue
if doc_content[action_type]:
if extra_config.get("refine_doc", False):
refine_context = refine_doc_suffix.format(old_doc=doc_content[action_type])
context += refine_context
logger.info(
f"Documentation for the element {resource_id} already exists. The doc will be "
f"refined based on the latest demo."
)
else:
logger.info(
f"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE "
f"in the config file if needed."
)
continue
else:
doc_content = {"tap": "", "text": "", "v_swipe": "", "h_swipe": "", "long_press": ""}
logger.info(f"Waiting for GPT-4V to generate documentation for the element {resource_id}")
node = await RECORD_PARSE_NODE.fill(
context=context, llm=self.llm, images=[img_before_base64, img_after_base64]
)
if "error" in node.content:
return AndroidActionOutput(action_state=RunState.FAIL)
log_path = task_dir.joinpath("log_parse_record.txt")
prompt = node.compile(context=context, schema="json", mode="auto")
msg = node.content
doc_content[action_type] = msg
with open(log_path, "a") as logfile:
log_item = RecordLogItem(
step=step,
prompt=prompt,
image_before=img_before_base64,
image_after=img_after_base64,
response=node.content,
)
logfile.write(log_item.model_dump_json() + "\n")
with open(doc_path, "w") as outfile:
outfile.write(str(doc_content))
doc_count += 1
logger.info(f"Documentation generated and saved to {doc_path}")
logger.info(f"Documentation generation phase completed. {doc_count} docs generated.")
return AndroidActionOutput(action_state=RunState.FINISH)
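`ManualRecord` writes one line per step in the form `action(params):::resource_id` (see the `log_str` strings above), and `ParseRecord` recovers the pieces with `split(":::")` plus a regex. A minimal sketch of that round trip, using a hypothetical resource id:

```python
import re

# Hypothetical record line, in the format produced by ManualRecord's log_str above.
rec = "text(3:sep:'When will we release this feature?'):::com.example:id/input_box"

action, resource_id = rec.split(":::")
action_type = action.split("(")[0]                  # -> "text"
action_param = re.findall(r"\((.*?)\)", action)[0]  # -> "3:sep:'When will we release this feature?'"

if action_type == "text":
    input_area, input_text = action_param.split(":sep:")
print(action_type, resource_id, input_area, input_text)
```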

View file

@ -0,0 +1,32 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : the ActionNode to parse record
from metagpt.actions.action_node import ActionNode
OBSERVATION = ActionNode(
key="Observation",
expected_type=str,
instruction="Provide a description of your observations of the two images. "
"Subsequently, delineate the distinctions between the first image and the second one.",
example="",
)
THOUGHT = ActionNode(
key="Thought",
expected_type=str,
instruction="Consider the impact of Action acting on UI elements.",
example="",
)
DESCRIPTION = ActionNode(
key="Description",
expected_type=str,
instruction="Describe the functionality of the UI element concisely in one or two sentences Do not include "
"the numeric tag in your description",
example="",
)
NODES = [OBSERVATION, THOUGHT, DESCRIPTION]
RECORD_PARSE_NODE = ActionNode.from_children("RecordParse", NODES)

View file

@ -0,0 +1,204 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : LIKE scripts/task_executor.py in stage=act
import ast
from pathlib import Path
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.environment.android.android_env import AndroidEnv
from metagpt.environment.android.const import ADB_EXEC_FAIL
from metagpt.environment.android.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
)
from metagpt.ext.android_assistant.actions.screenshot_parse_an import (
SCREENSHOT_PARSE_NODE,
)
from metagpt.ext.android_assistant.prompts.assistant_prompt import (
screenshot_parse_template,
screenshot_parse_with_grid_template,
)
from metagpt.ext.android_assistant.utils.schema import (
AndroidActionOutput,
AndroidElement,
GridOpParam,
LongPressGridOpParam,
LongPressOpParam,
OpLogItem,
RunState,
SwipeGridOpParam,
SwipeOpParam,
TapGridOpParam,
TapOpParam,
TextOpParam,
)
from metagpt.ext.android_assistant.utils.utils import (
area_to_xy,
draw_bbox_multi,
draw_grid,
elem_bbox_to_xy,
screenshot_parse_extract,
traverse_xml_tree,
)
from metagpt.logs import logger
from metagpt.utils.common import encode_image
class ScreenshotParse(Action):
name: str = "ScreenshotParse"
def _makeup_ui_document(self, elem_list: list[AndroidElement], docs_dir: Path, use_exist_doc: bool = True) -> str:
if not use_exist_doc:
return ""
ui_doc = """
You also have access to the following documentation that describes the functionalities of UI
elements you can interact with on the screen. These docs are crucial for you to determine the target of your
next action. You should always prioritize these documented elements for interaction: """
for i, elem in enumerate(elem_list):
doc_path = docs_dir.joinpath(f"{elem.uid}.txt")
if not doc_path.exists():
continue
try:
doc_content = ast.literal_eval(doc_path.read_text())
except Exception as exp:
logger.error(f"ast parse doc: {doc_path} failed, exp: {exp}")
continue
ui_doc += f"Documentation of UI element labeled with the numeric tag '{i + 1}':\n"
if doc_content["tap"]:
ui_doc += f"This UI element is clickable. {doc_content['tap']}\n\n"
if doc_content["text"]:
ui_doc += (
f"This UI element can receive text input. The text input is used for the following "
f"purposes: {doc_content['text']}\n\n"
)
if doc_content["long_press"]:
ui_doc += f"This UI element is long clickable. {doc_content['long_press']}\n\n"
if doc_content["v_swipe"]:
ui_doc += (
f"This element can be swiped directly without tapping. You can swipe vertically on "
f"this UI element. {doc_content['v_swipe']}\n\n"
)
if doc_content["h_swipe"]:
ui_doc += (
f"This element can be swiped directly without tapping. You can swipe horizontally on "
f"this UI element. {doc_content['h_swipe']}\n\n"
)
return ui_doc
async def run(
self,
round_count: int,
task_desc: str,
last_act: str,
task_dir: Path,
docs_dir: Path,
grid_on: bool,
env: AndroidEnv,
):
extra_config = config.extra
for path in [task_dir, docs_dir]:
path.mkdir(parents=True, exist_ok=True)
screenshot_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir)
)
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
)
if not screenshot_path.exists() or not xml_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
clickable_list = []
focusable_list = []
traverse_xml_tree(xml_path, clickable_list, "clickable", True)
traverse_xml_tree(xml_path, focusable_list, "focusable", True)
elem_list: list[AndroidElement] = clickable_list.copy()
for elem in focusable_list:
bbox = elem.bbox
center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
close = False
for e in clickable_list:
bbox = e.bbox
center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
if dist <= extra_config.get("min_dist", 30):
close = True
break
if not close:
elem_list.append(elem)
screenshot_labeled_path = task_dir.joinpath(f"{round_count}_labeled.png")
draw_bbox_multi(screenshot_path, screenshot_labeled_path, elem_list)
img_base64 = encode_image(screenshot_labeled_path)
parse_template = screenshot_parse_with_grid_template if grid_on else screenshot_parse_template
if grid_on:
env.rows, env.cols = draw_grid(screenshot_path, task_dir / f"{round_count}_grid.png")
ui_doc = self._makeup_ui_document(elem_list, docs_dir)
context = parse_template.format(ui_document=ui_doc, task_description=task_desc, last_act=last_act)
node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64])
if "error" in node.content:
return AndroidActionOutput(action_state=RunState.FAIL)
prompt = node.compile(context=context, schema="json", mode="auto")
OpLogItem(step=round_count, prompt=prompt, image=str(screenshot_labeled_path), response=node.content)
op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on)
if op_param.param_state == RunState.FINISH:
logger.info(f"op_param: {op_param}")
return AndroidActionOutput(action_state=RunState.FINISH)
if op_param.param_state == RunState.FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
last_act = op_param.last_act
if isinstance(op_param, TapOpParam):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
elif isinstance(op_param, TextOpParam):
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
elif isinstance(op_param, LongPressOpParam):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeOpParam):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(
action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
)
elif isinstance(op_param, GridOpParam):
grid_on = True
elif isinstance(op_param, TapGridOpParam) or isinstance(op_param, LongPressGridOpParam):
x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols)
if isinstance(op_param, TapGridOpParam):
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
else:
# LongPressGridOpParam
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeGridOpParam):
start_x, start_y = area_to_xy(
op_param.start_area, op_param.start_subarea, env.width, env.height, env.rows, env.cols
)
end_x, end_y = area_to_xy(
op_param.end_area, op_param.end_subarea, env.width, env.height, env.rows, env.cols
)
action = EnvAction(
action_type=EnvActionType.USER_SWIPE_TO, coord=(start_x, start_y), tgt_coord=(end_x, end_y)
)
if not grid_on:
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
if op_param.act_name != "grid":
grid_on = False
return AndroidActionOutput(data={"grid_on": grid_on, "last_act": last_act})

View file

@ -0,0 +1,48 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : the ActionNode to parse screenshot
from metagpt.actions.action_node import ActionNode
OBSERVATION = ActionNode(
key="Observation", expected_type=str, instruction="Describe what you observe in the image", example=""
)
THOUGHT = ActionNode(
key="Thought",
expected_type=str,
instruction="To complete the given task, what is the next step I should do",
example="",
)
ACTION = ActionNode(
key="Action",
expected_type=str,
instruction="The function call with the correct parameters to proceed with the task. If you believe the task is "
"completed or there is nothing to be done, you should output FINISH. You cannot output anything else "
"except a function call or FINISH in this field.",
example="",
)
SUMMARY = ActionNode(
key="Summary",
expected_type=str,
instruction="Summarize your past actions along with your latest action in one or two sentences. Do not include "
"the numeric tag in your summary",
example="",
)
SUMMARY_GRID = ActionNode(
key="Summary",
expected_type=str,
instruction="Summarize your past actions along with your latest action in one or two sentences. Do not include "
"the grid area number in your summary",
example="",
)
NODES = [OBSERVATION, THOUGHT, ACTION, SUMMARY]
NODES_GRID = [OBSERVATION, THOUGHT, ACTION, SUMMARY_GRID]
SCREENSHOT_PARSE_NODE = ActionNode.from_children("ScreenshotParse", NODES)
SCREENSHOT_PARSE_GRID_NODE = ActionNode.from_children("ScreenshotParseGrid", NODES_GRID)

View file

@ -0,0 +1,231 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : LIKE scripts/self_explorer.py in stage=learn & mode=auto self_explore_task stage
import ast
from pathlib import Path
from metagpt.actions.action import Action
from metagpt.config2 import config
from metagpt.environment.android.android_env import AndroidEnv
from metagpt.environment.android.const import ADB_EXEC_FAIL
from metagpt.environment.android.env_space import (
EnvAction,
EnvActionType,
EnvObsParams,
EnvObsType,
)
from metagpt.ext.android_assistant.actions.screenshot_parse_an import (
SCREENSHOT_PARSE_NODE,
)
from metagpt.ext.android_assistant.actions.self_learn_reflect_an import (
SELF_LEARN_REFLECT_NODE,
)
from metagpt.ext.android_assistant.prompts.assistant_prompt import (
screenshot_parse_self_explore_reflect_template as reflect_template,
)
from metagpt.ext.android_assistant.prompts.assistant_prompt import (
screenshot_parse_self_explore_template,
)
from metagpt.ext.android_assistant.utils.schema import (
ActionOp,
AndroidActionOutput,
AndroidElement,
Decision,
DocContent,
LongPressOpParam,
OpLogItem,
ReflectLogItem,
RunState,
SwipeOp,
SwipeOpParam,
TapOpParam,
TextOpParam,
)
from metagpt.ext.android_assistant.utils.utils import (
draw_bbox_multi,
elem_bbox_to_xy,
elem_list_from_xml_tree,
reflect_parse_extarct,
screenshot_parse_extract,
)
from metagpt.logs import logger
from metagpt.utils.common import encode_image
class SelfLearnAndReflect(Action):
name: str = "SelfLearnAndReflect"
useless_list: list[str] = [] # store useless elements uid
screenshot_before_path: str = ""
screenshot_before_base64: str = ""
elem_list: list[AndroidElement] = []
swipe_orient: str = "up"
act_name: str = ""
ui_area: int = -1
async def run(
self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
) -> AndroidActionOutput:
for path in [task_dir, docs_dir]:
path.mkdir(parents=True, exist_ok=True)
resp = await self.run_self_learn(round_count, task_desc, last_act, task_dir, env)
if resp.action_state != RunState.SUCCESS:
return resp
resp = await self.run_reflect(round_count, task_desc, last_act, task_dir, docs_dir, env)
return resp
async def run_self_learn(
self, round_count: int, task_desc: str, last_act: str, task_dir: Path, env: AndroidEnv
) -> AndroidActionOutput:
extra_config = config.extra
screenshot_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir)
)
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
)
if not screenshot_path.exists() or not xml_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
elem_list = elem_list_from_xml_tree(xml_path, self.useless_list, extra_config.get("min_dist", 30))
screenshot_before_labeled_path = task_dir.joinpath(f"{round_count}_before_labeled.png")
draw_bbox_multi(screenshot_path, screenshot_before_labeled_path, elem_list)
img_base64 = encode_image(screenshot_before_labeled_path)
self.screenshot_before_base64 = img_base64
self.screenshot_before_path = screenshot_before_labeled_path
self_explore_template = screenshot_parse_self_explore_template
context = self_explore_template.format(task_description=task_desc, last_act=last_act)
node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64])
logger.debug(f"fill result:{node}")
if "error" in node.content:
return AndroidActionOutput(action_state=RunState.FAIL)
prompt = node.compile(context=context, schema="json", mode="auto")
# Convert WindowsPath to str
OpLogItem(step=round_count, prompt=prompt, image=str(screenshot_before_labeled_path), response=node.content)
op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on=False)
# TODO: refine op_param handling. What should happen when op_param.action is FINISH?
if op_param.param_state == RunState.FINISH:
return AndroidActionOutput(action_state=RunState.FINISH)
if op_param.param_state == RunState.FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
if isinstance(op_param, TapOpParam):
self.ui_area = op_param.area
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
elif isinstance(op_param, TextOpParam):
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
elif isinstance(op_param, LongPressOpParam):
self.ui_area = op_param.area
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeOpParam):
self.ui_area = op_param.area
self.swipe_orient = op_param.swipe_orient
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(
action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
)
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
self.elem_list = elem_list
self.act_name = op_param.act_name
return AndroidActionOutput()
async def run_reflect(
self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
) -> AndroidActionOutput:
screenshot_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_after", local_save_dir=task_dir)
)
if not screenshot_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
screenshot_after_labeled_path = task_dir.joinpath(f"{round_count}_after_labeled.png")
draw_bbox_multi(screenshot_path, screenshot_after_labeled_path, elem_list=self.elem_list)
img_base64 = encode_image(screenshot_after_labeled_path)
if self.act_name == ActionOp.TAP.value:
action = "tapping"
elif self.act_name == ActionOp.LONG_PRESS.value:
action = "long pressing"
elif self.act_name == ActionOp.SWIPE.value:
action = "swiping"
if self.swipe_orient == SwipeOp.UP.value or self.swipe_orient == SwipeOp.DOWN.value:
action = "v_swipe"
elif self.swipe_orient == SwipeOp.LEFT.value or self.swipe_orient == SwipeOp.RIGHT.value:
action = "h_swipe"
else:
# TODO: verify this assignment; this failure is coupled with the IndexError noted below.
logger.warning(f"Current action name parse failed, it's `{self.act_name}`")
action = None
context = reflect_template.format(
action=action, ui_element=str(self.ui_area), task_desc=task_desc, last_act=last_act
)
node = await SELF_LEARN_REFLECT_NODE.fill(
context=context, llm=self.llm, images=[self.screenshot_before_base64, img_base64]
)
if "error" in node.content:
return AndroidActionOutput(action_state=RunState.FAIL)
prompt = node.compile(context=context, schema="json", mode="auto")
ReflectLogItem(
step=round_count,
prompt=prompt,
image_before=str(self.screenshot_before_path),
image_after=str(screenshot_after_labeled_path),
response=node.content,
)
op_param = reflect_parse_extarct(node.instruct_content.model_dump())
if op_param.param_state == RunState.FINISH:
return AndroidActionOutput(action_state=RunState.FINISH)
if op_param.param_state == RunState.FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
logger.info(
f"reflect_parse_extarct decision: {op_param.decision}, "
f"elem_list size: {len(self.elem_list)}, ui_area: {self.ui_area}"
)
# TODO: this can raise `IndexError: list index out of range`.
# Maybe you should click back to the desktop in the emulator.
resource_id = self.elem_list[int(self.ui_area) - 1].uid
if op_param.decision == Decision.INEFFECTIVE.value:
self.useless_list.append(resource_id)
last_act = "NONE" # TODO global
elif op_param.decision in [Decision.BACK.value, Decision.CONTINUE.value, Decision.SUCCESS.value]:
if op_param.decision in [Decision.BACK.value, Decision.CONTINUE.value]:
self.useless_list.append(resource_id)
last_act = "NONE"
if op_param.decision == Decision.BACK.value:
action = EnvAction(action_type=EnvActionType.SYSTEM_BACK)
obs, _, _, _, info = env.step(action)
if info["res"] == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
doc = op_param.documentation
doc_path = docs_dir.joinpath(f"{resource_id}.txt")
if doc_path.exists():
try:
doc_content = ast.literal_eval(doc_path.read_text())
except Exception as exp:
logger.error(f"ast parse doc: {doc_path} failed, exp: {exp}")
return AndroidActionOutput(action_state=RunState.FAIL)
if doc_content[self.act_name]:
logger.info(f"Documentation for the element {resource_id} already exists.")
return AndroidActionOutput(action_state=RunState.FAIL)
else:
doc_content = DocContent()
setattr(doc_content, self.act_name, doc)
doc_path.write_text(str(doc_content))
return AndroidActionOutput(data={"last_act": last_act})

View file

@ -0,0 +1,21 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : the ActionNode to parse Reflection
from metagpt.actions.action_node import ActionNode
DECISION = ActionNode(
key="Decision", expected_type=str, instruction="explain why you made this decision", example="BACK"
)
THOUGHT = ActionNode(key="Thought", expected_type=str, instruction="explain why you made this decision", example="")
DOCUMENTATION = ActionNode(
key="Documentation", expected_type=str, instruction="describe the function of the UI element", example=""
)
NODES = [DECISION, THOUGHT, DOCUMENTATION]
SELF_LEARN_REFLECT_NODE = ActionNode.from_children("SelfLearnReflect", NODES)

View file

@ -0,0 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc :

View file

@ -0,0 +1,168 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : the prompt templates of assistant learning and acting
screenshot_parse_template = """You are an agent that is trained to perform some basic tasks on a smartphone. You will be given a
smartphone screenshot. The interactive UI elements on the screenshot are labeled with numeric tags starting from 1. The
numeric tag of each interactive element is located in the center of the element.
You can call the following functions to control the smartphone:
1. tap(element: int)
This function is used to tap an UI element shown on the smartphone screen.
"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
A simple use case can be tap(5), which taps the UI element labeled with the number 5.
2. text(text_input: str)
This function is used to insert text input in an input field/box. text_input is the string you want to insert and must
be wrapped with double quotation marks. A simple use case can be text("Hello, world!"), which inserts the string
"Hello, world!" into the input area on the smartphone screen. This function is usually callable when you see a keyboard
showing in the lower half of the screen.
3. long_press(element: int)
This function is used to long press an UI element shown on the smartphone screen.
"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
A simple use case can be long_press(5), which long presses the UI element labeled with the number 5.
4. swipe(element: int, direction: str, dist: str)
This function is used to swipe an UI element shown on the smartphone screen, usually a scroll view or a slide bar.
"element" is a numeric tag assigned to an UI element shown on the smartphone screen. "direction" is a string that
represents one of the four directions: up, down, left, right. "direction" must be wrapped with double quotation
marks. "dist" determines the distance of the swipe and can be one of the three options: short, medium, long. You should
choose the appropriate distance option according to your need.
A simple use case can be swipe(21, "up", "medium"), which swipes up the UI element labeled with the number 21 for a
medium distance.
5. grid()
You should call this function when you find the element you want to interact with is not labeled with a numeric tag and
other elements with numeric tags cannot help with the task. The function will bring up a grid overlay to divide the
smartphone screen into small areas and this will give you more freedom to choose any part of the screen to tap, long
press, or swipe.
{ui_document}
The task you need to complete is to: {task_description}. Your past actions to proceed with this task are summarized as
follows: {last_act}
Now, given the documentation and the following labeled screenshot, you need to think and call the function needed to
proceed with the task. Your output should include three parts in the given format:
You can only take one action at a time, so please directly call the function."""
screenshot_parse_with_grid_template = """You are an agent that is trained to perform some basic tasks on a smartphone. You will be given
a smartphone screenshot overlaid by a grid. The grid divides the screenshot into small square areas. Each area is
labeled with an integer in the top-left corner.
You can call the following functions to control the smartphone:
1. tap(area: int, subarea: str)
This function is used to tap a grid area shown on the smartphone screen. "area" is the integer label assigned to a grid
area shown on the smartphone screen. "subarea" is a string representing the exact location to tap within the grid area.
It can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom, and
bottom-right.
A simple use case can be tap(5, "center"), which taps the exact center of the grid area labeled with the number 5.
2. long_press(area: int, subarea: str)
This function is used to long press a grid area shown on the smartphone screen. "area" is the integer label assigned to
a grid area shown on the smartphone screen. "subarea" is a string representing the exact location to long press within
the grid area. It can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom,
and bottom-right.
A simple use case can be long_press(7, "top-left"), which long presses the top left part of the grid area labeled with
the number 7.
3. swipe(start_area: int, start_subarea: str, end_area: int, end_subarea: str)
This function is used to perform a swipe action on the smartphone screen, especially when you want to interact with a
scroll view or a slide bar. "start_area" is the integer label assigned to the grid area which marks the starting
location of the swipe. "start_subarea" is a string representing the exact location to begin the swipe within the grid
area. "end_area" is the integer label assigned to the grid area which marks the ending location of the swipe.
"end_subarea" is a string representing the exact location to end the swipe within the grid area.
The two subarea parameters can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left,
bottom, and bottom-right.
A simple use case can be swipe(21, "center", 25, "right"), which performs a swipe starting from the center of grid area
21 to the right part of grid area 25.
The task you need to complete is to: {task_description}. Your past actions to proceed with this task are summarized as
follows: {last_act}
Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
Your output should include three parts in the given format:
You can only take one action at a time, so please directly call the function."""
screenshot_parse_self_explore_template = """You are an agent that is trained to complete certain tasks on a smartphone. You will be
given a screenshot of a smartphone app. The interactive UI elements on the screenshot are labeled with numeric tags
starting from 1.
You can call the following functions to interact with those labeled elements to control the smartphone:
1. tap(element: int)
This function is used to tap a UI element shown on the smartphone screen.
"element" is a numeric tag assigned to a UI element shown on the smartphone screen.
A simple use case can be tap(5), which taps the UI element labeled with the number 5.
2. text(text_input: str)
This function is used to insert text input in an input field/box. text_input is the string you want to insert and must
be wrapped with double quotation marks. A simple use case can be text("Hello, world!"), which inserts the string
"Hello, world!" into the input area on the smartphone screen. This function is only callable when you see a keyboard
showing in the lower half of the screen.
3. long_press(element: int)
This function is used to long press a UI element shown on the smartphone screen.
"element" is a numeric tag assigned to a UI element shown on the smartphone screen.
A simple use case can be long_press(5), which long presses the UI element labeled with the number 5.
4. swipe(element: int, direction: str, dist: str)
This function is used to swipe a UI element shown on the smartphone screen, usually a scroll view or a slide bar.
"element" is a numeric tag assigned to a UI element shown on the smartphone screen. "direction" is a string that
represents one of the four directions: up, down, left, right. "direction" must be wrapped with double quotation
marks. "dist" determines the distance of the swipe and can be one of the three options: short, medium, long. You should
choose the appropriate distance option according to your need.
A simple use case can be swipe(21, "up", "medium"), which swipes up the UI element labeled with the number 21 for a
medium distance.
The task you need to complete is to {task_description}. Your past actions to proceed with this task are summarized as
follows: {last_act}
Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
Your output should include three parts in the given format:
You can only take one action at a time, so please directly call the function."""
screenshot_parse_self_explore_reflect_template = """I will give you screenshots of a mobile app before and after {action} the UI
element labeled with the number '{ui_element}' on the first screenshot. The numeric tag of each element is located at
the center of the element. The action of {action} this UI element was described as follows:
{last_act}
The action was also an attempt to proceed with a larger task, which is to {task_desc}. Your job is to carefully analyze
the difference between the two screenshots to determine if the action is in accord with the description above and at
the same time effectively moved the task forward. Your output should be determined based on the following situations:
1. BACK
If you think the action navigated you to a page where you cannot proceed with the given task, you should go back to the
previous interface. At the same time, describe the functionality of the UI element concisely in one or two sentences by
observing the difference between the two screenshots. Notice that your description of the UI element should focus on
the general function. Never include the numeric tag of the UI element in your description. You can use pronouns such as
"the UI element" to refer to the element. Your output should be in the following format:
Decision: BACK
Thought: <explain why you think the last action is wrong and you should go back to the previous interface>
Documentation: <describe the function of the UI element>
2. INEFFECTIVE
If you find the action changed nothing on the screen (screenshots before and after the action are identical), you
should continue to interact with other elements on the screen. Notice that if you find the location of the cursor
changed between the two screenshots, then they are not identical. Your output should be in the following format:
Decision: INEFFECTIVE
Thought: <explain why you made this decision>
Documentation: <None>
3. CONTINUE
If you find the action changed something on the screen but did not reflect the action description above and did not
move the given task forward, you should continue to interact with other elements on the screen. At the same time,
describe the functionality of the UI element concisely in one or two sentences by observing the difference between the
two screenshots. Notice that your description of the UI element should focus on the general function. Never include the
numeric tag of the UI element in your description. You can use pronouns such as "the UI element" to refer to the
element. Your output should be in the following format:
Decision: CONTINUE
Thought: <explain why you think the action does not reflect the action description above and did not move the given
task forward>
Documentation: <describe the function of the UI element>
4. SUCCESS
If you think the action successfully moved the task forward (even though it did not complete the task), you should
describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI
element should focus on the general function. Never include the numeric tag of the UI element in your description. You
can use pronouns such as "the UI element" to refer to the element. Your output should be in the following format:
Decision: SUCCESS
Thought: <explain why you think the action successfully moved the task forward>
Documentation: <describe the function of the UI element>
"""

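# Editor's note (illustrative, not part of the original prompt set): the templates above are plain Python
# format strings. Assuming they are filled with str.format before being sent to the multimodal LLM together
# with the labeled screenshot(s), a minimal usage sketch would be:
#   prompt = screenshot_parse_self_explore_template.format(
#       task_description="Create a contact named zjy", last_act="None"
#   )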
View file

@ -0,0 +1,45 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : the prompt templates of phone operation
tap_doc_template = """I will give you the screenshot of a mobile app before and after tapping the UI element labeled
with the number {ui_element} on the screen. The numeric tag of each element is located at the center of the element.
Tapping this UI element is a necessary part of proceeding with a larger task, which is to <task_desc>. Your task is to
describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI
element should focus on the general function. For example, if the UI element is used to navigate to the chat window
with John, your description should not include the name of the specific person. Just say: "Tapping this area will
navigate the user to the chat window". Never include the numeric tag of the UI element in your description. You can use
pronouns such as "the UI element" to refer to the element."""
text_doc_template = """I will give you the screenshot of a mobile app before and after typing in the input area labeled
with the number {ui_element} on the screen. The numeric tag of each element is located at the center of the element.
Typing in this UI element is a necessary part of proceeding with a larger task, which is to <task_desc>. Your task is
to describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the
UI element should focus on the general function. For example, if the change of the screenshot shows that the user typed
"How are you?" in the chat box, you do not need to mention the actual text. Just say: "This input area is used for the
user to type a message to send to the chat window". Never include the numeric tag of the UI element in your
description. You can use pronouns such as "the UI element" to refer to the element."""
long_press_doc_template = """I will give you the screenshot of a mobile app before and after long pressing the UI
element labeled with the number {ui_element} on the screen. The numeric tag of each element is located at the center of
the element. Long pressing this UI element is a necessary part of proceeding with a larger task, which is to
<task_desc>. Your task is to describe the functionality of the UI element concisely in one or two sentences. Notice
that your description of the UI element should focus on the general function. For example, if long pressing the UI
element redirects the user to the chat window with John, your description should not include the name of the specific
person. Just say: "Long pressing this area will redirect the user to the chat window". Never include the numeric tag of
the UI element in your description. You can use pronouns such as "the UI element" to refer to the element."""
swipe_doc_template = """I will give you the screenshot of a mobile app before and after swiping <swipe_dir> the UI
element labeled with the number {ui_element} on the screen. The numeric tag of each element is located at the center of
the element. Swiping this UI element is a necessary part of proceeding with a larger task, which is to <task_desc>.
Your task is to describe the functionality of the UI element concisely in one or two sentences. Notice that your
description of the UI element should be as general as possible. For example, if swiping the UI element increases the
contrast ratio of an image of a building, your description should be just like this: "Swiping this area enables the
user to tune a specific parameter of the image". Never include the numeric tag of the UI element in your description.
You can use pronouns such as "the UI element" to refer to the element."""
refine_doc_suffix = """\nA documentation of this UI element generated from previous demos is shown below. Your
generated description should be based on this previous doc and optimize it. Notice that it is possible that your
understanding of the function of the UI element derived from the given screenshots conflicts with the previous doc,
because the function of a UI element can be flexible. In this case, your generated description should combine both.
Old documentation of this UI element: {old_doc}"""
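# Editor's note (illustrative, not part of the original file): {ui_element} and {old_doc} are standard
# str.format placeholders, while <task_desc> and <swipe_dir> look like markers that are substituted
# separately (an assumption). A hedged usage sketch:
#   doc_prompt = tap_doc_template.format(ui_element=3).replace("<task_desc>", task_desc)
#   doc_prompt += refine_doc_suffix.format(old_doc=old_doc)  # only when refining an existing doc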

View file

@ -0,0 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc :

View file

@ -0,0 +1,146 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : android assistant to learn from app operations and operate apps
import time
from datetime import datetime
from pathlib import Path
from typing import Optional
from pydantic import Field
from metagpt.actions.add_requirement import UserRequirement
from metagpt.config2 import config
from metagpt.const import EXAMPLE_PATH
from metagpt.ext.android_assistant.actions.manual_record import ManualRecord
from metagpt.ext.android_assistant.actions.parse_record import ParseRecord
from metagpt.ext.android_assistant.actions.screenshot_parse import ScreenshotParse
from metagpt.ext.android_assistant.actions.self_learn_and_reflect import (
SelfLearnAndReflect,
)
from metagpt.ext.android_assistant.utils.schema import AndroidActionOutput, RunState
from metagpt.logs import logger
from metagpt.roles.role import Role, RoleReactMode
from metagpt.schema import Message
class AndroidAssistant(Role):
name: str = "Nick"
profile: str = "AndroidAssistant"
goal: str = "operate the mobile phone's apps with self-learn"
task_desc: str = ""
round_count: int = 0
last_act: str = "None"
output_root_dir: Optional[Path] = Field(default=None)
task_dir: Optional[Path] = Field(default=None)
docs_dir: Optional[Path] = Field(default=None)
grid_on: bool = Field(default=False)
def __init__(self, **data):
super().__init__(**data)
self._watch([UserRequirement, AndroidActionOutput])
extra_config = config.extra
self.task_desc = extra_config.get("task_desc", "Just explore any app in this phone!")
app_name = extra_config.get("app_name", "demo")
output_root_dir = self.output_root_dir or EXAMPLE_PATH.joinpath("android_assistant")
data_dir = output_root_dir.absolute().joinpath("output")
cur_datetime = datetime.fromtimestamp(int(time.time())).strftime("%Y-%m-%d_%H-%M-%S")
"""For now, the stage is decided by the user config; later it could be decided automatically, e.g. for a
new app, run the learn stage first and then the act stage, or learn while acting.
"""
stage = extra_config.get("stage")
mode = extra_config.get("mode")
if stage == "learn" and mode == "manual":
# choose ManualRecord and then run ParseRecord
# Note: each of these actions runs only once; there is no need to repeat them for n_round rounds.
self.set_actions([ManualRecord, ParseRecord])
self.task_dir = data_dir.joinpath(app_name, f"manual_learn_{cur_datetime}")
self.docs_dir = data_dir.joinpath(app_name, "manual_docs")
elif stage == "learn" and mode == "auto":
# choose SelfLearnAndReflect to run
self.set_actions([SelfLearnAndReflect])
self.task_dir = data_dir.joinpath(app_name, f"auto_learn_{cur_datetime}")
self.docs_dir = data_dir.joinpath(app_name, "auto_docs")
elif stage == "act":
# choose ScreenshotParse to run
self.set_actions([ScreenshotParse])
self.task_dir = data_dir.joinpath(app_name, f"act_{cur_datetime}")
if mode == "manual":
self.docs_dir = data_dir.joinpath(app_name, "manual_docs")
else:
self.docs_dir = data_dir.joinpath(app_name, "auto_docs")
else:
raise ValueError(f"invalid stage: {stage}, mode: {mode}")
self._check_dir()
self._set_react_mode(RoleReactMode.BY_ORDER)
def _check_dir(self):
self.task_dir.mkdir(parents=True, exist_ok=True)
self.docs_dir.mkdir(parents=True, exist_ok=True)
async def react(self) -> Message:
self.round_count += 1
result = await super().react()
logger.debug(f"react result {result}")
return result
async def _observe(self, ignore_memory=True) -> int:
"""ignore old memory so the role can run multiple rounds internally"""
newest_msgs = self.rc.memory.get(k=1)
newest_msg = newest_msgs[0] if newest_msgs else None
if newest_msg and (RunState.SUCCESS.value.upper() not in newest_msg.content):
ignore_memory = False
state_val = newest_msg.content.split(".")[-1] # RoundCount: 1, action_state: RunState.SUCCESS
logger.warning(f"Latest action_state is {state_val}, will run in the remaining rounds without `react`")
return await super()._observe(ignore_memory)
async def _act(self) -> Message:
logger.info(f"{self._setting}: to do {self.rc.todo}({self.rc.todo.name})")
todo = self.rc.todo
if isinstance(todo, ManualRecord):
resp = await todo.run(task_dir=self.task_dir, task_desc=self.task_desc, env=self.rc.env)
elif isinstance(todo, ParseRecord):
resp = await todo.run(
task_dir=self.task_dir,
docs_dir=self.docs_dir,
)
elif isinstance(todo, SelfLearnAndReflect):
resp = await todo.run(
round_count=self.round_count,
task_desc=self.task_desc,
last_act=self.last_act,
task_dir=self.task_dir,
docs_dir=self.docs_dir,
env=self.rc.env,
)
if resp.action_state == RunState.SUCCESS:
self.last_act = resp.data.get("last_act")
elif isinstance(todo, ScreenshotParse):
resp = await todo.run(
round_count=self.round_count,
task_desc=self.task_desc,
last_act=self.last_act,
task_dir=self.task_dir,
docs_dir=self.docs_dir,
grid_on=self.grid_on,
env=self.rc.env,
)
if resp.action_state == RunState.SUCCESS:
logger.info(f"grid_on: {resp.data.get('grid_on')}")
self.grid_on = resp.data.get("grid_on", False)
self.last_act = resp.data.get("last_act", "None")
msg = Message(
content=f"RoundCount: {self.round_count}, action_state: {resp.action_state}",
role=self.profile,
cause_by=type(resp),
send_from=self.name,
send_to=self.name,
)
self.rc.memory.add(msg)
return msg

View file

@ -0,0 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc :

View file

@ -0,0 +1,158 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc :
from enum import Enum
from pydantic import BaseModel, Field, field_validator
class ActionOp(Enum):
TAP = "tap"
LONG_PRESS = "long_press"
TEXT = "text"
SWIPE = "swipe"
VERTICAL_SWIPE = "v_swipe"
HORIZONTAL_SWIPE = "h_swipe"
GRID = "grid"
STOP = "stop"
class SwipeOp(Enum):
UP = "up"
DOWN = "down"
LEFT = "left"
RIGHT = "right"
class Decision(Enum):
BACK = "BACK"
INEFFECTIVE = "INEFFECTIVE"
CONTINUE = "CONTINUE"
SUCCESS = "SUCCESS"
@classmethod
def values(cls):
return [item.value for item in cls]
class AndroidElement(BaseModel):
"""UI Element"""
uid: str = Field(default="")
bbox: tuple[tuple[int, int], tuple[int, int]] = Field(default=((0, 0), (0, 0)))
attrib: str = Field(default="")
class OpLogItem(BaseModel):
"""log content for self-learn or task act"""
step: int = Field(default=0)
prompt: str = Field(default="")
image: str = Field(default="")
response: str = Field(default="")
class ReflectLogItem(BaseModel):
"""log content for self-learn-reflect"""
step: int = Field(default=0)
prompt: str = Field(default="")
image_before: str = Field(default="")
image_after: str = Field(default="")
response: str = Field(default="")
class RecordLogItem(BaseModel):
"""log content for record parse, same as ReflectLogItem"""
step: int = Field(default=0)
prompt: str = Field(default="")
image_before: str = Field(default="")
image_after: str = Field(default="")
response: str = Field(default="")
class DocContent(BaseModel):
tap: str = Field(default="")
text: str = Field(default="")
v_swipe: str = Field(default="")
h_swipe: str = Field(default="")
long_press: str = Field(default="")
# start =================== define different Action Op and its params =============
class RunState(Enum):
"""run state"""
SUCCESS = "success"
FINISH = "finish"
FAIL = "fail"
class BaseOpParam(BaseModel):
act_name: str = Field(default="", validate_default=True)
last_act: str = Field(default="None")
param_state: RunState = Field(default=RunState.SUCCESS, description="return state when extract params")
class TapOpParam(BaseOpParam):
area: int = Field(default=-1)
class TextOpParam(BaseOpParam):
input_str: str = Field(default="")
class LongPressOpParam(BaseOpParam):
area: int = Field(default=-1)
# TODO: SwipeOpParam (renamed from SwipeOp) may still need a better name
class SwipeOpParam(BaseOpParam):
area: int = Field(default=-1)
swipe_orient: str = Field(default="up")
dist: str = Field(default="")
class GridOpParam(BaseOpParam):
act_name: str = Field(default="")
class BaseGridOpParam(BaseOpParam):
@field_validator("act_name", mode="before")
@classmethod
def check_act_name(cls, act_name: str) -> str:
return f"{act_name}_grid"
class TapGridOpParam(BaseGridOpParam):
area: int = Field(default=-1)
subarea: str = Field(default="")
class LongPressGridOpParam(BaseGridOpParam):
area: int = Field(default=-1)
subarea: str = Field(default="")
class SwipeGridOpParam(BaseGridOpParam):
start_area: int = Field(default=-1)
start_subarea: str = Field(default="")
end_area: int = Field(default=-1)
end_subarea: str = Field(default="")
# end =================== define different Action Op and its params =============
class ReflectOp(BaseModel):
decision: str = ""
thought: str = ""
documentation: str = ""
param_state: RunState = RunState.SUCCESS
class AndroidActionOutput(BaseModel):
data: dict = Field(default_factory=dict)
action_state: RunState = Field(default=RunState.SUCCESS)
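# Editor's sketch (illustrative, not part of the original module): a successful step could be reported as
#   AndroidActionOutput(data={"last_act": "Tapped the search bar", "grid_on": False}, action_state=RunState.SUCCESS)
# while a malformed LLM response would be surfaced as
#   AndroidActionOutput(action_state=RunState.FAIL)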

View file

@ -0,0 +1,329 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc :
import re
from pathlib import Path
from typing import Union
from xml.etree.ElementTree import Element, iterparse
import cv2
import pyshine as ps
from metagpt.config2 import config
from metagpt.ext.android_assistant.utils.schema import (
ActionOp,
AndroidElement,
BaseGridOpParam,
BaseOpParam,
Decision,
GridOpParam,
LongPressGridOpParam,
LongPressOpParam,
ReflectOp,
RunState,
SwipeGridOpParam,
SwipeOpParam,
TapGridOpParam,
TapOpParam,
TextOpParam,
)
from metagpt.logs import logger
def get_id_from_element(elem: Element) -> str:
bounds = elem.attrib["bounds"][1:-1].split("][")
x1, y1 = map(int, bounds[0].split(","))
x2, y2 = map(int, bounds[1].split(","))
elem_w, elem_h = x2 - x1, y2 - y1
if "resource-id" in elem.attrib and elem.attrib["resource-id"]:
elem_id = elem.attrib["resource-id"].replace(":", ".").replace("/", "_")
else:
elem_id = f"{elem.attrib['class']}_{elem_w}_{elem_h}"
if "content-desc" in elem.attrib and elem.attrib["content-desc"] and len(elem.attrib["content-desc"]) < 20:
content_desc = elem.attrib["content-desc"].replace("/", "_").replace(" ", "").replace(":", "_")
elem_id += f"_{content_desc}"
return elem_id
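# Example (illustrative): an element with bounds "[0,48][1080,183]" and resource-id "com.app:id/search_bar"
# yields the id "com.app.id_search_bar"; without a resource-id it falls back to "<class>_<width>_<height>",
# and a short content-desc, when present, is appended as an extra suffix.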
def traverse_xml_tree(xml_path: Path, elem_list: list[AndroidElement], attrib: str, add_index=False):
path = []
extra_config = config.extra
for event, elem in iterparse(str(xml_path), ["start", "end"]):
if event == "start":
path.append(elem)
if attrib in elem.attrib and elem.attrib[attrib] == "true":
parent_prefix = ""
if len(path) > 1:
parent_prefix = get_id_from_element(path[-2])
bounds = elem.attrib["bounds"][1:-1].split("][")
x1, y1 = map(int, bounds[0].split(","))
x2, y2 = map(int, bounds[1].split(","))
center = (x1 + x2) // 2, (y1 + y2) // 2
elem_id = get_id_from_element(elem)
if parent_prefix:
elem_id = parent_prefix + "_" + elem_id
if add_index:
elem_id += f"_{elem.attrib['index']}"
close = False
for e in elem_list:
bbox = e.bbox
center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
if dist <= extra_config.get("min_dist", 30):
close = True
break
if not close:
elem_list.append(AndroidElement(uid=elem_id, bbox=((x1, y1), (x2, y2)), attrib=attrib))
if event == "end":
path.pop()
def elem_list_from_xml_tree(xml_path: Path, useless_list: list[str], min_dist: int) -> list[AndroidElement]:
clickable_list = []
focusable_list = []
traverse_xml_tree(xml_path, clickable_list, "clickable", True)
traverse_xml_tree(xml_path, focusable_list, "focusable", True)
elem_list = []
for elem in clickable_list:
if elem.uid in useless_list:
continue
elem_list.append(elem)
for elem in focusable_list:
if elem.uid in useless_list:
continue
bbox = elem.bbox
center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
close = False
for e in clickable_list:
bbox = e.bbox
center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
if dist <= min_dist:
close = True
break
if not close:
elem_list.append(elem)
return elem_list
def draw_bbox_multi(
img_path: Path,
output_path: Path,
elem_list: list[AndroidElement],
record_mode: bool = False,
dark_mode: bool = False,
):
imgcv = cv2.imread(str(img_path))
count = 1
for elem in elem_list:
try:
top_left = elem.bbox[0]
bottom_right = elem.bbox[1]
left, top = top_left[0], top_left[1]
right, bottom = bottom_right[0], bottom_right[1]
label = str(count)
if record_mode:
if elem.attrib == "clickable":
color = (250, 0, 0)
elif elem.attrib == "focusable":
color = (0, 0, 250)
else:
color = (0, 250, 0)
imgcv = ps.putBText(
imgcv,
label,
text_offset_x=(left + right) // 2 + 10,
text_offset_y=(top + bottom) // 2 + 10,
vspace=10,
hspace=10,
font_scale=1,
thickness=2,
background_RGB=color,
text_RGB=(255, 250, 250),
alpha=0.5,
)
else:
text_color = (10, 10, 10) if dark_mode else (255, 250, 250)
bg_color = (255, 250, 250) if dark_mode else (10, 10, 10)
imgcv = ps.putBText(
imgcv,
label,
text_offset_x=(left + right) // 2 + 10,
text_offset_y=(top + bottom) // 2 + 10,
vspace=10,
hspace=10,
font_scale=1,
thickness=2,
background_RGB=bg_color,
text_RGB=text_color,
alpha=0.5,
)
except Exception as e:
logger.error(f"ERROR: An exception occurs while labeling the image\n{e}")
count += 1
cv2.imwrite(str(output_path), imgcv)
return imgcv
def draw_grid(img_path: Path, output_path: Path) -> tuple[int, int]:
def get_unit_len(n):
for i in range(1, n + 1):
if n % i == 0 and 120 <= i <= 180:
return i
return -1
image = cv2.imread(str(img_path))
height, width, _ = image.shape
color = (255, 116, 113)
unit_height = get_unit_len(height)
if unit_height < 0:
unit_height = 120
unit_width = get_unit_len(width)
if unit_width < 0:
unit_width = 120
thick = int(unit_width // 50)
rows = height // unit_height
cols = width // unit_width
for i in range(rows):
for j in range(cols):
label = i * cols + j + 1
left = int(j * unit_width)
top = int(i * unit_height)
right = int((j + 1) * unit_width)
bottom = int((i + 1) * unit_height)
cv2.rectangle(image, (left, top), (right, bottom), color, thick // 2)
cv2.putText(
image,
str(label),
(left + int(unit_width * 0.05) + 3, top + int(unit_height * 0.3) + 3),
0,
int(0.01 * unit_width),
(0, 0, 0),
thick,
)
cv2.putText(
image,
str(label),
(left + int(unit_width * 0.05), top + int(unit_height * 0.3)),
0,
int(0.01 * unit_width),
color,
thick,
)
cv2.imwrite(str(output_path), image)
return rows, cols
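# Worked example (illustrative): for a 1080x2400 screenshot, get_unit_len returns 120 for both dimensions
# (the first divisor within [120, 180]), so the overlay is a 20-row by 9-column grid and draw_grid returns (20, 9).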
def area_to_xy(area: int, subarea: str, width: int, height: int, rows: int, cols: int) -> tuple[int, int]:
area -= 1
row, col = area // cols, area % cols
x_0, y_0 = col * (width // cols), row * (height // rows)
if subarea == "top-left":
x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) // 4
elif subarea == "top":
x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) // 4
elif subarea == "top-right":
x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) // 4
elif subarea == "left":
x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) // 2
elif subarea == "right":
x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) // 2
elif subarea == "bottom-left":
x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) * 3 // 4
elif subarea == "bottom":
x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) * 3 // 4
elif subarea == "bottom-right":
x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) * 3 // 4
else:
x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) // 2
return x, y
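# Worked example (illustrative): with the 20x9 grid above (width=1080, height=2400, rows=20, cols=9),
# area_to_xy(1, "center", 1080, 2400, 20, 9) returns (60, 60), the center of the top-left cell, and
# area_to_xy(1, "bottom-right", 1080, 2400, 20, 9) returns (90, 90).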
def elem_bbox_to_xy(bbox: tuple[tuple[int, int], tuple[int, int]]) -> tuple[int, int]:
tl, br = bbox
x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
return x, y
def reflect_parse_extarct(parsed_json: dict) -> ReflectOp:
decision = parsed_json.get("Decision")
if decision not in Decision.values():
op = ReflectOp(param_state=RunState.FAIL)
else:
op = ReflectOp(
decision=parsed_json.get("Decision"),
thought=parsed_json.get("Thought"),
documentation=parsed_json.get("Documentation"),
)
return op
def screenshot_parse_extract(
parsed_json: dict, grid_on: bool = False
) -> Union[BaseOpParam, BaseGridOpParam, GridOpParam]:
act = parsed_json.get("Action")
last_act = parsed_json.get("Summary")
act_name = act.split("(")[0]
if RunState.FINISH.value.upper() in act:
return BaseOpParam(param_state=RunState.FINISH)
if grid_on:
return screenshot_parse_extract_with_grid(act_name, act, last_act)
else:
return screenshot_parse_extract_without_grid(act_name, act, last_act)
def op_params_clean(params: list[str]) -> list[Union[int, str]]:
param_values = []
for param_value in params:
if '"' in param_value or "'" in param_value: # remove `"`
param_values.append(param_value.strip()[1:-1])
else:
param_values.append(int(param_value))
return param_values
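# Example (illustrative): op_params_clean(['21', ' "up"', ' "medium"']) returns [21, "up", "medium"];
# quoted values keep their string form with the quotes stripped, and bare values are cast to int.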
def screenshot_parse_extract_without_grid(act_name: str, act: str, last_act: str) -> Union[BaseOpParam, GridOpParam]:
if act_name == ActionOp.TAP.value:
area = int(re.findall(r"tap\((.*?)\)", act)[0])
op = TapOpParam(act_name=act_name, area=area, last_act=last_act)
elif act_name == ActionOp.TEXT.value:
input_str = re.findall(r"text\((.*?)\)", act)[0][1:-1]
op = TextOpParam(act_name=act_name, input_str=input_str, last_act=last_act)
elif act_name == ActionOp.LONG_PRESS.value:
area = int(re.findall(r"long_press\((.*?)\)", act)[0])
op = LongPressOpParam(act_name=act_name, area=area, last_act=last_act)
elif act_name == ActionOp.SWIPE.value:
params = re.findall(r"swipe\((.*?)\)", act)[0].split(",")
params = op_params_clean(params) # area, swipe_orient, dist
op = SwipeOpParam(act_name=act_name, area=params[0], swipe_orient=params[1], dist=params[2], last_act=last_act)
elif act_name == ActionOp.GRID.value:
op = GridOpParam(act_name=act_name)
else:
op = BaseOpParam(param_state=RunState.FAIL)
return op
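# Example (illustrative): screenshot_parse_extract_without_grid("swipe", 'swipe(21, "up", "medium")', "None")
# returns SwipeOpParam(act_name="swipe", area=21, swipe_orient="up", dist="medium", last_act="None"), while
# an unrecognized act_name falls through to BaseOpParam(param_state=RunState.FAIL).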
def screenshot_parse_extract_with_grid(act_name: str, act: str, last_act: str) -> Union[BaseGridOpParam, GridOpParam]:
if act_name == ActionOp.TAP.value:
params = re.findall(r"tap\((.*?)\)", act)[0].split(",")
params = op_params_clean(params)
op = TapGridOpParam(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
elif act_name == ActionOp.LONG_PRESS.value:
params = re.findall(r"long_press\((.*?)\)", act)[0].split(",")
params = op_params_clean(params)
op = LongPressGridOpParam(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
elif act_name == ActionOp.SWIPE.value:
params = re.findall(r"swipe\((.*?)\)", act)[0].split(",")
params = op_params_clean(params)
op = SwipeGridOpParam(
act_name=act_name, start_area=params[0], start_subarea=params[1], end_area=params[2], end_subarea=params[3]
)
elif act_name == ActionOp.GRID.value:
op = GridOpParam(act_name=act_name)
else:
op = BaseGridOpParam(param_state=RunState.FAIL)
return op
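# Example (illustrative): in grid mode, 'swipe(21, "center", 25, "right")' parses into
# SwipeGridOpParam(act_name="swipe_grid", start_area=21, start_subarea="center", end_area=25, end_subarea="right");
# the "_grid" suffix on act_name comes from BaseGridOpParam's field validator.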

View file

@ -32,5 +32,20 @@ ### Frontend service startup
Enter `environment/frontend_server` and use `python3 manage.py runserver` to start the front-end service.
Visit `http://localhost:8000/simulator_home` to enter the current simulation interface.
## Appreciation
The reproduction work has referred the `https://github.com/joonspk-research/generative_agents`, let's make a general statement here.
## Acknowledgements
This reproduction work referred to [generative_agents](https://github.com/joonspk-research/generative_agents); we gratefully acknowledge the original authors.
### Citation
```bib
@inproceedings{Park2023GenerativeAgents,
author = {Park, Joon Sung and O'Brien, Joseph C. and Cai, Carrie J. and Morris, Meredith Ringel and Liang, Percy and Bernstein, Michael S.},
title = {Generative Agents: Interactive Simulacra of Human Behavior},
year = {2023},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
booktitle = {Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology (UIST '23)},
keywords = {Human-AI interaction, agents, generative AI, large language models},
location = {San Francisco, CA, USA},
series = {UIST '23}
}
```

View file

@ -31,5 +31,20 @@ ### 前端服务启动
进入`environment/frontend_server`,使用`python3 manage.py runserver`启动前端服务。
访问`http://localhost:8000/simulator_home` 进入当前的仿真界面。
## Appreciation
The reproduction work has referred the `https://github.com/joonspk-research/generative_agents`, let's make a general statement here.
## 致谢
复现工作参考了 [generative_agents](https://github.com/joonspk-research/generative_agents), 感谢相关作者们。
### 引用
```bib
@inproceedings{Park2023GenerativeAgents,
author = {Park, Joon Sung and O'Brien, Joseph C. and Cai, Carrie J. and Morris, Meredith Ringel and Liang, Percy and Bernstein, Michael S.},
title = {Generative Agents: Interactive Simulacra of Human Behavior},
year = {2023},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
booktitle = {Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology (UIST '23)},
keywords = {Human-AI interaction, agents, generative AI, large language models},
location = {San Francisco, CA, USA},
series = {UIST '23}
}
```

View file

@ -119,9 +119,6 @@ class Team(BaseModel):
)
return self.run_project(idea=idea, send_to=send_to)
def _save(self):
logger.info(self.model_dump_json())
@serialize_decorator
async def run(self, n_round=3, idea="", send_to="", auto_archive=True):
"""Run company until target round or no money"""
@ -129,11 +126,10 @@ class Team(BaseModel):
self.run_project(idea=idea, send_to=send_to)
while n_round > 0:
# self._save()
n_round -= 1
logger.debug(f"max {n_round=} left.")
self._check_balance()
await self.env.run()
logger.debug(f"max {n_round=} left.")
self.env.archive(auto_archive)
return self.env.history

View file

@ -39,6 +39,7 @@ extras_require = {
"llama-index-vector-stores-elasticsearch==0.1.6",
"llama-index-vector-stores-chroma==0.1.6",
],
"android_assistant": ["pyshine==0.0.9", "opencv-python==4.6.0.66"],
}
extras_require["test"] = [

View file

@ -0,0 +1,2 @@
!*.png
unitest_Contacts

Binary file not shown (new screenshot added, 611 KiB)

Binary file not shown (new screenshot added, 840 KiB)

View file

@ -0,0 +1,2 @@
tap(9):::android.view.ViewGroup_1067_236_android.widget.TextView_183_204_Apps_2
stop

View file

@ -0,0 +1 @@
Create a contact in Contacts App named zjy with a phone number +86 18831933368

View file

@ -16,8 +16,8 @@ def mock_device_shape_invalid(self, adb_cmd: str) -> str:
return ADB_EXEC_FAIL
def mock_list_devices(self, adb_cmd: str) -> str:
return "devices\nemulator-5554"
def mock_list_devices(self) -> str:
return ["emulator-5554"]
def mock_get_screenshot(self, adb_cmd: str) -> str:
@ -35,6 +35,7 @@ def mock_write_read_operation(self, adb_cmd: str) -> str:
def test_android_ext_env(mocker):
device_id = "emulator-5554"
mocker.patch("metagpt.environment.android.android_ext_env.AndroidExtEnv.execute_adb_with_cmd", mock_device_shape)
mocker.patch("metagpt.environment.android.android_ext_env.AndroidExtEnv.list_devices", mock_list_devices)
ext_env = AndroidExtEnv(device_id=device_id, screenshot_dir="/data2/", xml_dir="/data2/")
assert ext_env.adb_prefix == f"adb -s {device_id} "
@ -48,7 +49,6 @@ def test_android_ext_env(mocker):
)
assert ext_env.device_shape == (0, 0)
mocker.patch("metagpt.environment.android.android_ext_env.AndroidExtEnv.execute_adb_with_cmd", mock_list_devices)
assert ext_env.list_devices() == [device_id]
mocker.patch("metagpt.environment.android.android_ext_env.AndroidExtEnv.execute_adb_with_cmd", mock_get_screenshot)

View file

@ -64,7 +64,7 @@ async def test_ext_env():
_ = await env.write_thru_api(EnvAPIAbstract(api_name="write_api", kwargs={"a": 5, "b": 10}))
assert env.value == 15
with pytest.raises(ValueError):
with pytest.raises(KeyError):
await env.read_from_api("not_exist_api")
assert await env.read_from_api("read_api_no_param") == 15

View file

@ -0,0 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc :

View file

@ -0,0 +1,85 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : tests of actions on an Android emulator. Deprecated now that the role-level test exists.
import asyncio
import time
from pathlib import Path
from metagpt.const import TEST_DATA_PATH
from metagpt.environment.android.android_env import AndroidEnv
from metagpt.ext.android_assistant.actions.manual_record import ManualRecord
from metagpt.ext.android_assistant.actions.parse_record import ParseRecord
from metagpt.ext.android_assistant.actions.screenshot_parse import ScreenshotParse
from metagpt.ext.android_assistant.actions.self_learn_and_reflect import (
SelfLearnAndReflect,
)
TASK_PATH = TEST_DATA_PATH.joinpath("andriod_assistant/unitest_Contacts")
TASK_PATH.mkdir(parents=True, exist_ok=True)
DEMO_NAME = str(time.time())
SELF_EXPLORE_DOC_PATH = TASK_PATH.joinpath("auto_docs")
PARSE_RECORD_DOC_PATH = TASK_PATH.joinpath("demo_docs")
device_id = "emulator-5554"
xml_dir = Path("/sdcard")
screenshot_dir = Path("/sdcard/Pictures/Screenshots")
test_env_self_learn_android = AndroidEnv(
device_id=device_id,
xml_dir=xml_dir,
screenshot_dir=screenshot_dir,
)
test_self_learning = SelfLearnAndReflect()
test_env_manual_learn_android = AndroidEnv(
device_id=device_id,
xml_dir=xml_dir,
screenshot_dir=screenshot_dir,
)
test_manual_record = ManualRecord()
test_manual_parse = ParseRecord()
test_env_screenshot_parse_android = AndroidEnv(
device_id=device_id,
xml_dir=xml_dir,
screenshot_dir=screenshot_dir,
)
test_screenshot_parse = ScreenshotParse()
if __name__ == "__main__":
loop = asyncio.get_event_loop()
test_action_list = [
test_self_learning.run(
round_count=20,
task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
last_act="",
task_dir=TASK_PATH / "demos" / f"self_learning_{DEMO_NAME}",
docs_dir=SELF_EXPLORE_DOC_PATH,
env=test_env_self_learn_android,
),
test_manual_record.run(
task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}",
task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
env=test_env_manual_learn_android,
),
test_manual_parse.run(
task_dir=TASK_PATH / "demos" / f"manual_record_{DEMO_NAME}",  # needs to be modified
docs_dir=PARSE_RECORD_DOC_PATH,  # needs to be modified
env=test_env_manual_learn_android,
),
test_screenshot_parse.run(
round_count=20,
task_desc="Create a contact in Contacts App named zjy with a phone number +86 18831933368 ",
last_act="",
task_dir=TASK_PATH / f"act_{DEMO_NAME}",
docs_dir=PARSE_RECORD_DOC_PATH,
env=test_env_screenshot_parse_android,
grid_on=False,
),
]
loop.run_until_complete(asyncio.gather(*test_action_list))
loop.close()

View file

@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : test case (imgs from appagent's)
import asyncio
from metagpt.actions.action import Action
from metagpt.const import TEST_DATA_PATH
from metagpt.ext.android_assistant.actions.parse_record import ParseRecord
TASK_PATH = TEST_DATA_PATH.joinpath("andriod_assistant/demo_Contacts")
TEST_BEFORE_PATH = TASK_PATH.joinpath("labeled_screenshots/0_labeled.png")
TEST_AFTER_PATH = TASK_PATH.joinpath("labeled_screenshots/1_labeled.png")
RECORD_PATH = TASK_PATH.joinpath("record.txt")
TASK_DESC_PATH = TASK_PATH.joinpath("task_desc.txt")
DOCS_DIR = TASK_PATH.joinpath("storage")
test_action = Action(name="test")
async def manual_learn_test():
parse_record = ParseRecord()
await parse_record.run(app_name="demo_Contacts", task_dir=TASK_PATH, docs_dir=DOCS_DIR)
if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(manual_learn_test())
loop.close()