mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-21 14:05:17 +02:00
update android assistant
This commit is contained in:
parent
e3e49eee03
commit
a9ec18af72
7 changed files with 76 additions and 66 deletions
|
|
@ -161,4 +161,6 @@ class ManualRecord(Action):
|
|||
return AndroidActionOutput(action_state=RunState.FAIL)
|
||||
record_file.write(log_str)
|
||||
|
||||
time.sleep(3)
|
||||
time.sleep(1)
|
||||
|
||||
return AndroidActionOutput(action_state=RunState.SUCCESS)
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ class ParseRecord(Action):
|
|||
screenshot_before_path: Path = ""
|
||||
screenshot_after_path: Path = ""
|
||||
|
||||
async def run(self, app_name: str, task_dir: Path, docs_dir: Path):
|
||||
async def run(self, task_dir: Path, docs_dir: Path):
|
||||
doc_count = 0
|
||||
self.record_path = Path(task_dir) / "record.txt"
|
||||
self.task_desc_path = Path(task_dir) / "task_desc.txt"
|
||||
|
|
@ -112,7 +112,7 @@ class ParseRecord(Action):
|
|||
)
|
||||
if "error" in node.content:
|
||||
return AndroidActionOutput(action_state=RunState.FAIL)
|
||||
log_path = task_dir.joinpath(f"log_{app_name}.txt")
|
||||
log_path = task_dir.joinpath("log_parse_record.txt")
|
||||
prompt = node.compile(context=context, schema="json", mode="auto")
|
||||
msg = node.content
|
||||
doc_content[action_type] = msg
|
||||
|
|
@ -132,3 +132,5 @@ class ParseRecord(Action):
|
|||
logger.info(f"Documentation generated and saved to {doc_path}")
|
||||
|
||||
logger.info(f"Documentation generation phase completed. {doc_count} docs generated.")
|
||||
|
||||
return AndroidActionOutput(action_state=RunState.FINISH)
|
||||
|
|
|
|||
|
|
@ -25,16 +25,16 @@ from metagpt.ext.android_assistant.prompts.assistant_prompt import (
|
|||
from metagpt.ext.android_assistant.utils.schema import (
|
||||
AndroidActionOutput,
|
||||
AndroidElement,
|
||||
GridOp,
|
||||
LongPressGridOp,
|
||||
LongPressOp,
|
||||
GridOpParam,
|
||||
LongPressGridOpParam,
|
||||
LongPressOpParam,
|
||||
OpLogItem,
|
||||
RunState,
|
||||
SwipeGridOp,
|
||||
SwipeOp_3,
|
||||
TapGridOp,
|
||||
TapOp,
|
||||
TextOp,
|
||||
SwipeGridOpParam,
|
||||
SwipeOpParam,
|
||||
TapGridOpParam,
|
||||
TapOpParam,
|
||||
TextOpParam,
|
||||
)
|
||||
from metagpt.ext.android_assistant.utils.utils import (
|
||||
area_to_xy,
|
||||
|
|
@ -109,7 +109,6 @@ class ScreenshotParse(Action):
|
|||
xml_path: Path = env.observe(
|
||||
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
|
||||
)
|
||||
width, height = env.device_shape
|
||||
if not screenshot_path.exists() or not xml_path.exists():
|
||||
return AndroidActionOutput(action_state=RunState.FAIL)
|
||||
|
||||
|
|
@ -153,33 +152,34 @@ class ScreenshotParse(Action):
|
|||
|
||||
op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on)
|
||||
if op_param.param_state == RunState.FINISH:
|
||||
logger.info(f"op_param: {op_param}")
|
||||
return AndroidActionOutput(action_state=RunState.FINISH)
|
||||
if op_param.param_state == RunState.FAIL:
|
||||
return AndroidActionOutput(action_state=RunState.FAIL)
|
||||
|
||||
if isinstance(op_param, TapOp):
|
||||
if isinstance(op_param, TapOpParam):
|
||||
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
|
||||
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
|
||||
elif isinstance(op_param, TextOp):
|
||||
elif isinstance(op_param, TextOpParam):
|
||||
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
|
||||
elif isinstance(op_param, LongPressOp):
|
||||
elif isinstance(op_param, LongPressOpParam):
|
||||
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
|
||||
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
|
||||
elif isinstance(op_param, SwipeOp_3):
|
||||
elif isinstance(op_param, SwipeOpParam):
|
||||
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
|
||||
action = EnvAction(
|
||||
action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
|
||||
)
|
||||
elif isinstance(op_param, GridOp):
|
||||
elif isinstance(op_param, GridOpParam):
|
||||
grid_on = True
|
||||
elif isinstance(op_param, TapGridOp) or isinstance(op_param, LongPressGridOp):
|
||||
elif isinstance(op_param, TapGridOpParam) or isinstance(op_param, LongPressGridOpParam):
|
||||
x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols)
|
||||
if isinstance(op_param, TapGridOp):
|
||||
if isinstance(op_param, TapGridOpParam):
|
||||
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
|
||||
else:
|
||||
# LongPressGridOp
|
||||
# LongPressGridOpParam
|
||||
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
|
||||
elif isinstance(op_param, SwipeGridOp):
|
||||
elif isinstance(op_param, SwipeGridOpParam):
|
||||
start_x, start_y = area_to_xy(
|
||||
op_param.start_area, op_param.start_subarea, env.width, env.height, env.rows, env.cols
|
||||
)
|
||||
|
|
@ -190,12 +190,13 @@ class ScreenshotParse(Action):
|
|||
action_type=EnvActionType.USER_SWIPE_TO, coord=(start_x, start_y), tgt_coord=(end_x, end_y)
|
||||
)
|
||||
|
||||
obs, _, _, _, info = env.step(action)
|
||||
action_res = info["res"]
|
||||
if action_res == ADB_EXEC_FAIL:
|
||||
return AndroidActionOutput(action_state=RunState.FAIL)
|
||||
if not grid_on:
|
||||
obs, _, _, _, info = env.step(action)
|
||||
action_res = info["res"]
|
||||
if action_res == ADB_EXEC_FAIL:
|
||||
return AndroidActionOutput(action_state=RunState.FAIL)
|
||||
|
||||
if op_param.act_name != "grid":
|
||||
grid_on = True
|
||||
grid_on = False
|
||||
|
||||
return AndroidActionOutput(data={"grid_on": grid_on})
|
||||
|
|
|
|||
|
|
@ -33,14 +33,14 @@ from metagpt.ext.android_assistant.utils.schema import (
|
|||
AndroidElement,
|
||||
Decision,
|
||||
DocContent,
|
||||
LongPressOp,
|
||||
LongPressOpParam,
|
||||
OpLogItem,
|
||||
ReflectLogItem,
|
||||
RunState,
|
||||
SwipeOp,
|
||||
SwipeOp_3,
|
||||
TapOp,
|
||||
TextOp,
|
||||
SwipeOpParam,
|
||||
TapOpParam,
|
||||
TextOpParam,
|
||||
)
|
||||
from metagpt.ext.android_assistant.utils.utils import (
|
||||
draw_bbox_multi,
|
||||
|
|
@ -71,6 +71,9 @@ class SelfLearnAndReflect(Action):
|
|||
for path in [task_dir, docs_dir]:
|
||||
path.mkdir(parents=True, exist_ok=True)
|
||||
resp = await self.run_self_learn(round_count, task_desc, last_act, task_dir, env)
|
||||
if resp.action_state != RunState.SUCCESS:
|
||||
return resp
|
||||
|
||||
resp = await self.run_reflect(round_count, task_desc, last_act, task_dir, docs_dir, env)
|
||||
return resp
|
||||
|
||||
|
|
@ -111,17 +114,17 @@ class SelfLearnAndReflect(Action):
|
|||
if op_param.param_state == RunState.FAIL:
|
||||
return AndroidActionOutput(action_state=RunState.FAIL)
|
||||
|
||||
if isinstance(op_param, TapOp):
|
||||
if isinstance(op_param, TapOpParam):
|
||||
self.ui_area = op_param.area
|
||||
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
|
||||
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
|
||||
elif isinstance(op_param, TextOp):
|
||||
elif isinstance(op_param, TextOpParam):
|
||||
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
|
||||
elif isinstance(op_param, LongPressOp):
|
||||
elif isinstance(op_param, LongPressOpParam):
|
||||
self.ui_area = op_param.area
|
||||
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
|
||||
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
|
||||
elif isinstance(op_param, SwipeOp_3):
|
||||
elif isinstance(op_param, SwipeOpParam):
|
||||
self.ui_area = op_param.area
|
||||
self.swipe_orient = op_param.swipe_orient
|
||||
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
|
||||
|
|
|
|||
|
|
@ -92,9 +92,10 @@ class AndroidAssistant(Role):
|
|||
"""ignore old memory to make it run multi rounds inside a role"""
|
||||
newest_msgs = self.rc.memory.get(k=1)
|
||||
newest_msg = newest_msgs[0] if newest_msgs else None
|
||||
if newest_msg and (RunState.FAIL.value in newest_msg.content):
|
||||
if newest_msg and (RunState.SUCCESS.value.upper() not in newest_msg.content):
|
||||
ignore_memory = False
|
||||
logger.error("Latest action_state is FAIL, won't react in remainder rounds")
|
||||
state_val = newest_msg.content.split(".")[-1] # RoundCount: 1, action_state: RunState.SUCCESS
|
||||
logger.warning(f"Latest action_state is {state_val}, will run in the remainder rounds without `react`")
|
||||
return await super()._observe(ignore_memory)
|
||||
|
||||
async def _act(self) -> Message:
|
||||
|
|
@ -104,7 +105,6 @@ class AndroidAssistant(Role):
|
|||
resp = await todo.run(task_dir=self.task_dir, task_desc=self.task_desc, env=self.rc.env)
|
||||
elif isinstance(todo, ParseRecord):
|
||||
resp = await todo.run(
|
||||
app_name=config.get_other("app_name", "demo"),
|
||||
task_dir=self.task_dir,
|
||||
docs_dir=self.docs_dir,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -96,26 +96,26 @@ class BaseOpParam(BaseModel):
|
|||
param_state: RunState = Field(default=RunState.SUCCESS, description="return state when extract params")
|
||||
|
||||
|
||||
class TapOp(BaseOpParam):
|
||||
class TapOpParam(BaseOpParam):
|
||||
area: int = Field(default=-1)
|
||||
|
||||
|
||||
class TextOp(BaseOpParam):
|
||||
class TextOpParam(BaseOpParam):
|
||||
input_str: str = Field(default="")
|
||||
|
||||
|
||||
class LongPressOp(BaseOpParam):
|
||||
class LongPressOpParam(BaseOpParam):
|
||||
area: int = Field(default=-1)
|
||||
|
||||
|
||||
# Modify This SwipeOp to SwipeOp_3, Need better name
|
||||
class SwipeOp_3(BaseOpParam):
|
||||
# Renamed from SwipeOp to SwipeOpParam; a better name is still needed
|
||||
class SwipeOpParam(BaseOpParam):
|
||||
area: int = Field(default=-1)
|
||||
swipe_orient: str = Field(default="up")
|
||||
dist: str = Field(default="")
|
||||
|
||||
|
||||
class GridOp(BaseModel):
|
||||
class GridOpParam(BaseOpParam):
|
||||
act_name: str = Field(default="")
|
||||
|
||||
|
||||
|
|
@ -126,17 +126,17 @@ class BaseGridOpParam(BaseOpParam):
|
|||
return f"{act_name}_grid"
|
||||
|
||||
|
||||
class TapGridOp(BaseGridOpParam):
|
||||
class TapGridOpParam(BaseGridOpParam):
|
||||
area: int = Field(default=-1)
|
||||
subarea: str = Field(default="")
|
||||
|
||||
|
||||
class LongPressGridOp(BaseGridOpParam):
|
||||
class LongPressGridOpParam(BaseGridOpParam):
|
||||
area: int = Field(default=-1)
|
||||
subarea: str = Field(default="")
|
||||
|
||||
|
||||
class SwipeGridOp(BaseGridOpParam):
|
||||
class SwipeGridOpParam(BaseGridOpParam):
|
||||
start_area: int = Field(default=-1)
|
||||
start_subarea: str = Field(default="")
|
||||
end_area: int = Field(default=-1)
|
||||
|
|
|
|||
|
|
@ -16,16 +16,16 @@ from metagpt.ext.android_assistant.utils.schema import (
|
|||
BaseGridOpParam,
|
||||
BaseOpParam,
|
||||
Decision,
|
||||
GridOp,
|
||||
LongPressGridOp,
|
||||
LongPressOp,
|
||||
GridOpParam,
|
||||
LongPressGridOpParam,
|
||||
LongPressOpParam,
|
||||
ReflectOp,
|
||||
RunState,
|
||||
SwipeGridOp,
|
||||
SwipeOp_3,
|
||||
TapGridOp,
|
||||
TapOp,
|
||||
TextOp,
|
||||
SwipeGridOpParam,
|
||||
SwipeOpParam,
|
||||
TapGridOpParam,
|
||||
TapOpParam,
|
||||
TextOpParam,
|
||||
)
|
||||
from metagpt.logs import logger
|
||||
|
||||
|
|
@ -260,7 +260,9 @@ def reflect_parse_extarct(parsed_json: dict) -> ReflectOp:
|
|||
return op
|
||||
|
||||
|
||||
def screenshot_parse_extract(parsed_json: dict, grid_on: bool = False) -> Union[BaseOpParam, BaseGridOpParam, GridOp]:
|
||||
def screenshot_parse_extract(
|
||||
parsed_json: dict, grid_on: bool = False
|
||||
) -> Union[BaseOpParam, BaseGridOpParam, GridOpParam]:
|
||||
act = parsed_json.get("Action")
|
||||
last_act = parsed_json.get("Summary")
|
||||
act_name = act.split("(")[0]
|
||||
|
|
@ -284,44 +286,44 @@ def op_params_clean(params: list[str]) -> list[Union[int, str]]:
|
|||
return param_values
|
||||
|
||||
|
||||
def screenshot_parse_extract_without_grid(act_name: str, act: str, last_act: str) -> Union[BaseOpParam, GridOp]:
|
||||
def screenshot_parse_extract_without_grid(act_name: str, act: str, last_act: str) -> Union[BaseOpParam, GridOpParam]:
|
||||
if act_name == ActionOp.TAP.value:
|
||||
area = int(re.findall(r"tap\((.*?)\)", act)[0])
|
||||
op = TapOp(act_name=act_name, area=area, last_act=last_act)
|
||||
op = TapOpParam(act_name=act_name, area=area, last_act=last_act)
|
||||
elif act_name == ActionOp.TEXT.value:
|
||||
input_str = re.findall(r"text\((.*?)\)", act)[0][1:-1]
|
||||
op = TextOp(act_name=act_name, input_str=input_str, last_act=last_act)
|
||||
op = TextOpParam(act_name=act_name, input_str=input_str, last_act=last_act)
|
||||
elif act_name == ActionOp.LONG_PRESS.value:
|
||||
area = int(re.findall(r"long_press\((.*?)\)", act)[0])
|
||||
op = LongPressOp(act_name=act_name, area=area, last_act=last_act)
|
||||
op = LongPressOpParam(act_name=act_name, area=area, last_act=last_act)
|
||||
elif act_name == ActionOp.SWIPE.value:
|
||||
params = re.findall(r"swipe\((.*?)\)", act)[0].split(",")
|
||||
params = op_params_clean(params) # area, swipe_orient, dist
|
||||
op = SwipeOp_3(act_name=act_name, area=params[0], swipe_orient=params[1], dist=params[2], last_act=last_act)
|
||||
op = SwipeOpParam(act_name=act_name, area=params[0], swipe_orient=params[1], dist=params[2], last_act=last_act)
|
||||
elif act_name == ActionOp.GRID.value:
|
||||
op = GridOp(act_name=act_name)
|
||||
op = GridOpParam(act_name=act_name)
|
||||
else:
|
||||
op = BaseOpParam(param_state=RunState.FAIL)
|
||||
return op
|
||||
|
||||
|
||||
def screenshot_parse_extract_with_grid(act_name: str, act: str, last_act: str) -> Union[BaseGridOpParam, GridOp]:
|
||||
def screenshot_parse_extract_with_grid(act_name: str, act: str, last_act: str) -> Union[BaseGridOpParam, GridOpParam]:
|
||||
if act_name == ActionOp.TAP.value:
|
||||
params = re.findall(r"tap\((.*?)\)", act)[0].split(",")
|
||||
params = op_params_clean(params)
|
||||
op = TapGridOp(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
|
||||
op = TapGridOpParam(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
|
||||
elif act_name == ActionOp.LONG_PRESS.value:
|
||||
params = re.findall(r"long_press\((.*?)\)", act)[0].split(",")
|
||||
params = op_params_clean(params)
|
||||
op = LongPressGridOp(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
|
||||
op = LongPressGridOpParam(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
|
||||
elif act_name == ActionOp.SWIPE.value:
|
||||
params = re.findall(r"swipe\((.*?)\)", act)[0].split(",")
|
||||
params = op_params_clean(params)
|
||||
op = SwipeGridOp(
|
||||
op = SwipeGridOpParam(
|
||||
act_name=act_name, start_area=params[0], start_subarea=params[1], end_area=params[2], end_subarea=params[3]
|
||||
)
|
||||
elif act_name == ActionOp.GRID.value:
|
||||
op = GridOp(act_name=act_name)
|
||||
op = GridOpParam(act_name=act_name)
|
||||
else:
|
||||
op = BaseGridOpParam(param_state=RunState.FAIL)
|
||||
return op
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue