diff --git a/metagpt/environment/android/android_ext_env.py b/metagpt/environment/android/android_ext_env.py
index 826846e4f..78f27923f 100644
--- a/metagpt/environment/android/android_ext_env.py
+++ b/metagpt/environment/android/android_ext_env.py
@@ -1,13 +1,19 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # @Desc   : The Android external environment to integrate with Android apps
-
 import subprocess
+import clip
+import time
 from pathlib import Path
 from typing import Any, Optional
 
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+from PIL import Image
 from pydantic import Field
 
+from metagpt.environment.android.text_icon_localization import *
 from metagpt.environment.android.const import ADB_EXEC_FAIL
 from metagpt.environment.android.env_space import (
     EnvAction,
@@ -17,6 +23,20 @@ from metagpt.environment.android.env_space import (
     EnvObsValType,
 )
 from metagpt.environment.base_env import ExtEnv, mark_as_readable, mark_as_writeable
+from metagpt.logs import logger
+from metagpt.utils.common import download_model
+from metagpt.const import DEFAULT_WORKSPACE_ROOT
+
+
+def load_cv_model(device: str = "cpu") -> Any:
+    ocr_detection = pipeline(Tasks.ocr_detection, model="damo/cv_resnet18_ocr-detection-line-level_damo")
+    ocr_recognition = pipeline(Tasks.ocr_recognition,
+                               model="damo/cv_convnextTiny_ocr-recognition-document_damo")
+    file_url = "https://huggingface.co/ShilongLiu/GroundingDINO/blob/main/groundingdino_swint_ogc.pth"
+    target_folder = Path(f"{DEFAULT_WORKSPACE_ROOT}/weights")
+    file_path = download_model(file_url, target_folder)
+    groundingdino_model = load_model(file_path, device=device).eval()
+    return ocr_detection, ocr_recognition, groundingdino_model
 
 
 class AndroidExtEnv(ExtEnv):
@@ -25,10 +45,14 @@ class AndroidExtEnv(ExtEnv):
     xml_dir: Optional[Path] = Field(default=None)
     width: int = Field(default=720, description="device screen width")
     height: int = Field(default=1080, description="device screen height")
+    ocr_detection: Any = Field(default=None, description="ocr detection model")
+    ocr_recognition: Any = Field(default=None, description="ocr recognition model")
+    groundingdino_model: Any = Field(default=None, description="clip groundingdino model")
 
     def __init__(self, **data: Any):
         super().__init__(**data)
         device_id = data.get("device_id")
+        self.ocr_detection, self.ocr_recognition, self.groundingdino_model = load_cv_model()
         if device_id:
             devices = self.list_devices()
             if device_id not in devices:
@@ -36,15 +60,14 @@ class AndroidExtEnv(ExtEnv):
             (width, height) = self.device_shape
             self.width = data.get("width", width)
             self.height = data.get("height", height)
-
             self.create_device_path(self.screenshot_dir)
             self.create_device_path(self.xml_dir)
 
     def reset(
-        self,
-        *,
-        seed: Optional[int] = None,
-        options: Optional[dict[str, Any]] = None,
+            self,
+            *,
+            seed: Optional[int] = None,
+            options: Optional[dict[str, Any]] = None,
     ) -> tuple[dict[str, Any], dict[str, Any]]:
         super().reset(seed=seed, options=options)
 
@@ -149,14 +172,26 @@ class AndroidExtEnv(ExtEnv):
         ss_remote_path = Path(self.screenshot_dir).joinpath(f"{ss_name}.png")
         ss_cmd = f"{self.adb_prefix_shell} screencap -p {ss_remote_path}"
         ss_res = self.execute_adb_with_cmd(ss_cmd)
-
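+        # Give the device a brief moment to finish writing the screenshot before it is pulled.
+        # The 0.1 s delay below is an empirical value, not an adb guarantee.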
+        time.sleep(0.1)
         res = ADB_EXEC_FAIL
         if ss_res != ADB_EXEC_FAIL:
             ss_local_path = Path(local_save_dir).joinpath(f"{ss_name}.png")
             pull_cmd = f"{self.adb_prefix} pull {ss_remote_path} {ss_local_path}"
             pull_res = self.execute_adb_with_cmd(pull_cmd)
+            time.sleep(0.1)
             if pull_res != ADB_EXEC_FAIL:
                 res = ss_local_path
+        else:
+            ss_cmd = f"{self.adb_prefix_shell} rm /sdcard/{ss_name}.png"
+            ss_res = self.execute_adb_with_cmd(ss_cmd)
+            time.sleep(0.1)
+            ss_cmd = f"{self.adb_prefix_shell} screencap -p /sdcard/{ss_name}.png"
+            ss_res = self.execute_adb_with_cmd(ss_cmd)
+            time.sleep(0.1)
+            ss_cmd = f"{self.adb_prefix} pull /sdcard/{ss_name}.png {self.screenshot_dir}"
+            ss_res = self.execute_adb_with_cmd(ss_cmd)
+            image_path = Path(f"{self.screenshot_dir}/{ss_name}.png")
+            res = image_path
         return Path(res)
 
     @mark_as_readable
@@ -224,7 +259,94 @@ class AndroidExtEnv(ExtEnv):
         return swipe_res
 
     @mark_as_writeable
-    def user_swipe_to(self, start: tuple[int, int], end: tuple[int, int], duration: int = 400):
+    def user_swipe_to(self, start: tuple[int, int], end: tuple[int, int], duration: int = 400) -> str:
         adb_cmd = f"{self.adb_prefix_si} swipe {start[0]} {start[1]} {end[0]} {end[1]} {duration}"
         swipe_res = self.execute_adb_with_cmd(adb_cmd)
         return swipe_res
+
+    @mark_as_writeable
+    def user_exit(self) -> str:
+        adb_cmd = f"{self.adb_prefix_shell} am start -a android.intent.action.MAIN -c android.intent.category.HOME"
+        exit_res = self.execute_adb_with_cmd(adb_cmd)
+        return exit_res
+
+    def _ocr_text(self, text: str) -> list:
+        image = self.get_screenshot("screenshot", self.screenshot_dir)
+        iw, ih = Image.open(image).size
+        x, y = self.device_shape
+        if iw > ih:
+            x, y = y, x
+            iw, ih = ih, iw
+        in_coordinate, out_coordinate = ocr(image, text, self.ocr_detection, self.ocr_recognition, iw, ih)
+        output_list = [in_coordinate, out_coordinate, x, y, iw, ih, image]
+        return output_list
+
+    @mark_as_writeable
+    def user_open_app(self, app_name: str) -> str:
+        ocr_result = self._ocr_text(app_name)
+        in_coordinate, out_coordinate, x, y, iw, ih = (
+            ocr_result[0], ocr_result[1], ocr_result[2], ocr_result[3], ocr_result[4], ocr_result[5])
+        if len(in_coordinate) == 0:
+            logger.info(f"No App named {app_name}.")
+            return "no app here"
+        else:
+            tap_coordinate = [
+                (in_coordinate[0][0] + in_coordinate[0][2]) / 2,
+                (in_coordinate[0][1] + in_coordinate[0][3]) / 2,
+            ]
+            tap_coordinate = [round(tap_coordinate[0] / iw, 2), round(tap_coordinate[1] / ih, 2)]
+            return self.system_tap(tap_coordinate[0] * x, (tap_coordinate[1] - round(50 / y, 2)) * y)
+
+    @mark_as_writeable
+    def user_click_text(self, text: str) -> str:
+        ocr_result = self._ocr_text(text)
+        in_coordinate, out_coordinate, x, y, iw, ih, image = (
+            ocr_result[0], ocr_result[1], ocr_result[2], ocr_result[3], ocr_result[4], ocr_result[5], ocr_result[6])
+        if len(out_coordinate) == 0:
+            logger.info(
+                f'Failed to execute action click text ({text}). The text "{text}" is not detected in the screenshot.')
+            return "no text detected"
+        elif len(out_coordinate) == 1:
+            tap_coordinate = [(in_coordinate[0][0] + in_coordinate[0][2]) / 2,
+                              (in_coordinate[0][1] + in_coordinate[0][3]) / 2]
+            tap_coordinate = [round(tap_coordinate[0] / iw, 2), round(tap_coordinate[1] / ih, 2)]
+            return self.system_tap(tap_coordinate[0] * x, tap_coordinate[1] * y)
+        else:
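+            # More than one OCR match for the target text: tapping any single one would be
+            # ambiguous, so the action is reported as failed instead of guessing.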
+            logger.info(
+                f'Failed to execute action click text ({text}). There are too many occurrences of "{text}" in the screenshot.')
+            return "multiple occurrences of text"
+
+    @mark_as_writeable
+    def user_stop(self):
+        logger.info("Successful execution of tasks")
+
+    @mark_as_writeable
+    def user_click_icon(self, icon_shape_color: str) -> str:
+        screenshot_path = self.get_screenshot("screenshot", self.screenshot_dir)
+        image = screenshot_path
+        iw, ih = Image.open(image).size
+        x, y = self.device_shape
+        if iw > ih:
+            x, y = y, x
+            iw, ih = ih, iw
+        in_coordinate, out_coordinate = det(image, "icon", self.groundingdino_model)  # detect icon candidates
+        if len(out_coordinate) == 1:  # only one icon detected
+            tap_coordinate = [(in_coordinate[0][0] + in_coordinate[0][2]) / 2,
+                              (in_coordinate[0][1] + in_coordinate[0][3]) / 2]
+            tap_coordinate = [round(tap_coordinate[0] / iw, 2), round(tap_coordinate[1] / ih, 2)]
+            return self.system_tap(tap_coordinate[0] * x, tap_coordinate[1] * y)
+        else:
+            temp_file = Path(f"{DEFAULT_WORKSPACE_ROOT}/temp")
+            temp_file.mkdir(parents=True, exist_ok=True)
+            hash_table, clip_filter = [], []
+            for i, (td, box) in enumerate(zip(in_coordinate, out_coordinate)):
+                if crop_for_clip(image, td, i, temp_file):
+                    hash_table.append(td)
+                    crop_image = f"{i}.png"
+                    clip_filter.append(temp_file.joinpath(crop_image))
+            # NOTE: the original code referenced an undefined `device` variable here; CPU is assumed for CLIP inference.
+            clip_model, clip_preprocess = clip.load("ViT-B/32", device="cpu")
+            clip_filter = clip_for_icon(clip_model, clip_preprocess, clip_filter, icon_shape_color)
+            final_box = hash_table[clip_filter]
+            tap_coordinate = [(final_box[0] + final_box[2]) / 2, (final_box[1] + final_box[3]) / 2]
+            tap_coordinate = [round(tap_coordinate[0] / iw, 2), round(tap_coordinate[1] / ih, 2)]
+            logger.debug(f"tap position: ({tap_coordinate[0] * x}, {tap_coordinate[1] * y})")
+            return self.system_tap(tap_coordinate[0] * x, tap_coordinate[1] * y)
diff --git a/metagpt/environment/android/grounding_dino_config.py b/metagpt/environment/android/grounding_dino_config.py
new file mode 100644
index 000000000..9158d5f62
--- /dev/null
+++ b/metagpt/environment/android/grounding_dino_config.py
@@ -0,0 +1,43 @@
+batch_size = 1
+modelname = "groundingdino"
+backbone = "swin_T_224_1k"
+position_embedding = "sine"
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+two_stage_type = "standard"
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+transformer_activation = "relu"
+dec_pred_bbox_embed_share = True
+dn_box_noise_scale = 1.0
+dn_label_noise_ratio = 0.5
+dn_label_coef = 1.0
+dn_bbox_coef = 1.0
+embed_init_tgt = True
+dn_labelbook_size = 2000
+max_text_len = 256
+text_encoder_type = "bert-base-uncased"
+use_text_enhancer = True
+use_fusion_layer = True
+use_checkpoint = True
+use_transformer_ckpt = True
+use_text_cross_attention = True
+text_dropout = 0.0
+fusion_dropout = 0.0
+fusion_droppath = 0.1
+sub_sentence_present = True
diff --git a/metagpt/environment/android/text_icon_localization.py b/metagpt/environment/android/text_icon_localization.py
new file mode 100644
index 000000000..60d62ed03
--- /dev/null
+++ b/metagpt/environment/android/text_icon_localization.py
@@ -0,0 +1,363 @@
+# The code in this file is modified from MobileAgent
+# https://github.com/X-PLUG/MobileAgent.git
+
+import math
+import clip
+import cv2
+import numpy as np
+import torch
+import subprocess
+import time
+from pathlib import Path
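+
+# GroundingDINO is used below for open-vocabulary icon detection; its boxes are then ranked with CLIP.
+# Both dependencies are installed via the `android_assistant` extra in setup.py.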
+import groundingdino.datasets.transforms as T
+from groundingdino.models import build_model
+from groundingdino.util.slconfig import SLConfig
+from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
+from PIL import Image, ImageDraw
+
+
+################################## text_localization using ocr #######################
+
+def crop_image(img: any, position: any) -> any:
+    def distance(x1, y1, x2, y2):
+        return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))
+
+    position = position.tolist()
+    for i in range(4):
+        for j in range(i + 1, 4):
+            if position[i][0] > position[j][0]:
+                tmp = position[j]
+                position[j] = position[i]
+                position[i] = tmp
+    if position[0][1] > position[1][1]:
+        tmp = position[0]
+        position[0] = position[1]
+        position[1] = tmp
+
+    if position[2][1] > position[3][1]:
+        tmp = position[2]
+        position[2] = position[3]
+        position[3] = tmp
+
+    x1, y1 = position[0][0], position[0][1]
+    x2, y2 = position[2][0], position[2][1]
+    x3, y3 = position[3][0], position[3][1]
+    x4, y4 = position[1][0], position[1][1]
+
+    corners = np.zeros((4, 2), np.float32)
+    corners[0] = [x1, y1]
+    corners[1] = [x2, y2]
+    corners[2] = [x4, y4]
+    corners[3] = [x3, y3]
+
+    img_width = distance((x1 + x4) / 2, (y1 + y4) / 2, (x2 + x3) / 2, (y2 + y3) / 2)
+    img_height = distance((x1 + x2) / 2, (y1 + y2) / 2, (x4 + x3) / 2, (y4 + y3) / 2)
+
+    corners_trans = np.zeros((4, 2), np.float32)
+    corners_trans[0] = [0, 0]
+    corners_trans[1] = [img_width - 1, 0]
+    corners_trans[2] = [0, img_height - 1]
+    corners_trans[3] = [img_width - 1, img_height - 1]
+
+    transform = cv2.getPerspectiveTransform(corners, corners_trans)
+    dst = cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
+    return dst
+
+
+def calculate_size(box: any) -> any:
+    return (box[2] - box[0]) * (box[3] - box[1])
+
+
+def order_point(cooperation: any) -> any:
+    arr = np.array(cooperation).reshape([4, 2])
+    sum_ = np.sum(arr, 0)
+    centroid = sum_ / arr.shape[0]
+    theta = np.arctan2(arr[:, 1] - centroid[1], arr[:, 0] - centroid[0])
+    sort_points = arr[np.argsort(theta)]
+    sort_points = sort_points.reshape([4, -1])
+    if sort_points[0][0] > centroid[0]:
+        sort_points = np.concatenate([sort_points[3:], sort_points[:3]])
+    sort_points = sort_points.reshape([4, 2]).astype("float32")
+    return sort_points
+
+
+def longest_common_substring_length(str1: str, str2: str) -> int:
+    # note: despite the name, this DP computes the longest common *subsequence* length
+    m = len(str1)
+    n = len(str2)
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            if str1[i - 1] == str2[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+            else:
+                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+    return dp[m][n]
+
+
+def ocr(image_path: Path, prompt: str, ocr_detection: any, ocr_recognition: any, x: int, y: int) -> any:
+    text_data = []
+    coordinate = []
+    image = Image.open(image_path)
+    iw, ih = image.size
+
+    image_full = cv2.imread(str(image_path))
+    det_result = ocr_detection(image_full)
+    det_result = det_result["polygons"]
+    for i in range(det_result.shape[0]):
+        pts = order_point(det_result[i])
+        image_crop = crop_image(image_full, pts)
+        result = ocr_recognition(image_crop)["text"][0]
+
+        if result == prompt:
+            box = [int(e) for e in list(pts.reshape(-1))]
+            box = [box[0], box[1], box[4], box[5]]
+
+            if calculate_size(box) > 0.05 * iw * ih:
+                continue
+
+            text_data.append(
+                [
+                    int(max(0, box[0] - 10) * x / iw),
+                    int(max(0, box[1] - 10) * y / ih),
+                    int(min(box[2] + 10, iw) * x / iw),
+                    int(min(box[3] + 10, ih) * y / ih),
+                ]
+            )
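+            # Also keep a wider padded box around each exact match (±300 px horizontally,
+            # ±400 px vertically, scaled to device coordinates); callers use these "out"
+            # boxes, e.g. to count how many candidate matches are on screen.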
+            coordinate.append(
+                [
+                    int(max(0, box[0] - 300) * x / iw),
+                    int(max(0, box[1] - 400) * y / ih),
+                    int(min(box[2] + 300, iw) * x / iw),
+                    int(min(box[3] + 400, ih) * y / ih),
+                ]
+            )
+
+    max_length = 0
+    if len(text_data) == 0:
+        for i in range(det_result.shape[0]):
+            pts = order_point(det_result[i])
+            image_crop = crop_image(image_full, pts)
+            result = ocr_recognition(image_crop)["text"][0]
+
+            if len(result) < 0.3 * len(prompt):
+                continue
+
+            if result in prompt:
+                now_length = len(result)
+            else:
+                now_length = longest_common_substring_length(result, prompt)
+
+            if now_length > max_length:
+                max_length = now_length
+                box = [int(e) for e in list(pts.reshape(-1))]
+                box = [box[0], box[1], box[4], box[5]]
+
+                text_data = [
+                    [
+                        int(max(0, box[0] - 10) * x / iw),
+                        int(max(0, box[1] - 10) * y / ih),
+                        int(min(box[2] + 10, iw) * x / iw),
+                        int(min(box[3] + 10, ih) * y / ih),
+                    ]
+                ]
+                coordinate = [
+                    [
+                        int(max(0, box[0] - 300) * x / iw),
+                        int(max(0, box[1] - 400) * y / ih),
+                        int(min(box[2] + 300, iw) * x / iw),
+                        int(min(box[3] + 400, ih) * y / ih),
+                    ]
+                ]
+
+        if len(prompt) <= 10:
+            if max_length >= 0.8 * len(prompt):
+                return text_data, coordinate
+            else:
+                return [], []
+        elif (len(prompt) > 10) and (len(prompt) <= 20):
+            if max_length >= 0.5 * len(prompt):
+                return text_data, coordinate
+            else:
+                return [], []
+        else:
+            if max_length >= 0.4 * len(prompt):
+                return text_data, coordinate
+            else:
+                return [], []
+
+    else:
+        return text_data, coordinate
+
+
+################################## icon_localization using clip #######################
+
+
+def calculate_iou(box1: list, box2: list) -> float:
+    x_a = max(box1[0], box2[0])
+    y_a = max(box1[1], box2[1])
+    x_b = min(box1[2], box2[2])
+    y_b = min(box1[3], box2[3])
+
+    inter_area = max(0, x_b - x_a) * max(0, y_b - y_a)
+    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+    union_area = box1_area + box2_area - inter_area
+    iou = inter_area / union_area
+
+    return iou
+
+
+def in_box(box: list, target: list) -> bool:
+    if (box[0] > target[0]) and (box[1] > target[1]) and (box[2] < target[2]) and (box[3] < target[3]):
+        return True
+    else:
+        return False
+
+
+def crop_for_clip(image: any, box: any, i: int, temp_file: Path) -> bool:
+    image = Image.open(image)
+    w, h = image.size
+    bound = [0, 0, w, h]
+    if in_box(box, bound):
+        cropped_image = image.crop(box)
+        cropped_image.save(temp_file.joinpath(f"{i}.png"))
+        return True
+    else:
+        return False
+
+
+def clip_for_icon(clip_model: any, clip_preprocess: any, images: any, prompt: str) -> any:
+    image_features = []
+    for image_file in images:
+        image = clip_preprocess(Image.open(image_file)).unsqueeze(0).to(next(clip_model.parameters()).device)
+        image_feature = clip_model.encode_image(image)
+        image_features.append(image_feature)
+    image_features = torch.cat(image_features)
+
+    text = clip.tokenize([prompt]).to(next(clip_model.parameters()).device)
+    text_features = clip_model.encode_text(text)
+
+    image_features /= image_features.norm(dim=-1, keepdim=True)
+    text_features /= text_features.norm(dim=-1, keepdim=True)
+    similarity = (100.0 * image_features @ text_features.T).softmax(dim=0).squeeze(0)
+    _, max_pos = torch.max(similarity, dim=0)
+    pos = max_pos.item()
+
+    return pos
+
+
+def transform_image(image_pil: any) -> any:
+    transform = T.Compose(
+        [
+            T.RandomResize([800], max_size=1333),
+            T.ToTensor(),
+            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+        ]
+    )
+    image, _ = transform(image_pil, None)  # 3, h, w
+    return image
+
+
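+# GroundingDINO loading follows the upstream MobileAgent implementation: the model is built from the
+# local grounding_dino_config.py and the released checkpoint is loaded with strict=False.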
+def load_model(model_checkpoint_path: Path, device: str) -> any:
+    # resolve the config that ships next to this module instead of relying on the current working directory
+    model_config_path = str(Path(__file__).parent / "grounding_dino_config.py")
+    args = SLConfig.fromfile(model_config_path)
+    args.device = device
+    model = build_model(args)
+    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
+    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
+    print(load_res)
+    _ = model.eval()
+    return model
+
+
+def get_grounding_output(model: any, image: any, caption: str, box_threshold: any, text_threshold: any, with_logits: bool = True) -> any:
+    caption = caption.lower()
+    caption = caption.strip()
+    if not caption.endswith("."):
+        caption = caption + "."
+
+    with torch.no_grad():
+        outputs = model(image[None], captions=[caption])
+    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
+    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
+
+    logits_filt = logits.clone()
+    boxes_filt = boxes.clone()
+    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
+    logits_filt = logits_filt[filt_mask]  # num_filt, 256
+    boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
+
+    tokenlizer = model.tokenizer
+    tokenized = tokenlizer(caption)
+
+    pred_phrases = []
+    scores = []
+    for logit, box in zip(logits_filt, boxes_filt):
+        pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
+        if with_logits:
+            pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
+        else:
+            pred_phrases.append(pred_phrase)
+        scores.append(logit.max().item())
+
+    return boxes_filt, torch.Tensor(scores), pred_phrases
+
+
+def remove_boxes(boxes_filt: any, size: any, iou_threshold: float = 0.5) -> any:
+    boxes_to_remove = set()
+
+    for i in range(len(boxes_filt)):
+        if calculate_size(boxes_filt[i]) > 0.05 * size[0] * size[1]:
+            boxes_to_remove.add(i)
+        for j in range(len(boxes_filt)):
+            if calculate_size(boxes_filt[j]) > 0.05 * size[0] * size[1]:
+                boxes_to_remove.add(j)
+            if i == j:
+                continue
+            if i in boxes_to_remove or j in boxes_to_remove:
+                continue
+            iou = calculate_iou(boxes_filt[i], boxes_filt[j])
+            if iou >= iou_threshold:
+                boxes_to_remove.add(j)
+
+    boxes_filt = [box for idx, box in enumerate(boxes_filt) if idx not in boxes_to_remove]
+
+    return boxes_filt
+
+
+def det(input_image: any, text_prompt: str, groundingdino_model: any, box_threshold: float = 0.05, text_threshold: float = 0.5) -> any:
+    image = Image.open(input_image)
+    size = image.size
+
+    image_pil = image.convert("RGB")
+    image = np.array(image_pil)
+
+    transformed_image = transform_image(image_pil)
+    boxes_filt, scores, pred_phrases = get_grounding_output(
+        groundingdino_model, transformed_image, text_prompt, box_threshold, text_threshold
+    )
+
+    H, W = size[1], size[0]
+    for i in range(boxes_filt.size(0)):
+        boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
+        boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
+        boxes_filt[i][2:] += boxes_filt[i][:2]
+
+    boxes_filt = boxes_filt.cpu().int().tolist()
+    filtered_boxes = remove_boxes(boxes_filt, size)  # [:9]
+    coordinate = []
+    image_data = []
+    for box in filtered_boxes:
+        image_data.append(
+            [max(0, box[0] - 10), max(0, box[1] - 10), min(box[2] + 10, size[0]), min(box[3] + 10, size[1])]
+        )
+        coordinate.append(
+            [max(0, box[0] - 25), max(0, box[1] - 25), min(box[2] + 25, size[0]), min(box[3] + 25, size[1])]
+        )
+
+    return image_data, coordinate
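+
+
+# Rough usage sketch (names are illustrative, not part of this module's API):
+#   boxes, padded_boxes = det("screenshot.png", "icon", groundingdino_model)
+#   best_index = clip_for_icon(clip_model, clip_preprocess, cropped_icon_paths, "blue settings icon")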
diff --git a/metagpt/utils/common.py b/metagpt/utils/common.py
index 0876b85ad..982e6921b 100644
--- a/metagpt/utils/common.py
+++ b/metagpt/utils/common.py
@@ -219,7 +219,7 @@ class OutputParser:
 
         if start_index != -1 and end_index != -1:
             # Extract the structure part
-            structure_text = text[start_index : end_index + 1]
+            structure_text = text[start_index: end_index + 1]
             try:
                 # Attempt to convert the text to a Python data type using ast.literal_eval
@@ -841,3 +841,21 @@ def get_markdown_codeblock_type(filename: str) -> str:
         "application/sql": "sql",
     }
     return mappings.get(mime_type, "text")
+
+
+def download_model(file_url: str, target_folder: Path) -> Path:
+    file_name = file_url.split('/')[-1]
+    file_path = target_folder.joinpath(f"{file_name}")
+    if not file_path.exists():
+        target_folder.mkdir(parents=True, exist_ok=True)  # make sure the download folder exists
+        try:
+            response = requests.get(file_url, stream=True)
+            response.raise_for_status()  # check whether the request succeeded
+            # save the file in chunks
+            with open(file_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            logger.info(f'The weight file has been downloaded and saved to {file_path}')
+        except requests.exceptions.HTTPError as err:
+            logger.error(f'An error occurred while downloading the weight file: {err}')
+    return file_path
diff --git a/setup.py b/setup.py
index f8247fa42..43c043720 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,30 @@ extras_require = {
         "llama-index-postprocessor-flag-embedding-reranker==0.1.2",
         "docx2txt==0.8",
     ],
-    "android_assistant": ["pyshine==0.0.9", "opencv-python==4.6.0.66"],
+    "android_assistant": [
+        "pyshine==0.0.9",
+        "opencv-python==4.6.0.66",
+        "protobuf<3.20,>=3.9.2",
+        "modelscope",
+        "tensorflow==2.9.1; os_name == 'linux'",
+        "tensorflow==2.9.1; os_name == 'win32'",
+        "tensorflow-macos==2.9; os_name == 'darwin'",
+        "keras==2.9.0",
+        "torch",
+        "torchvision",
+        "transformers",
+        "opencv-python",
+        "matplotlib",
+        "pycocotools",
+        "SentencePiece",
+        "tf_slim",
+        "tf_keras",
+        "pyclipper",
+        "shapely",
+        "groundingdino-py",
+        "datasets==2.18.0",
+        "clip-openai",
+    ],
 }
 
 extras_require["test"] = [
@@ -96,4 +119,5 @@ setup(
         ],
     },
     include_package_data=True,
+
 )