fix conflict

2026-07-14 16:32:16 +02:00 · 2024-08-09 16:15:00 +08:00 · 2024-08-09 16:15:00 +08:00 · d2bd9055f3
commit d2bd9055f3
parent d98d36fec5 06f8ee05cb
29 changed files with 708 additions and 45 deletions
--- a/metagpt/actions/design_api.py
+++ b/metagpt/actions/design_api.py
@ -83,7 +83,7 @@ class WriteDesign(Action):
            prd_filename (str, optional): The filename of the Product Requirement Document (PRD).
            legacy_design_filename (str, optional): The filename of the legacy design document.
            extra_info (str, optional): Additional information to be included in the system design.
-            output_pathname (str, optional): The output path name of file that the system design should be saved to.
+            output_pathname (str, optional): The output file path of the document.

        Returns:
            str: The file path of the generated system design.
--- a/metagpt/actions/di/execute_nb_code.py
+++ b/metagpt/actions/di/execute_nb_code.py
@ -191,7 +191,7 @@ class ExecuteNbCode(Action):
                output_text = remove_log_and_warning_lines(output_text)
            # The useful information of the exception is at the end,
            # the useful information of normal output is at the begining.
-            if '<!DOCTYPE html>' not in output_text:
+            if "<!DOCTYPE html>" not in output_text:
                output_text = output_text[:keep_len] if is_success else output_text[-keep_len:]

            parsed_output.append(output_text)
@ -286,11 +286,7 @@ class ExecuteNbCode(Action):
 def remove_log_and_warning_lines(input_str: str) -> str:
    delete_lines = ["[warning]", "warning:", "[cv]", "[info]"]
    result = "\n".join(
-        [
-            line
-            for line in input_str.split("\n")
-            if not any(dl in line.lower() for dl in delete_lines)
-        ]
+        [line for line in input_str.split("\n") if not any(dl in line.lower() for dl in delete_lines)]
    ).strip()
    return result

--- a/metagpt/actions/project_management.py
+++ b/metagpt/actions/project_management.py
@ -62,7 +62,7 @@ class WriteTasks(Action):

        Args:
            user_requirement (str, optional): A string specifying the user's requirements. Defaults to an empty string.
-            design_filename (str): The filename of the project system design file. Defaults to an empty string.
+            design_filename (str): The file path of the project system design document. Defaults to an empty string.
            output_pathname (str, optional): The output path name of file that the project schedule should be saved to.
            **kwargs: Additional keyword arguments.

@ -73,8 +73,9 @@ class WriteTasks(Action):
            # Write a project schedule with a given system design.
            >>> design_filename = "/absolute/path/to/snake_game/docs/system_design.json"
            >>> output_pathname = "/absolute/path/to/snake_game/docs/project_schedule.json"
+            >>> user_requirement = "Write project schedule for a snake game following these requirements:..."
            >>> action = WriteTasks()
-            >>> result = await action.run(design_filename=design_filename, output_pathname=output_pathname)
+            >>> result = await action.run(user_requirement=user_requirement, design_filename=design_filename, output_pathname=output_pathname)
            >>> print(result)
            The project schedule is at /absolute/path/to/snake_game/docs/project_schedule.json

--- a/metagpt/actions/project_management_an.py
+++ b/metagpt/actions/project_management_an.py
@ -12,7 +12,7 @@ from metagpt.actions.action_node import ActionNode
 REQUIRED_PYTHON_PACKAGES = ActionNode(
    key="Required Python packages",
    expected_type=List[str],
-    instruction="Provide required Python packages in requirements.txt format.",
+    instruction="Provide required Python packages in requirements.txt format. The response language should correspond to the context and requirements.",
    example=["flask==1.1.2", "bcrypt==3.2.0"],
 )

--- a/metagpt/actions/search_enhanced_qa.py
+++ b/metagpt/actions/search_enhanced_qa.py
@ -69,7 +69,7 @@ class SearchEnhancedQA(Action):
        description="Action to explore the web and provide summaries of articles and webpages.",
    )
    per_page_timeout: float = Field(
-        default=10, description="The maximum time for fetching a single page is in seconds. Defaults to 10s."
+        default=20, description="The maximum time for fetching a single page is in seconds. Defaults to 20s."
    )
    java_script_enabled: bool = Field(
        default=False, description="Whether or not to enable JavaScript in the web browser context. Defaults to False."
--- a/metagpt/actions/write_prd.py
+++ b/metagpt/actions/write_prd.py
@ -98,7 +98,7 @@ class WritePRD(Action):

        Args:
            user_requirement (str): A string detailing the user's requirements.
-            output_pathname (str, optional): The path name of file that the output document should be saved to. Defaults to "".
+            output_pathname (str, optional): The output file path of the document. Defaults to "".
            legacy_prd_filename (str, optional): The file path of the legacy Product Requirement Document to use as a reference. Defaults to "".
            extra_info (str, optional): Additional information to include in the document. Defaults to "".
            **kwargs: Additional keyword arguments.
--- a/metagpt/environment/mgx/mgx_env.py
+++ b/metagpt/environment/mgx/mgx_env.py
@ -124,7 +124,9 @@ class MGXEnv(Environment):
        if converted_msg.role not in ["system", "user", "assistant"]:
            converted_msg.role = "assistant"
        sent_from = converted_msg.metadata[AGENT] if AGENT in converted_msg.metadata else converted_msg.sent_from
-        converted_msg.content = f"from {sent_from} to {converted_msg.send_to}: {converted_msg.content}"
+        converted_msg.content = (
+            f"[Message] from {sent_from if sent_from else 'User'} to {converted_msg.send_to}: {converted_msg.content}"
+        )
        return converted_msg

    def __repr__(self):
--- a/metagpt/prompts/di/role_zero.py
+++ b/metagpt/prompts/di/role_zero.py
@ -70,7 +70,7 @@ Notice: your output JSON data section must start with **```json [**
 """
 THOUGHT_GUIDANCE = """
 First, describe the actions you have taken recently.
-Second, describe the messages you have received recently, with a particular emphasis on messages from users.
+Second, describe the messages you have received recently, with a particular emphasis on messages from users. If necessary, develop a plan to address the new user requirements.
 Third, describe the plan status and the current task. Review the histroy, if `Current Task` has been undertaken and completed by you or anyone, you MUST use the **Plan.finish_current_task** command to finish it first before taking any action, the command will automatically move you to the next task.
 Fourth, describe any necessary human interaction. Use **RoleZero.reply_to_human** to report your progress if you complete a task or the overall requirement, pay attention to the history, DON'T repeat reporting. Use **RoleZero.ask_human** if you failed the current task, unsure of the situation encountered, need any help from human, or executing repetitive commands but receiving repetitive feedbacks without making progress.
 Fifth, describe if you should terminate, you should use **end** command to terminate if any of the following is met:
--- a/metagpt/prompts/di/swe_agent.py
+++ b/metagpt/prompts/di/swe_agent.py
@ -224,6 +224,8 @@ IMPORTANT_TIPS = """
 14. If provided an issue link, you MUST go to the issue page using Browser tool to understand the issue before starting your fix.

 15. When the edit fails, try to enlarge the starting line.
+
+16. Once again, and this is critical: YOU CAN ONLY ENTER ONE COMMAND AT A TIME.
 """

 NEXT_STEP_TEMPLATE = f"""
--- a/metagpt/prompts/di/team_leader.py
+++ b/metagpt/prompts/di/team_leader.py
@ -30,12 +30,13 @@ Note:
 10. Do not use escape characters in json data, particularly within file paths.
 11. Analyze the capabilities of team members and assign tasks to them based on user Requirements. If the requirements ask to ignore certain tasks, follow the requirements.
 12. Add default web technologies: HTML (*.html), CSS (*.css), and JavaScript (*.js) to your requirements.If no specific programming language is required, include these technologies in the project requirements. Using instruction  to forward this information to your team members.
+13. If the user message is a question, use 'reply to human' to respond to the question, and then end.
 """
 TL_THOUGHT_GUIDANCE = (
    THOUGHT_GUIDANCE
    + """
-Sixth, when planning, describe the requirements as they pertain to software development, data analysis, or other areas. If the requirements is a software development and no specific restrictions are mentioned, you must create a Product Requirements Document (PRD), write a System Design document, develop a project schedule, and then begin coding. List the steps you will undertake. Plan these steps in a single response.
-Seventh, describe the technologies you must use.
+Sixth, describe the requirements as they pertain to software development, data analysis, or other areas. If the requirements is a software development and no specific restrictions are mentioned, you must create a Product Requirements Document (PRD), write a System Design document, develop a project schedule, and then begin coding. List the steps you will undertake. Plan these steps in a single response.
+Seventh, describe the technologies you must use.  
 """
 )
 QUICK_THINK_SYSTEM_PROMPT = """
--- a/metagpt/rag/schema.py
+++ b/metagpt/rag/schema.py
@ -1,7 +1,7 @@
 """RAG schemas."""
-
+from enum import Enum
 from pathlib import Path
-from typing import Any, ClassVar, Literal, Optional, Union
+from typing import Any, ClassVar, List, Literal, Optional, Union

 from chromadb.api.types import CollectionMetadata
 from llama_index.core.embeddings import BaseEmbedding
@ -12,6 +12,7 @@ from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator

 from metagpt.config2 import config
 from metagpt.configs.embedding_config import EmbeddingType
+from metagpt.logs import logger
 from metagpt.rag.interface import RAGObject


@ -44,7 +45,13 @@ class FAISSRetrieverConfig(IndexRetrieverConfig):
    @model_validator(mode="after")
    def check_dimensions(self):
        if self.dimensions == 0:
-            self.dimensions = self._embedding_type_to_dimensions.get(config.embedding.api_type, 1536)
+            self.dimensions = config.embedding.dimensions or self._embedding_type_to_dimensions.get(
+                config.embedding.api_type, 1536
+            )
+            if not config.embedding.dimensions and config.embedding.api_type not in self._embedding_type_to_dimensions:
+                logger.warning(
+                    f"You didn't set dimensions in config when using {config.embedding.api_type}, default to 1536"
+                )

        return self

@ -207,3 +214,51 @@ class ObjectNode(TextNode):
        )

        return metadata.model_dump()
+
+
+class OmniParseType(str, Enum):
+    """OmniParseType"""
+
+    PDF = "PDF"
+    DOCUMENT = "DOCUMENT"
+
+
+class ParseResultType(str, Enum):
+    """The result type for the parser."""
+
+    TXT = "text"
+    MD = "markdown"
+    JSON = "json"
+
+
+class OmniParseOptions(BaseModel):
+    """OmniParse Options config"""
+
+    result_type: ParseResultType = Field(default=ParseResultType.MD, description="OmniParse result_type")
+    parse_type: OmniParseType = Field(default=OmniParseType.DOCUMENT, description="OmniParse parse_type")
+    max_timeout: Optional[int] = Field(default=120, description="Maximum timeout for OmniParse service requests")
+    num_workers: int = Field(
+        default=5,
+        gt=0,
+        lt=10,
+        description="Number of concurrent requests for multiple files",
+    )
+
+
+class OminParseImage(BaseModel):
+    image: str = Field(default="", description="image str bytes")
+    image_name: str = Field(default="", description="image name")
+    image_info: Optional[dict] = Field(default={}, description="image info")
+
+
+class OmniParsedResult(BaseModel):
+    markdown: str = Field(default="", description="markdown text")
+    text: str = Field(default="", description="plain text")
+    images: Optional[List[OminParseImage]] = Field(default=[], description="images")
+    metadata: Optional[dict] = Field(default={}, description="metadata")
+
+    @model_validator(mode="before")
+    def set_markdown(cls, values):
+        if not values.get("markdown"):
+            values["markdown"] = values.get("text")
+        return values
--- a/metagpt/roles/di/role_zero.py
+++ b/metagpt/roles/di/role_zero.py
@ -383,10 +383,10 @@ class RoleZero(Role):
            tool_output = await tool_obj(**cmd["args"])
            if len(tool_output) <= 10:
                command_output += (
-                    f"\n[command]: {cmd['args']['cmd']} \n [command output] : {tool_output} (pay attention to this.)"
+                    f"\n[command]: {cmd['args']['cmd']} \n[command output] : {tool_output} (pay attention to this.)"
                )
            else:
-                command_output += f"\n[command]: {cmd['args']['cmd']} \n [command output] : {tool_output}"
+                command_output += f"\n[command]: {cmd['args']['cmd']} \n[command output] : {tool_output}"
        return command_output

    def _get_plan_status(self) -> Tuple[str, str]:
--- a/metagpt/strategy/task_type.py
+++ b/metagpt/strategy/task_type.py
@ -8,7 +8,8 @@ from metagpt.prompts.task_type import (
    FEATURE_ENGINEERING_PROMPT,
    IMAGE2WEBPAGE_PROMPT,
    MODEL_EVALUATE_PROMPT,
-    MODEL_TRAIN_PROMPT, WEB_SCRAPING_PROMPT,
+    MODEL_TRAIN_PROMPT,
+    WEB_SCRAPING_PROMPT,
 )


--- a/metagpt/tools/libs/init.py
+++ b/metagpt/tools/libs/init.py
@ -17,7 +17,7 @@ from metagpt.tools.libs import (
    deployer,
    git,
 )
-from metagpt.tools.libs.env import get_env, set_get_env_entry, default_get_env, get_env_description
+from metagpt.tools.libs.env import get_env, set_get_env_entry, default_get_env, get_env_description, get_env_default

 _ = (
    data_preprocess,
@ -32,6 +32,7 @@ _ = (
    deployer,
    git,
    get_env,
+    get_env_default,
    get_env_description,
    set_get_env_entry,
    default_get_env,
--- a/metagpt/tools/libs/cr.py
+++ b/metagpt/tools/libs/cr.py
@ -85,7 +85,7 @@ class CodeReview:
                    if pre:
                        patch_file_content = pre.text
        else:
-            async with aiofiles.open(patch_path) as f:
+            async with aiofiles.open(patch_path, encoding="utf-8") as f:
                patch_file_content = await f.read()
                await EditorReporter().async_report(patch_path)

--- a/metagpt/tools/libs/editor.py
+++ b/metagpt/tools/libs/editor.py
@ -1,10 +1,17 @@
+import base64
 import os
 import shutil
 import subprocess
+from pathlib import Path
+from typing import List, Optional, Union

 from pydantic import BaseModel, ConfigDict

+from metagpt.logs import logger
 from metagpt.tools.tool_registry import register_tool
+from metagpt.utils import read_docx
+from metagpt.utils.common import aread_bin, awrite_bin, run_coroutine_sync
+from metagpt.utils.repo_to_markdown import is_text_file
 from metagpt.utils.report import EditorReporter


@ -39,12 +46,26 @@ class Editor(BaseModel):

    def read(self, path: str) -> FileBlock:
        """Read the whole content of a file. Using absolute paths as the argument for specifying the file location."""
-        with open(path, "r") as f:
-            self.resource.report(path, "path")
-            lines = f.readlines()
+        is_text, mime_type = run_coroutine_sync(is_text_file, path)
+        if is_text:
+            lines = self._read_text(path)
+        elif mime_type == "application/pdf":
+            lines = self._read_pdf(path)
+        elif mime_type in {
+            "application/msword",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/vnd.ms-word.document.macroEnabled.12",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+            "application/vnd.ms-word.template.macroEnabled.12",
+        }:
+            lines = self._read_docx(path)
+        else:
+            return FileBlock(file_path=str(path), block_content="")
+        self.resource.report(str(path), "path")
+
        lines_with_num = [f"{i + 1:03}|{line}" for i, line in enumerate(lines)]
        result = FileBlock(
-            file_path=path,
+            file_path=str(path),
            block_content="".join(lines_with_num),
        )
        return result
@ -195,3 +216,63 @@ class Editor(BaseModel):
        lint_passed = result.returncode == 0
        lint_message = result.stdout
        return lint_passed, lint_message
+
+    @staticmethod
+    def _read_text(path: Union[str, Path]) -> List[str]:
+        with open(str(path), "r") as f:
+            lines = f.readlines()
+        return lines
+
+    @staticmethod
+    def _read_pdf(path: Union[str, Path]) -> List[str]:
+        result = run_coroutine_sync(Editor._omniparse_read_file, path)
+        if result:
+            return result
+
+        from llama_index.readers.file import PDFReader
+
+        reader = PDFReader()
+        lines = reader.load_data(file=Path(path))
+        return [i.text for i in lines]
+
+    @staticmethod
+    def _read_docx(path: Union[str, Path]) -> List[str]:
+        result = run_coroutine_sync(Editor._omniparse_read_file, path)
+        if result:
+            return result
+        return read_docx(str(path))
+
+    @staticmethod
+    async def _omniparse_read_file(path: Union[str, Path]) -> Optional[List[str]]:
+        from metagpt.tools.libs import get_env_default
+        from metagpt.utils.omniparse_client import OmniParseClient
+
+        base_url = await get_env_default(key="base_url", app_name="OmniParse", default_value="")
+        if not base_url:
+            return None
+        api_key = await get_env_default(key="api_key", app_name="OmniParse", default_value="")
+        v = await get_env_default(key="timeout", app_name="OmniParse", default_value="120")
+        try:
+            timeout = int(v) or 120
+        except ValueError:
+            timeout = 120
+
+        try:
+            client = OmniParseClient(api_key=api_key, base_url=base_url, max_timeout=timeout)
+            file_data = await aread_bin(filename=path)
+            ret = await client.parse_document(file_input=file_data, bytes_filename=str(path))
+        except (ValueError, Exception) as e:
+            logger.exception(f"{path}: {e}")
+            return None
+        if not ret.images:
+            return [ret.text] if ret.text else None
+
+        result = [ret.text]
+        img_dir = Path(path).parent / (Path(path).name.replace(".", "_") + "_images")
+        img_dir.mkdir(parents=True, exist_ok=True)
+        for i in ret.images:
+            byte_data = base64.b64decode(i.image)
+            filename = img_dir / i.image_name
+            await awrite_bin(filename=filename, data=byte_data)
+            result.append(f"![{i.image_name}]({str(filename)})")
+        return result
--- a/metagpt/tools/libs/env.py
+++ b/metagpt/tools/libs/env.py
@ -7,7 +7,7 @@
@Desc: Implement `get_env`. RFC 216 2.4.2.4.2.
 """
 import os
-from typing import Dict
+from typing import Dict, Optional


 class EnvKeyNotFoundError(Exception):
@ -15,14 +15,26 @@ class EnvKeyNotFoundError(Exception):
        super().__init__(info)


+def to_app_key(key: str, app_name: str = None) -> str:
+    return f"{app_name}-{key}" if app_name else key
+
+
+def split_app_key(app_key: str) -> (str, str):
+    if "-" not in app_key:
+        return "", app_key
+    app_name, key = app_key.split("-", 1)
+    return app_name, key
+
+
 async def default_get_env(key: str, app_name: str = None) -> str:
-    if key in os.environ:
-        return os.environ[key]
+    app_key = to_app_key(key=key, app_name=app_name)
+    if app_key in os.environ:
+        return os.environ[app_key]

    from metagpt.context import Context

    context = Context()
-    val = context.kwargs.get(key, None)
+    val = context.kwargs.get(app_key, None)
    if val is not None:
        return val

@ -32,14 +44,16 @@ async def default_get_env(key: str, app_name: str = None) -> str:
 async def default_get_env_description() -> Dict[str, str]:
    result = {}
    for k in os.environ.keys():
-        call = f'await get_env(key="{k}", app_name="")'
+        app_name, key = split_app_key(k)
+        call = f'await get_env(key="{key}", app_name="{app_name}")'
        result[call] = f"Return the value of environment variable `{k}`."

    from metagpt.context import Context

    context = Context()
    for k in context.kwargs.__dict__.keys():
-        call = f'await get_env(key="{k}", app_name="")'
+        app_name, key = split_app_key(k)
+        call = f'await get_env(key="{key}", app_name="{app_name}")'
        result[call] = f"Get the value of environment variable `{k}`."
    return result

@ -84,6 +98,37 @@ async def get_env(key: str, app_name: str = None) -> str:
    return await default_get_env(key=key, app_name=app_name)


+async def get_env_default(key: str, app_name: str = None, default_value: str = None) -> Optional[str]:
+    """
+    Retrieves the value for the specified environment variable key. If the key is not found,
+    returns the default value.
+
+    Args:
+        key (str): The name of the environment variable to retrieve.
+        app_name (str, optional): The name of the application or component to associate with the environment variable.
+        default_value (str, optional): The default value to return if the environment variable is not found.
+
+    Returns:
+        str or None: The value of the environment variable if found, otherwise the default value.
+
+    Example:
+        >>> from metagpt.tools.libs.env import get_env_default
+        >>> api_key = await get_env_default(key="NOT_EXISTS_API_KEY", default_value="<API_KEY>")
+        >>> print(api_key)
+        <API_KEY>
+
+        >>> from metagpt.tools.libs.env import get_env_default
+        >>> api_key = await get_env_default(key="NOT_EXISTS_API_KEY", app_name="GITHUB", default_value="<API_KEY>")
+        >>> print(api_key)
+        <API_KEY>
+
+    """
+    try:
+        return await get_env(key=key, app_name=app_name)
+    except EnvKeyNotFoundError:
+        return default_value
+
+
 async def get_env_description() -> Dict[str, str]:
    global _get_env_description_entry

--- a/metagpt/tools/libs/terminal.py
+++ b/metagpt/tools/libs/terminal.py
@ -1,4 +1,5 @@
 import asyncio
+import os
 from asyncio import Queue
 from asyncio.subprocess import PIPE, STDOUT
 from typing import Optional
@ -28,7 +29,7 @@ class Terminal:
    async def _start_process(self):
        # Start a persistent shell process
        self.process = await asyncio.create_subprocess_exec(
-            *self.shell_command, stdin=PIPE, stdout=PIPE, stderr=STDOUT, executable="bash"
+            *self.shell_command, stdin=PIPE, stdout=PIPE, stderr=STDOUT, executable="bash", env=os.environ.copy()
        )
        await self._check_state()

@ -150,6 +151,7 @@ class Bash(Terminal):

    def __init__(self):
        """init"""
+        os.environ["SWE_CMD_WORK_DIR"] = str(DEFAULT_WORKSPACE_ROOT)
        super().__init__()
        self.start_flag = False

--- a/metagpt/tools/swe_agent_commands/setup_default.sh
+++ b/metagpt/tools/swe_agent_commands/setup_default.sh
@ -16,4 +16,4 @@ source $REPO_ROOT_DIR/metagpt/tools/swe_agent_commands/defaults.sh
 source $REPO_ROOT_DIR/metagpt/tools/swe_agent_commands/search.sh
 source $REPO_ROOT_DIR/metagpt/tools/swe_agent_commands/edit_linting.sh

-export SWE_CMD_WORK_DIR="$REPO_ROOT_DIR/workspace/swe_agent_workdir"
+echo "SWE_CMD_WORK_DIR: $SWE_CMD_WORK_DIR"
--- a/metagpt/utils/common.py
+++ b/metagpt/utils/common.py
@ -12,6 +12,7 @@
 from __future__ import annotations

 import ast
+import asyncio
 import base64
 import contextlib
 import csv
@ -870,7 +871,10 @@ async def get_mime_type(filename: str | Path, force_read: bool = False) -> str:
    }

    try:
-        stdout, _, _ = await shell_execute(f"file --mime-type {str(filename)}")
+        stdout, stderr, _ = await shell_execute(f"file --mime-type {str(filename)}")
+        if stderr:
+            logger.debug(f"file:{filename}, error:{stderr}")
+            return guess_mime_type
        ix = stdout.rfind(" ")
        mime_type = stdout[ix:].strip()
        if mime_type == "text/plain" and guess_mime_type in text_set:
@ -1068,6 +1072,32 @@ def tool2name(cls, methods: List[str], entry) -> Dict[str, Any]:
    return mappings


+def run_coroutine_sync(coroutine, *args, **kwargs):
+    """
+    Runs a coroutine function synchronously by encapsulating its invocation as a non-coroutine function call.
+
+    Args:
+        coroutine: The coroutine function to be encapsulated.
+        *args: Positional arguments to be passed to the coroutine.
+        **kwargs: Keyword arguments to be passed to the coroutine.
+
+    Returns:
+        The return value of the coroutine.
+    """
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:  # No running event loop
+        loop = None
+
+    if loop and loop.is_running():
+        # The event loop is already running
+        future = asyncio.run_coroutine_threadsafe(coroutine(*args, **kwargs), loop)
+        return future.result()
+    else:
+        # The event loop is not running
+        return asyncio.run(coroutine(*args, **kwargs))
+
+
 def log_time(method):
    """A time-consuming decorator for printing execution duration."""

--- a/metagpt/utils/omniparse_client.py
+++ b/metagpt/utils/omniparse_client.py
@ -0,0 +1,238 @@
+import mimetypes
+from pathlib import Path
+from typing import Union
+
+import httpx
+
+from metagpt.rag.schema import OmniParsedResult
+from metagpt.utils.common import aread_bin
+
+
+class OmniParseClient:
+    """
+    OmniParse Server Client
+    This client interacts with the OmniParse server to parse different types of media, documents.
+
+    OmniParse API Documentation: https://docs.cognitivelab.in/api
+
+    Attributes:
+        ALLOWED_DOCUMENT_EXTENSIONS (set): A set of supported document file extensions.
+        ALLOWED_AUDIO_EXTENSIONS (set): A set of supported audio file extensions.
+        ALLOWED_VIDEO_EXTENSIONS (set): A set of supported video file extensions.
+    """
+
+    ALLOWED_DOCUMENT_EXTENSIONS = {".pdf", ".ppt", ".pptx", ".doc", ".docx"}
+    ALLOWED_AUDIO_EXTENSIONS = {".mp3", ".wav", ".aac"}
+    ALLOWED_VIDEO_EXTENSIONS = {".mp4", ".mkv", ".avi", ".mov"}
+
+    def __init__(self, api_key: str = None, base_url: str = "http://localhost:8000", max_timeout: int = 120):
+        """
+        Args:
+            api_key: Default None, can be used for authentication later.
+            base_url: Base URL for the API.
+            max_timeout: Maximum request timeout in seconds.
+        """
+        self.api_key = api_key
+        self.base_url = base_url
+        self.max_timeout = max_timeout
+
+        self.parse_media_endpoint = "/parse_media"
+        self.parse_website_endpoint = "/parse_website"
+        self.parse_document_endpoint = "/parse_document"
+
+    async def _request_parse(
+        self,
+        endpoint: str,
+        method: str = "POST",
+        files: dict = None,
+        params: dict = None,
+        data: dict = None,
+        json: dict = None,
+        headers: dict = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Request OmniParse API to parse a document.
+
+        Args:
+            endpoint (str): API endpoint.
+            method (str, optional): HTTP method to use. Default is "POST".
+            files (dict, optional): Files to include in the request.
+            params (dict, optional): Query string parameters.
+            data (dict, optional): Form data to include in the request body.
+            json (dict, optional): JSON data to include in the request body.
+            headers (dict, optional): HTTP headers to include in the request.
+            **kwargs: Additional keyword arguments for httpx.AsyncClient.request()
+
+        Returns:
+            dict: JSON response data.
+        """
+        url = f"{self.base_url}{endpoint}"
+        method = method.upper()
+        headers = headers or {}
+        _headers = {"Authorization": f"Bearer {self.api_key}"} if self.api_key else {}
+        headers.update(**_headers)
+        async with httpx.AsyncClient() as client:
+            response = await client.request(
+                url=url,
+                method=method,
+                files=files,
+                params=params,
+                json=json,
+                data=data,
+                headers=headers,
+                timeout=self.max_timeout,
+                **kwargs,
+            )
+            response.raise_for_status()
+            return response.json()
+
+    async def parse_document(self, file_input: Union[str, bytes, Path], bytes_filename: str = None) -> OmniParsedResult:
+        """
+        Parse document-type data (supports ".pdf", ".ppt", ".pptx", ".doc", ".docx").
+
+        Args:
+            file_input: File path or file byte data.
+            bytes_filename: Filename for byte data, useful for determining MIME type for the HTTP request.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+            OmniParsedResult: The result of the document parsing.
+        """
+        self.verify_file_ext(file_input, self.ALLOWED_DOCUMENT_EXTENSIONS, bytes_filename)
+        file_info = await self.get_file_info(file_input, bytes_filename)
+        resp = await self._request_parse(self.parse_document_endpoint, files={"file": file_info})
+        data = OmniParsedResult(**resp)
+        return data
+
+    async def parse_pdf(self, file_input: Union[str, bytes, Path]) -> OmniParsedResult:
+        """
+        Parse pdf document.
+
+        Args:
+            file_input: File path or file byte data.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+            OmniParsedResult: The result of the pdf parsing.
+        """
+        self.verify_file_ext(file_input, {".pdf"})
+        # parse_pdf supports parsing by accepting only the byte data of the file.
+        file_info = await self.get_file_info(file_input, only_bytes=True)
+        endpoint = f"{self.parse_document_endpoint}/pdf"
+        resp = await self._request_parse(endpoint=endpoint, files={"file": file_info})
+        data = OmniParsedResult(**resp)
+        return data
+
+    async def parse_video(self, file_input: Union[str, bytes, Path], bytes_filename: str = None) -> dict:
+        """
+        Parse video-type data (supports ".mp4", ".mkv", ".avi", ".mov").
+
+        Args:
+            file_input: File path or file byte data.
+            bytes_filename: Filename for byte data, useful for determining MIME type for the HTTP request.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+            dict: JSON response data.
+        """
+        self.verify_file_ext(file_input, self.ALLOWED_VIDEO_EXTENSIONS, bytes_filename)
+        file_info = await self.get_file_info(file_input, bytes_filename)
+        return await self._request_parse(f"{self.parse_media_endpoint}/video", files={"file": file_info})
+
+    async def parse_audio(self, file_input: Union[str, bytes, Path], bytes_filename: str = None) -> dict:
+        """
+        Parse audio-type data (supports ".mp3", ".wav", ".aac").
+
+        Args:
+            file_input: File path or file byte data.
+            bytes_filename: Filename for byte data, useful for determining MIME type for the HTTP request.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+            dict: JSON response data.
+        """
+        self.verify_file_ext(file_input, self.ALLOWED_AUDIO_EXTENSIONS, bytes_filename)
+        file_info = await self.get_file_info(file_input, bytes_filename)
+        return await self._request_parse(f"{self.parse_media_endpoint}/audio", files={"file": file_info})
+
+    @staticmethod
+    def verify_file_ext(file_input: Union[str, bytes, Path], allowed_file_extensions: set, bytes_filename: str = None):
+        """
+        Verify the file extension.
+
+        Args:
+            file_input: File path or file byte data.
+            allowed_file_extensions: Set of allowed file extensions.
+            bytes_filename: Filename to use for verification when `file_input` is byte data.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+        """
+        verify_file_path = None
+        if isinstance(file_input, (str, Path)):
+            verify_file_path = str(file_input)
+        elif isinstance(file_input, bytes) and bytes_filename:
+            verify_file_path = bytes_filename
+
+        if not verify_file_path:
+            # Do not verify if only byte data is provided
+            return
+
+        file_ext = Path(verify_file_path).suffix
+        if file_ext not in allowed_file_extensions:
+            raise ValueError(f"Not allowed {file_ext} File extension must be one of {allowed_file_extensions}")
+
+    @staticmethod
+    async def get_file_info(
+        file_input: Union[str, bytes, Path],
+        bytes_filename: str = None,
+        only_bytes: bool = False,
+    ) -> Union[bytes, tuple]:
+        """
+        Get file information.
+
+        Args:
+            file_input: File path or file byte data.
+            bytes_filename: Filename to use when uploading byte data, useful for determining MIME type.
+            only_bytes: Whether to return only byte data. Default is False, which returns a tuple.
+
+        Raises:
+            ValueError: If bytes_filename is not provided when file_input is bytes or if file_input is not a valid type.
+
+        Notes:
+            Since `parse_document`,`parse_video`, `parse_audio` supports parsing various file types,
+            the MIME type of the file must be specified when uploading.
+
+        Returns: [bytes, tuple]
+            Returns bytes if only_bytes is True, otherwise returns a tuple (filename, file_bytes, mime_type).
+        """
+        if isinstance(file_input, (str, Path)):
+            filename = Path(file_input).name
+            file_bytes = await aread_bin(file_input)
+
+            if only_bytes:
+                return file_bytes
+
+            mime_type = mimetypes.guess_type(file_input)[0]
+            return filename, file_bytes, mime_type
+        elif isinstance(file_input, bytes):
+            if only_bytes:
+                return file_input
+            if not bytes_filename:
+                raise ValueError("bytes_filename must be set when passing bytes")
+
+            mime_type = mimetypes.guess_type(bytes_filename)[0]
+            return bytes_filename, file_input, mime_type
+        else:
+            raise ValueError("file_input must be a string (file path) or bytes.")
--- a/metagpt/utils/repo_to_markdown.py
+++ b/metagpt/utils/repo_to_markdown.py
@ -7,7 +7,7 @@ from __future__ import annotations

 import re
 from pathlib import Path
-from typing import Tuple
+from typing import Tuple, Union

 from gitignore_parser import parse_gitignore

@ -82,7 +82,7 @@ async def _write_files(repo_path, gitignore_rules=None) -> str:


 async def _write_file(filename: Path, repo_path: Path) -> str:
-    is_text, mime_type = await _is_text_file(filename)
+    is_text, mime_type = await is_text_file(filename)
    if not is_text:
        logger.info(f"Ignore content: {filename}")
        return ""
@ -100,7 +100,17 @@ async def _write_file(filename: Path, repo_path: Path) -> str:
        return ""


-async def _is_text_file(filename: Path) -> Tuple[bool, str]:
+async def is_text_file(filename: Union[str, Path]) -> Tuple[bool, str]:
+    """
+    Determines if the specified file is a text file based on its MIME type.
+
+    Args:
+        filename (Union[str, Path]): The path to the file.
+
+    Returns:
+        Tuple[bool, str]: A tuple where the first element indicates if the file is a text file
+        (True for text file, False otherwise), and the second element is the MIME type of the file.
+    """
    pass_set = {
        "application/json",
        "application/vnd.chipnuts.karaoke-mmd",
@ -129,7 +139,7 @@ async def _is_text_file(filename: Path) -> Tuple[bool, str]:
        "image/vnd.microsoft.icon",
        "video/mp4",
    }
-    mime_type = await get_mime_type(filename, force_read=True)
+    mime_type = await get_mime_type(Path(filename), force_read=True)
    v = "text/" in mime_type or mime_type in pass_set
    if v:
        return True, mime_type