Merge branch 'mgx_ops' into di_mgx

This commit is contained in:
yzlin 2024-03-30 16:45:21 +08:00
commit 3e10d34468
304 changed files with 10747 additions and 662 deletions

View file

@ -0,0 +1,22 @@
import asyncio
import threading
from typing import Any
def run_coroutine_in_new_loop(coroutine) -> Any:
    """Execute *coroutine* on a dedicated event loop running in a helper thread.

    Useful for calling async code from synchronous code that is itself running
    inside an active event loop, where a plain ``loop.run_until_complete`` would
    fail with ``RuntimeError: This event loop is already running``.
    """
    loop = asyncio.new_event_loop()
    worker = threading.Thread(target=loop.run_forever)
    worker.start()
    pending = asyncio.run_coroutine_threadsafe(coroutine, loop)
    try:
        # Block until the coroutine finishes and propagate its result/exception.
        return pending.result()
    finally:
        # Always tear down the helper loop and thread, even on failure.
        loop.call_soon_threadsafe(loop.stop)
        worker.join()
        loop.close()

View file

@ -18,6 +18,7 @@ import csv
import importlib
import inspect
import json
import mimetypes
import os
import platform
import re
@ -29,6 +30,7 @@ from typing import Any, Callable, List, Literal, Tuple, Union
from urllib.parse import quote, unquote
import aiofiles
import chardet
import loguru
import requests
from PIL import Image
@ -663,14 +665,21 @@ def role_raise_decorator(func):
@handle_exception
async def aread(filename: str | Path, encoding="utf-8") -> str:
    """Read a text file asynchronously.

    Tries the given encoding first; on a ``UnicodeDecodeError`` re-reads the
    raw bytes and decodes with the charset detected by ``chardet``.

    Args:
        filename: Path of the file to read.
        encoding: Encoding tried first. Defaults to "utf-8".

    Returns:
        The decoded file content.
    """
    try:
        async with aiofiles.open(str(filename), mode="r", encoding=encoding) as reader:
            content = await reader.read()
    except UnicodeDecodeError:
        # Fallback: read raw bytes and guess the real encoding instead of failing.
        async with aiofiles.open(str(filename), mode="rb") as reader:
            raw = await reader.read()
        result = chardet.detect(raw)
        detected_encoding = result["encoding"]
        content = raw.decode(detected_encoding)
    return content
async def awrite(filename: str | Path, data: str, encoding=None):
async def awrite(filename: str | Path, data: str, encoding="utf-8"):
"""Write file asynchronously."""
pathname = Path(filename)
pathname.parent.mkdir(parents=True, exist_ok=True)
@ -765,7 +774,7 @@ def is_coroutine_func(func: Callable) -> bool:
def load_mc_skills_code(skill_names: list[str] = None, skills_dir: Path = None) -> list[str]:
"""load mincraft skill from js files"""
"""load minecraft skill from js files"""
if not skills_dir:
skills_dir = Path(__file__).parent.absolute()
if skill_names is None:
@ -802,29 +811,6 @@ def decode_image(img_url_or_b64: str) -> Image:
return img
def process_message(messages: Union[str, Message, list[dict], list[Message], list[str]]) -> list[dict]:
    """Convert supported message inputs into a list of ``{"role", "content"}`` dicts.

    Args:
        messages: A single message (str, dict, or Message) or a list of them.

    Returns:
        A list of dicts with exactly the keys "role" and "content".

    Raises:
        ValueError: If an element is not a str, dict, or Message, or if a dict
            does not have exactly the keys "role" and "content".
    """
    from metagpt.schema import Message

    # Normalize a single message into a list.
    if not isinstance(messages, list):
        messages = [messages]

    processed_messages = []
    for msg in messages:
        if isinstance(msg, str):
            processed_messages.append({"role": "user", "content": msg})
        elif isinstance(msg, dict):
            # Explicit check instead of `assert`: asserts vanish under `python -O`.
            if set(msg.keys()) != {"role", "content"}:
                raise ValueError(f"Dict message must have exactly 'role' and 'content' keys, got {sorted(msg.keys())}!")
            processed_messages.append(msg)
        elif isinstance(msg, Message):
            processed_messages.append(msg.to_dict())
        else:
            # Bug fix: report the offending element's type, not the container's.
            raise ValueError(f"Only support message type are: str, Message, dict, but got {type(msg).__name__}!")
    return processed_messages
def log_and_reraise(retry_state: RetryCallState):
logger.error(f"Retry attempts exhausted. Last exception: {retry_state.outcome.exception()}")
logger.warning(
@ -834,3 +820,21 @@ See FAQ 5.8
"""
)
raise retry_state.outcome.exception()
def get_markdown_codeblock_type(filename: str) -> str:
    """Map *filename*'s extension (via its guessed MIME type) to a markdown code-fence language.

    Unknown or unmapped MIME types fall back to "text".
    """
    mime_to_language = {
        "text/x-shellscript": "bash",
        "text/x-c++src": "cpp",
        "text/css": "css",
        "text/html": "html",
        "text/x-java": "java",
        "application/javascript": "javascript",
        "application/json": "json",
        "text/x-python": "python",
        "text/x-ruby": "ruby",
        "application/sql": "sql",
    }
    guessed, _encoding = mimetypes.guess_type(filename)
    return mime_to_language.get(guessed, "text")

View file

@ -13,9 +13,7 @@ import re
from pathlib import Path
from typing import Set
import aiofiles
from metagpt.utils.common import aread
from metagpt.utils.common import aread, awrite
from metagpt.utils.exceptions import handle_exception
@ -45,8 +43,7 @@ class DependencyFile:
async def save(self):
    """Serialize the in-memory dependency mapping to ``self._filename`` asynchronously."""
    data = json.dumps(self._dependencies)
    # Delegate to the shared async writer so file handling stays consistent project-wide.
    await awrite(filename=self._filename, data=data)
async def update(self, filename: Path | str, dependencies: Set[Path | str], persist=True):
"""Update dependencies for a file asynchronously.

View file

@ -5,12 +5,15 @@
@Author : alexanderwu
@File : embedding.py
"""
from langchain_community.embeddings import OpenAIEmbeddings
from llama_index.embeddings.openai import OpenAIEmbedding
from metagpt.config2 import config
def get_embedding() -> OpenAIEmbedding:
    """Build an ``OpenAIEmbedding`` from the configured OpenAI-compatible LLM settings.

    Returns:
        An OpenAIEmbedding wired with the configured api_key and base_url.

    Raises:
        ValueError: If no OpenAI LLM is configured (``config.llm.api_type`` is not "openai").
    """
    llm = config.get_openai_llm()
    if llm is None:
        raise ValueError("To use OpenAIEmbedding, please ensure that config.llm.api_type is correctly set to 'openai'.")
    embedding = OpenAIEmbedding(api_key=llm.api_key, api_base=llm.base_url)
    return embedding

View file

@ -14,11 +14,9 @@ from datetime import datetime
from pathlib import Path
from typing import Dict, List, Set
import aiofiles
from metagpt.logs import logger
from metagpt.schema import Document
from metagpt.utils.common import aread
from metagpt.utils.common import aread, awrite
from metagpt.utils.json_to_markdown import json_to_markdown
@ -55,8 +53,7 @@ class FileRepository:
pathname = self.workdir / filename
pathname.parent.mkdir(parents=True, exist_ok=True)
content = content if content else "" # avoid `argument must be str, not None` to make it continue
async with aiofiles.open(str(pathname), mode="w") as writer:
await writer.write(content)
await awrite(filename=str(pathname), data=content)
logger.info(f"save to: {str(pathname)}")
if dependencies is not None:

View file

@ -9,11 +9,9 @@ import asyncio
import os
from pathlib import Path
import aiofiles
from metagpt.config2 import config
from metagpt.logs import logger
from metagpt.utils.common import check_cmd_exists
from metagpt.utils.common import awrite, check_cmd_exists
async def mermaid_to_file(engine, mermaid_code, output_file_without_suffix, width=2048, height=2048) -> int:
@ -30,9 +28,7 @@ async def mermaid_to_file(engine, mermaid_code, output_file_without_suffix, widt
if dir_name and not os.path.exists(dir_name):
os.makedirs(dir_name)
tmp = Path(f"{output_file_without_suffix}.mmd")
async with aiofiles.open(tmp, "w", encoding="utf-8") as f:
await f.write(mermaid_code)
# tmp.write_text(mermaid_code, encoding="utf-8")
await awrite(filename=tmp, data=mermaid_code)
if engine == "nodejs":
if check_cmd_exists(config.mermaid.path) != 0:

View file

@ -54,5 +54,5 @@ def save_history(role: Role, save_dir: str = ""):
with open(save_path / "plan.json", "w", encoding="utf-8") as plan_file:
json.dump(plan, plan_file, indent=4, ensure_ascii=False)
save_code_file(name=Path(record_time) / "history_nb", code_context=role.execute_code.nb, file_format="ipynb")
save_code_file(name=Path(record_time), code_context=role.execute_code.nb, file_format="ipynb")
return save_path

View file

@ -0,0 +1,18 @@
"""class tools, including method inspection, class attributes, inheritance relationships, etc."""
def check_methods(C, *methods):
    """Verify that class *C* (or an ancestor in its MRO) defines every named method.

    Borrowed from ``_collections_abc``. Useful for implicit interfaces: an
    abstract class can answer ``isinstance`` checks without requiring inheritance.
    Returns True when every method is found; NotImplemented when a method is
    missing or explicitly set to None.
    """
    for name in methods:
        for klass in C.__mro__:
            if name not in klass.__dict__:
                continue
            # Explicit None means "deliberately unsupported" (e.g. __hash__ = None).
            if klass.__dict__[name] is None:
                return NotImplemented
            break
        else:
            # No class in the MRO defines this method.
            return NotImplemented
    return True

View file

@ -340,7 +340,9 @@ def extract_state_value_from_output(content: str) -> str:
content (str): llm's output from `Role._think`
"""
content = content.strip() # deal the output cases like " 0", "0\n" and so on.
pattern = r"([0-9])" # TODO find the number using a more proper method not just extract from content using pattern
pattern = (
r"(?<!-)[0-9]" # TODO find the number using a more proper method not just extract from content using pattern
)
matches = re.findall(pattern, content, re.DOTALL)
matches = list(set(matches))
state = matches[0] if len(matches) > 0 else "-1"

View file

@ -0,0 +1,80 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file provides functionality to convert a local repository into a markdown representation.
"""
from __future__ import annotations
import mimetypes
from pathlib import Path
from gitignore_parser import parse_gitignore
from metagpt.logs import logger
from metagpt.utils.common import aread, awrite, get_markdown_codeblock_type, list_files
from metagpt.utils.tree import tree
async def repo_to_markdown(repo_path: str | Path, output: str | Path = None, gitignore: str | Path = None) -> str:
    """Render a local repository as markdown: a directory tree plus per-file listings.

    Args:
        repo_path (str | Path): Root of the repository to render.
        output (str | Path, optional): Where to also write the markdown. Defaults to None.
        gitignore (str | Path, optional): Path to a .gitignore; defaults to the project's own.

    Returns:
        str: The generated markdown text.
    """
    repo_path = Path(repo_path)
    gitignore_path = Path(gitignore or Path(__file__).parent / "../../.gitignore").resolve()

    sections = [await _write_dir_tree(repo_path=repo_path, gitignore=gitignore_path)]
    ignore = parse_gitignore(full_path=str(gitignore_path))
    sections.append(await _write_files(repo_path=repo_path, gitignore_rules=ignore))
    markdown = "".join(sections)

    if output:
        await awrite(filename=str(output), data=markdown, encoding="utf-8")
    return markdown
async def _write_dir_tree(repo_path: Path, gitignore: Path) -> str:
    """Render the repository's directory tree as a fenced markdown section."""
    # Prefer the external `tree` command; fall back to the pure-python walk on any failure.
    try:
        listing = tree(repo_path, gitignore, run_command=True)
    except Exception as exc:
        logger.info(f"{exc}, using safe mode.")
        listing = tree(repo_path, gitignore, run_command=False)
    return f"## Directory Tree\n```text\n{listing}\n```\n---\n\n"
async def _write_files(repo_path, gitignore_rules) -> str:
    """Concatenate a markdown section for every non-ignored file under *repo_path*."""
    sections = []
    for path in list_files(repo_path):
        if gitignore_rules(str(path)):
            continue
        sections.append(await _write_file(filename=path, repo_path=repo_path))
    return "".join(sections)
async def _write_file(filename: Path, repo_path: Path) -> str:
    """Render one file as a markdown section; non-text files get a placeholder.

    Args:
        filename: Absolute path of the file to render.
        repo_path: Repository root, used to compute the relative heading.

    Returns:
        A markdown section for the file.
    """
    relative_path = filename.relative_to(repo_path)
    markdown = f"## {relative_path}\n"

    mime_type, _ = mimetypes.guess_type(filename.name)
    # guess_type returns None for unknown extensions; treat those as binary too.
    # (The original `"text/" not in mime_type` raised TypeError when mime_type was None.)
    if mime_type is None or "text/" not in mime_type:
        logger.info(f"Ignore content: {filename}")
        markdown += "<binary file>\n---\n\n"
        return markdown

    content = await aread(filename, encoding="utf-8")
    # Escape sequences that would break the enclosing markdown structure.
    content = content.replace("```", "\\`\\`\\`").replace("---", "\\-\\-\\-")
    code_block_type = get_markdown_codeblock_type(filename.name)
    markdown += f"```{code_block_type}\n{content}\n```\n---\n\n"
    return markdown

View file

@ -21,6 +21,7 @@ TOKEN_COSTS = {
"gpt-35-turbo": {"prompt": 0.0015, "completion": 0.002},
"gpt-35-turbo-16k": {"prompt": 0.003, "completion": 0.004},
"gpt-3.5-turbo-1106": {"prompt": 0.001, "completion": 0.002},
"gpt-3.5-turbo-0125": {"prompt": 0.001, "completion": 0.002},
"gpt-4-0314": {"prompt": 0.03, "completion": 0.06},
"gpt-4": {"prompt": 0.03, "completion": 0.06},
"gpt-4-32k": {"prompt": 0.06, "completion": 0.12},
@ -48,6 +49,8 @@ TOKEN_COSTS = {
"claude-2.1": {"prompt": 0.008, "completion": 0.024},
"claude-3-sonnet-20240229": {"prompt": 0.003, "completion": 0.015},
"claude-3-opus-20240229": {"prompt": 0.015, "completion": 0.075},
"yi-34b-chat-0205": {"prompt": 0.0003, "completion": 0.0003},
"yi-34b-chat-200k": {"prompt": 0.0017, "completion": 0.0017},
}
@ -140,25 +143,24 @@ FIREWORKS_GRADE_TOKEN_COSTS = {
"mixtral-8x7b": {"prompt": 0.4, "completion": 1.6},
}
# https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
TOKEN_MAX = {
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-0301": 4096,
"gpt-3.5-turbo-0613": 4096,
"gpt-3.5-turbo-16k": 16384,
"gpt-3.5-turbo-16k-0613": 16384,
"gpt-35-turbo": 4096,
"gpt-35-turbo-16k": 16384,
"gpt-3.5-turbo-1106": 16384,
"gpt-4-0314": 8192,
"gpt-4": 8192,
"gpt-4-32k": 32768,
"gpt-4-32k-0314": 32768,
"gpt-4-0613": 8192,
"gpt-4-turbo-preview": 128000,
"gpt-4-0125-preview": 128000,
"gpt-4-turbo-preview": 128000,
"gpt-4-1106-preview": 128000,
"gpt-4-vision-preview": 128000,
"gpt-4-1106-vision-preview": 128000,
"gpt-4": 8192,
"gpt-4-0613": 8192,
"gpt-4-32k": 32768,
"gpt-4-32k-0613": 32768,
"gpt-3.5-turbo-0125": 16385,
"gpt-3.5-turbo": 16385,
"gpt-3.5-turbo-1106": 16385,
"gpt-3.5-turbo-instruct": 4096,
"gpt-3.5-turbo-16k": 16385,
"gpt-3.5-turbo-0613": 4096,
"gpt-3.5-turbo-16k-0613": 16385,
"text-embedding-ada-002": 8192,
"glm-3-turbo": 128000,
"glm-4": 128000,
@ -176,10 +178,12 @@ TOKEN_MAX = {
"claude-2.1": 200000,
"claude-3-sonnet-20240229": 200000,
"claude-3-opus-20240229": 200000,
"yi-34b-chat-0205": 4000,
"yi-34b-chat-200k": 200000,
}
def count_message_tokens(messages, model="gpt-3.5-turbo-0613"):
def count_message_tokens(messages, model="gpt-3.5-turbo-0125"):
"""Return the number of tokens used by a list of messages."""
try:
encoding = tiktoken.encoding_for_model(model)
@ -193,6 +197,7 @@ def count_message_tokens(messages, model="gpt-3.5-turbo-0613"):
"gpt-35-turbo-16k",
"gpt-3.5-turbo-16k",
"gpt-3.5-turbo-1106",
"gpt-3.5-turbo-0125",
"gpt-4-0314",
"gpt-4-32k-0314",
"gpt-4-0613",
@ -209,8 +214,8 @@ def count_message_tokens(messages, model="gpt-3.5-turbo-0613"):
tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n
tokens_per_name = -1 # if there's a name, the role is omitted
elif "gpt-3.5-turbo" == model:
print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
return count_message_tokens(messages, model="gpt-3.5-turbo-0613")
print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.")
return count_message_tokens(messages, model="gpt-3.5-turbo-0125")
elif "gpt-4" == model:
print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
return count_message_tokens(messages, model="gpt-4-0613")
@ -224,7 +229,7 @@ def count_message_tokens(messages, model="gpt-3.5-turbo-0613"):
else:
raise NotImplementedError(
f"num_tokens_from_messages() is not implemented for model {model}. "
f"See https://github.com/openai/openai-python/blob/main/chatml.md "
f"See https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken "
f"for information on how messages are converted to tokens."
)
num_tokens = 0

140
metagpt/utils/tree.py Normal file
View file

@ -0,0 +1,140 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2024/3/11
@Author : mashenquan
@File : tree.py
@Desc : Implement the same functionality as the `tree` command.
Example:
>>> print_tree(".")
utils
+-- serialize.py
+-- project_repo.py
+-- tree.py
+-- mmdc_playwright.py
+-- cost_manager.py
+-- __pycache__
| +-- __init__.cpython-39.pyc
| +-- redis.cpython-39.pyc
| +-- singleton.cpython-39.pyc
| +-- embedding.cpython-39.pyc
| +-- make_sk_kernel.cpython-39.pyc
| +-- file_repository.cpython-39.pyc
+-- file.py
+-- save_code.py
+-- common.py
+-- redis.py
"""
from __future__ import annotations
import subprocess
from pathlib import Path
from typing import Callable, Dict, List
from gitignore_parser import parse_gitignore
def tree(root: str | Path, gitignore: str | Path = None, run_command: bool = False) -> str:
    """Render the directory structure under *root* in a `tree`-like format.

    Args:
        root (str or Path): Directory from which to start traversing.
        gitignore (str or Path): Optional .gitignore file used to filter entries.
        run_command (bool): When True, shell out to the external ``tree`` command
            and return its output; otherwise build the listing in pure python.

    Returns:
        str: A string representation of the directory tree, e.g.::

            utils
            +-- serialize.py
            +-- tree.py
            +-- __pycache__
            |   +-- __init__.cpython-39.pyc
            +-- parse_docstring.py
    """
    root_path = Path(root).resolve()
    if run_command:
        return _execute_tree(root_path, gitignore)

    ignore_rules = parse_gitignore(gitignore) if gitignore else None
    hierarchy = {root_path.name: _list_children(root=root_path, git_ignore_rules=ignore_rules)}
    return "\n".join(_print_tree(hierarchy))
def _list_children(root: Path, git_ignore_rules: Callable) -> Dict[str, Dict]:
dir_ = {}
for i in root.iterdir():
if git_ignore_rules and git_ignore_rules(str(i)):
continue
try:
if i.is_file():
dir_[i.name] = {}
else:
dir_[i.name] = _list_children(root=i, git_ignore_rules=git_ignore_rules)
except (FileNotFoundError, PermissionError, OSError):
dir_[i.name] = {}
return dir_
def _print_tree(dir_: Dict[str, Dict]) -> List[str]:
    """Format a nested ``{name: children}`` mapping as `tree`-style output lines.

    Fixes the annotation ``Dict[str:Dict]`` (a slice expression) to ``Dict[str, Dict]``.

    Args:
        dir_: Mapping of entry name to its (possibly empty) children mapping.

    Returns:
        The formatted lines, one per entry, with "+--"/"|" connectors.
    """
    ret = []
    for name, children in dir_.items():
        ret.append(name)
        if not children:
            continue
        lines = _print_tree(children)
        for v in lines:
            if v[0] not in ["+", " ", "|"]:
                # First line of a child subtree: extend the vertical bars above,
                # then attach the branch connector.
                ret = _add_line(ret)
                row = f"+-- {v}"
            else:
                # Continuation line of an already-connected subtree: indent it.
                row = f"    {v}"
            ret.append(row)
    return ret
def _add_line(rows: List[str]) -> List[str]:
for i in range(len(rows) - 1, -1, -1):
v = rows[i]
if v[0] != " ":
return rows
rows[i] = "|" + v[1:]
return rows
def _execute_tree(root: Path, gitignore: str | Path) -> str:
    """Run the external ``tree`` command on *root* and return its stdout.

    Args:
        root: Directory to list.
        gitignore: Optional gitignore file forwarded via ``--gitfile``.

    Raises:
        subprocess.CalledProcessError: If ``tree`` exits non-zero (check=True).
        FileNotFoundError: If the ``tree`` executable is not installed.
    """
    # Forward the gitignore so the command-line mode filters like the python mode.
    args = ["--gitfile", str(gitignore)] if gitignore else []
    # check=True already raises on a non-zero exit, so the original manual
    # returncode check was dead code and `except ... raise e` was a no-op re-raise.
    result = subprocess.run(["tree"] + args + [str(root)], capture_output=True, text=True, check=True)
    return result.stdout