@register_tool(tags=["git"])
async def git_clone(url: str, output_dir: str | Path = None) -> Path:
    """
    Clones a Git repository from the given URL.

    Args:
        url (str): The URL of the Git repository to clone.
        output_dir (str or Path, optional): The parent directory to clone into; the repository ends up in a
            sub-directory named after the repository. If not provided, `GitRepository.clone_from` chooses a
            freshly created, uniquely named directory under the project's `workspace/downloads` folder.

    Returns:
        Path: The working directory of the cloned repository (`GitRepository.workdir`).

    Raises:
        Exception: Whatever `GitRepository.clone_from` raises when the clone does not produce a git directory.
            NOTE(review): `clone_from` raises ValueError internally but is wrapped in a tenacity retry, so the
            failure may surface as `tenacity.RetryError` -- confirm before catching a specific type.

    Example:
        >>> # git clone to /TO/PATH
        >>> url = 'https://github.com/geekan/MetaGPT.git'
        >>> output_dir = "/TO/PATH"
        >>> repo_dir = await git_clone(url=url, output_dir=output_dir)
        >>> print(repo_dir)
        /TO/PATH/MetaGPT

        >>> # git clone to the default download directory.
        >>> url = 'https://github.com/geekan/MetaGPT.git'
        >>> repo_dir = await git_clone(url)
        >>> print(repo_dir)
        /.../workspace/downloads/<random hex>/MetaGPT
    """
    repo = await GitRepository.clone_from(url, output_dir)
    return repo.workdir


async def git_checkout(repo_dir: str | Path, commit_id: str):
    """
    Checks out a specific commit in a Git repository.

    Args:
        repo_dir (str or Path): The directory containing the Git repository.
        commit_id (str): The ID of the commit (or any other ref, e.g. a branch name) to check out.

    Raises:
        ValueError: If `repo_dir` is not a valid Git repository.

    Example:
        >>> repo_dir = '/TO/GIT/REPO'
        >>> commit_id = 'main'
        >>> await git_checkout(repo_dir=repo_dir, commit_id=commit_id)
        git checkout main
    """
    repo = GitRepository(local_path=repo_dir, auto_init=False)
    if not repo.is_valid:
        # The exception must actually be raised; merely constructing it lets the checkout
        # proceed against an invalid repository and fail later with an unrelated error.
        raise ValueError(f"Invalid git root: {repo_dir}")
    await repo.checkout(commit_id)
async def get_mime_type(filename: str | Path, force_read: bool = False) -> str:
    """Return the MIME type of a file.

    The type is first guessed from the file name. Unless `force_read` is set, a successful guess is
    returned immediately; otherwise the `file --mime-type` command is run to inspect the content.

    Args:
        filename (str | Path): Path of the file to inspect.
        force_read (bool, optional): If True, always inspect the file content with the `file` command,
            even when the name-based guess succeeded. Defaults to False.

    Returns:
        str: The MIME type, or "unknown" if the `file` command fails or produces no output.
    """
    # Accept plain strings as the signature promises; `.name` / `.suffix` below need a Path.
    filename = Path(filename)
    guess_mime_type, _ = mimetypes.guess_type(filename.name)
    if not guess_mime_type:
        # Extensions that `mimetypes` may not know about.
        ext_mappings = {".yml": "text/yaml", ".yaml": "text/yaml"}
        guess_mime_type = ext_mappings.get(filename.suffix)
    if not force_read and guess_mime_type:
        return guess_mime_type

    from metagpt.tools.libs.shell import shell_execute  # avoid circular import

    # Name-based types that are more specific than the "text/plain" the `file` command tends to report.
    text_set = {
        "application/json",
        "application/vnd.chipnuts.karaoke-mmd",
        "application/javascript",
        "application/xml",
        "application/x-sh",
        "application/sql",
        "text/yaml",
    }

    try:
        # Pass an argument list (no shell) so paths containing spaces or shell metacharacters
        # are handed to `file` verbatim instead of being interpreted by a shell.
        stdout, _, _ = await shell_execute(["file", "--mime-type", str(filename)])
        # Output looks like "<path>: <mime-type>"; the MIME type is the last whitespace-separated token.
        tokens = stdout.split()
        if not tokens:
            return "unknown"
        mime_type = tokens[-1]
        if mime_type == "text/plain" and guess_mime_type in text_set:
            return guess_mime_type
        return mime_type
    except Exception as e:
        logger.debug(f"file:{filename}, error:{e}")
        return "unknown"


def get_markdown_codeblock_type(filename: str = None, mime_type: str = None) -> str:
    """Return the markdown code-block type for a file name or MIME type.

    Args:
        filename (str, optional): File name whose extension is used to guess the MIME type.
            Only consulted when `mime_type` is not given.
        mime_type (str, optional): MIME type to translate directly.

    Returns:
        str: The code-block language tag, or "text" when the MIME type has no mapping.

    Raises:
        ValueError: If neither `filename` nor `mime_type` is provided.
    """
    if not filename and not mime_type:
        raise ValueError("Either filename or mime_type must be valid.")

    if not mime_type:
        mime_type, _ = mimetypes.guess_type(filename)
    mappings = {
        "text/x-shellscript": "bash",
        "text/x-c++src": "cpp",
        "text/css": "css",
        "text/html": "html",
        "text/x-java": "java",
        "text/x-python": "python",
        "text/x-ruby": "ruby",
        "text/x-c": "cpp",
        "text/yaml": "yaml",
        "application/javascript": "javascript",
        "application/json": "json",
        "application/sql": "sql",
        "application/vnd.chipnuts.karaoke-mmd": "mermaid",
        "application/x-sh": "bash",
        "application/xml": "xml",
    }
    return mappings.get(mime_type, "text")
@classmethod
@retry(wait=wait_random_exponential(min=1, max=15), stop=stop_after_attempt(3))
async def clone_from(cls, url: str | Path, output_dir: str | Path = None) -> "GitRepository":
    """Clone a repository with the `git` command line tool and wrap the result.

    The call is retried (up to 3 attempts with random exponential back-off) by the decorator.

    Args:
        url (str | Path): URL or local path of the repository to clone.
        output_dir (str | Path, optional): Parent directory for the clone. Defaults to a new, uniquely
            named directory under `workspace/downloads` relative to this package.

    Returns:
        GitRepository: A repository object for the cloned directory.

    Raises:
        ValueError: If the clone did not produce a git directory. The message contains the stdout,
            stderr and exit code of `git clone`.
    """
    from metagpt.context import Context

    to_path = Path(output_dir or Path(__file__).parent / f"../../workspace/downloads/{uuid.uuid4().hex}").resolve()
    to_path.mkdir(parents=True, exist_ok=True)
    # Strip only a trailing ".git". Dropping an arbitrary suffix would turn "socket.io" into "socket"
    # and no longer match the directory git creates.
    repo_name = Path(url).name
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]
    repo_dir = to_path / repo_name
    if repo_dir.exists():
        shutil.rmtree(repo_dir, ignore_errors=True)
    ctx = Context()
    env = ctx.new_environ()
    proxy = ["-c", f"http.proxy={ctx.config.proxy}"] if ctx.config.proxy else []
    # Name the destination explicitly so the directory checked below is the one git writes to.
    command = ["git", "clone"] + proxy + [str(url), str(repo_dir)]
    logger.info(" ".join(command))

    stdout, stderr, return_code = await shell_execute(command=command, cwd=str(to_path), env=env, timeout=600)
    info = f"{stdout}\n{stderr}\nexit: {return_code}\n"
    logger.info(info)
    if not cls.is_git_dir(repo_dir):
        raise ValueError(info)
    logger.info(f"git clone to {repo_dir}")
    return cls(local_path=repo_dir, auto_init=False)

async def checkout(self, commit_id: str):
    """Check out `commit_id` in this repository and log the action.

    Args:
        commit_id (str): The commit ID (or other ref) handed to `git checkout`.
    """
    self._repository.git.checkout(commit_id)
    logger.info(f"git checkout {commit_id}")
async def repo_to_markdown(repo_path: str | Path, output: str | Path = None) -> str:
    """
    Convert a local repository into a markdown representation.

    The result consists of a directory tree followed by the content of every text file that is
    neither matched by the repository's own `.gitignore` nor located in a hidden folder.

    Args:
        repo_path (str | Path): The path to the local repository.
        output (str | Path, optional): The path to save the generated markdown file. Defaults to None.

    Returns:
        str: The markdown representation of the repository.
    """
    repo_path = Path(repo_path).resolve()
    gitignore_file = repo_path / ".gitignore"
    if not gitignore_file.exists():
        # Do not hand a non-existent ignore file to `tree` or to the gitignore parser.
        gitignore_file = None

    markdown = await _write_dir_tree(repo_path=repo_path, gitignore=gitignore_file)

    gitignore_rules = parse_gitignore(full_path=str(gitignore_file)) if gitignore_file else None
    markdown += await _write_files(repo_path=repo_path, gitignore_rules=gitignore_rules)

    if output:
        output_file = Path(output).resolve()
        output_file.parent.mkdir(parents=True, exist_ok=True)
        await awrite(filename=str(output_file), data=markdown, encoding="utf-8")
        logger.info(f"save: {output_file}")
    return markdown


async def _write_dir_tree(repo_path: Path, gitignore: Path | None) -> str:
    """Render the directory tree section, falling back to the pure-Python tree if the command fails."""
    try:
        content = await tree(repo_path, gitignore, run_command=True)
    except Exception as e:
        logger.info(f"{e}, using safe mode.")
        content = await tree(repo_path, gitignore, run_command=False)

    doc = f"## Directory Tree\n```text\n{content}\n```\n---\n\n"
    return doc


def _is_hidden(filename: Path, repo_path: Path) -> bool:
    """Return True if any path component below `repo_path` starts with a dot."""
    try:
        # Only look at components inside the repository: a dot-directory above the repository
        # root (e.g. ~/.cache/repo) must not hide every file in it.
        parts = Path(filename).relative_to(repo_path).parts
    except ValueError:
        # `filename` is not under `repo_path`; fall back to checking the whole path.
        parts = Path(filename).parts
    return any(part.startswith(".") for part in parts)


async def _write_files(repo_path, gitignore_rules=None) -> str:
    """Concatenate the markdown sections of all files that are not ignored and not hidden.

    Args:
        repo_path: Root directory whose files are listed.
        gitignore_rules (callable, optional): Predicate returning True for paths to skip.

    Returns:
        str: The joined markdown sections (empty string if nothing qualifies).
    """
    filenames = list_files(repo_path)
    sections = []
    for filename in filenames:
        if gitignore_rules and gitignore_rules(str(filename)):
            continue
        if _is_hidden(filename, repo_path):
            continue
        sections.append(await _write_file(filename=filename, repo_path=repo_path))
    return "".join(sections)


async def _write_file(filename: Path, repo_path: Path) -> str:
    """Render one file as a markdown section; return "" for non-text files or on any read error."""
    is_text, mime_type = await _is_text_file(filename)
    if not is_text:
        logger.info(f"Ignore content: {filename}")
        return ""

    try:
        relative_path = filename.relative_to(repo_path)
        markdown = f"## {relative_path}\n"
        content = await aread(filename, encoding="utf-8")
        # Escape sequences that would otherwise terminate the fence or the section separator.
        content = content.replace("```", "\\`\\`\\`").replace("---", "\\-\\-\\-")
        code_block_type = get_markdown_codeblock_type(filename.name)
        if code_block_type == "text":
            # The file name gave no hint (e.g. no extension); use the content-based MIME type instead.
            code_block_type = get_markdown_codeblock_type(mime_type=mime_type)
        markdown += f"```{code_block_type}\n{content}\n```\n---\n\n"
        return markdown
    except Exception as e:
        # Best effort: a file that cannot be read or decoded is skipped, not fatal.
        logger.error(e)
        return ""


async def _is_text_file(filename: Path) -> Tuple[bool, str]:
    """Decide from the content-based MIME type whether a file should be embedded as text.

    Returns:
        Tuple[bool, str]: (True if the file is treated as text, the detected MIME type).
    """
    # Non-"text/" types that are nevertheless textual.
    pass_set = {
        "application/json",
        "application/vnd.chipnuts.karaoke-mmd",
        "application/javascript",
        "application/xml",
        "application/x-sh",
        "application/sql",
    }
    # Known non-text types; anything rejected that is NOT listed here is logged for inspection.
    denied_set = {
        "application/zlib",
        "application/octet-stream",
        "image/svg+xml",
        "application/pdf",
        "application/msword",
        "application/vnd.ms-excel",
        "audio/x-wav",
        "application/x-git",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/zip",
        "image/jpeg",
        "audio/mpeg",
        "video/mp2t",
        "inode/x-empty",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "image/png",
        "image/vnd.microsoft.icon",
        "video/mp4",
    }
    mime_type = await get_mime_type(filename, force_read=True)
    v = "text/" in mime_type or mime_type in pass_set
    if v:
        return True, mime_type

    if mime_type not in denied_set:
        logger.info(mime_type)
    return False, mime_type
async def _execute_tree(root: Path, gitignore: str | Path) -> str:
    """Run the external `tree` command on `root` and return its standard output.

    Args:
        root (Path): Directory to render.
        gitignore (str | Path): Ignore file passed via `--gitfile`; omitted when falsy.

    Returns:
        str: The text printed by `tree`.

    Raises:
        ValueError: If `tree` exits with a non-zero code, so callers can fall back to the
            pure-Python implementation instead of silently using empty or partial output.
    """
    args = ["--gitfile", str(gitignore)] if gitignore else []
    stdout, _, returncode = await shell_execute(["tree"] + args + [str(root)])
    if returncode != 0:
        raise ValueError(f"tree exits with code {returncode}")
    return stdout
import pytest

# The tool is exported as `shell_execute`; there is no `execute` in metagpt.tools.libs.shell.
from metagpt.tools.libs.shell import shell_execute


@pytest.mark.asyncio
@pytest.mark.parametrize(
    ["command", "expect_stdout", "expect_stderr"],
    [
        (["file", f"{__file__}"], "Python script text executable, ASCII text", ""),
        (f"file {__file__}", "Python script text executable, ASCII text", ""),
    ],
)
async def test_shell(command, expect_stdout, expect_stderr):
    """Run `file` on this test module, once as an argument list and once as a shell string."""
    # shell_execute returns (stdout, stderr, returncode); unpack all three values.
    stdout, stderr, returncode = await shell_execute(command)
    assert returncode == 0
    assert expect_stdout in stdout
    assert stderr == expect_stderr
Path(__file__).parent / f"../../../workspace/unittest/{uuid.uuid4().hex}.md", + ), + ], ) @pytest.mark.asyncio async def test_repo_to_markdown(repo_path: Path, output: Path):