From 9ed335bdac21b8d7aba177593b870b7173398f67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?= Date: Fri, 22 Mar 2024 20:00:25 +0800 Subject: [PATCH] fixbug: guess mimetype --- metagpt/tools/libs/shell.py | 16 +-- metagpt/utils/common.py | 49 ++++++++- metagpt/utils/git_repository.py | 6 +- metagpt/utils/repo_to_markdown.py | 104 ++++++++++++++----- metagpt/utils/tree.py | 18 ++-- tests/metagpt/utils/test_repo_to_markdown.py | 7 +- 6 files changed, 147 insertions(+), 53 deletions(-) diff --git a/metagpt/tools/libs/shell.py b/metagpt/tools/libs/shell.py index 5eb54f310..046410f8d 100644 --- a/metagpt/tools/libs/shell.py +++ b/metagpt/tools/libs/shell.py @@ -10,9 +10,9 @@ from metagpt.tools.tool_registry import register_tool @register_tool(tags=["shell"]) -async def execute( +async def shell_execute( command: Union[List[str], str], cwd: str | Path = None, env: Dict = None, timeout: int = 600 -) -> Tuple[str, str]: +) -> Tuple[str, str, int]: """ Execute a command asynchronously and return its standard output and standard error. @@ -24,8 +24,7 @@ async def execute( timeout (int, optional): Timeout for the command execution in seconds. Defaults to 600. Returns: - Tuple[str, str]: A tuple containing the standard output and standard error of the executed command, both as - strings. + Tuple[str, str, int]: A tuple containing the string type standard output and string type standard error of the executed command and int type return code. Raises: ValueError: If the command times out, this error is raised. The error message contains both standard output and @@ -33,7 +32,7 @@ async def execute( Example: >>> # command is a list - >>> stdout, stderr = await execute(command=["ls", "-l"], cwd="/home/user", env={"PATH": "/usr/bin"}) + >>> stdout, stderr, returncode = await shell_execute(command=["ls", "-l"], cwd="/home/user", env={"PATH": "/usr/bin"}) >>> print(stdout) total 8 -rw-r--r-- 1 user user 0 Mar 22 10:00 file1.txt @@ -41,12 +40,15 @@ async def execute( ... >>> # command is a string of shell script - >>> stdout, stderr = await execute(command="ls -l", cwd="/home/user", env={"PATH": "/usr/bin"}) + >>> stdout, stderr, returncode = await shell_execute(command="ls -l", cwd="/home/user", env={"PATH": "/usr/bin"}) >>> print(stdout) total 8 -rw-r--r-- 1 user user 0 Mar 22 10:00 file1.txt -rw-r--r-- 1 user user 0 Mar 22 10:00 file2.txt ... + + References: + This function uses `subprocess.Popen` for executing shell commands asynchronously. """ cwd = str(cwd) if cwd else None shell = True if isinstance(command, str) else False @@ -54,7 +56,7 @@ async def execute( try: # Wait for the process to complete, with a timeout stdout, stderr = process.communicate(timeout=timeout) - return stdout.decode("utf-8"), stderr.decode("utf-8") + return stdout.decode("utf-8"), stderr.decode("utf-8"), process.returncode except subprocess.TimeoutExpired: process.kill() # Kill the process if it times out stdout, stderr = process.communicate() diff --git a/metagpt/utils/common.py b/metagpt/utils/common.py index e443c3466..3d9fc5a9f 100644 --- a/metagpt/utils/common.py +++ b/metagpt/utils/common.py @@ -822,19 +822,60 @@ See FAQ 5.8 raise retry_state.outcome.exception() -def get_markdown_codeblock_type(filename: str) -> str: +async def get_mime_type(filename: str | Path, force_read: bool = False) -> str: + guess_mime_type, _ = mimetypes.guess_type(filename.name) + if not guess_mime_type: + ext_mappings = {".yml": "text/yaml", ".yaml": "text/yaml"} + guess_mime_type = ext_mappings.get(filename.suffix) + if not force_read and guess_mime_type: + return guess_mime_type + + from metagpt.tools.libs.shell import shell_execute # avoid circular import + + text_set = { + "application/json", + "application/vnd.chipnuts.karaoke-mmd", + "application/javascript", + "application/xml", + "application/x-sh", + "application/sql", + "text/yaml", + } + + try: + stdout, _, _ = await shell_execute(f"file --mime-type {str(filename)}") + ix = stdout.rfind(" ") + mime_type = stdout[ix:].strip() + if mime_type == "text/plain" and guess_mime_type in text_set: + return guess_mime_type + return mime_type + except Exception as e: + logger.debug(f"file:{filename}, error:{e}") + return "unknown" + + +def get_markdown_codeblock_type(filename: str = None, mime_type: str = None) -> str: """Return the markdown code-block type corresponding to the file extension.""" - mime_type, _ = mimetypes.guess_type(filename) + if not filename and not mime_type: + raise ValueError("Either filename or mime_type must be valid.") + + if not mime_type: + mime_type, _ = mimetypes.guess_type(filename) mappings = { "text/x-shellscript": "bash", "text/x-c++src": "cpp", "text/css": "css", "text/html": "html", "text/x-java": "java", - "application/javascript": "javascript", - "application/json": "json", "text/x-python": "python", "text/x-ruby": "ruby", + "text/x-c": "cpp", + "text/yaml": "yaml", + "application/javascript": "javascript", + "application/json": "json", "application/sql": "sql", + "application/vnd.chipnuts.karaoke-mmd": "mermaid", + "application/x-sh": "bash", + "application/xml": "xml", } return mappings.get(mime_type, "text") diff --git a/metagpt/utils/git_repository.py b/metagpt/utils/git_repository.py index 2471b7025..0b76d06ab 100644 --- a/metagpt/utils/git_repository.py +++ b/metagpt/utils/git_repository.py @@ -19,7 +19,7 @@ from git.repo.fun import is_git_dir from gitignore_parser import parse_gitignore from metagpt.logs import logger -from metagpt.tools.libs.shell import execute +from metagpt.tools.libs.shell import shell_execute from metagpt.utils.dependency_file import DependencyFile from metagpt.utils.file_repository import FileRepository @@ -298,8 +298,8 @@ class GitRepository: command = ["git", "clone"] + proxy + [str(url)] logger.info(" ".join(command)) - stdout, stderr = await execute(command=command, cwd=str(to_path), env=env, timeout=600) - info = f"{stdout}\n{stderr}" + stdout, stderr, return_code = await shell_execute(command=command, cwd=str(to_path), env=env, timeout=600) + info = f"{stdout}\n{stderr}\nexit: {return_code}\n" logger.info(info) dir_name = Path(url).with_suffix("").name to_path = to_path / dir_name diff --git a/metagpt/utils/repo_to_markdown.py b/metagpt/utils/repo_to_markdown.py index df916eb5d..9ca39c061 100644 --- a/metagpt/utils/repo_to_markdown.py +++ b/metagpt/utils/repo_to_markdown.py @@ -5,14 +5,20 @@ This file provides functionality to convert a local repository into a markdown r """ from __future__ import annotations -import mimetypes +import re from pathlib import Path +from typing import Tuple from gitignore_parser import parse_gitignore from metagpt.logs import logger -from metagpt.tools.libs.shell import execute -from metagpt.utils.common import aread, awrite, get_markdown_codeblock_type, list_files +from metagpt.utils.common import ( + aread, + awrite, + get_markdown_codeblock_type, + get_mime_type, + list_files, +) from metagpt.utils.tree import tree @@ -31,7 +37,7 @@ async def repo_to_markdown(repo_path: str | Path, output: str | Path = None, git Returns: str: The markdown representation of the repository. """ - repo_path = Path(repo_path) + repo_path = Path(repo_path).resolve() gitignore = Path(gitignore or Path(__file__).parent / "../../.gitignore").resolve() markdown = await _write_dir_tree(repo_path=repo_path, gitignore=gitignore) @@ -40,16 +46,19 @@ async def repo_to_markdown(repo_path: str | Path, output: str | Path = None, git markdown += await _write_files(repo_path=repo_path, gitignore_rules=gitignore_rules) if output: - await awrite(filename=str(output), data=markdown, encoding="utf-8") + output_file = Path(output).resolve() + output_file.parent.mkdir(parents=True, exist_ok=True) + await awrite(filename=str(output_file), data=markdown, encoding="utf-8") + logger.info(f"save: {output_file}") return markdown async def _write_dir_tree(repo_path: Path, gitignore: Path) -> str: try: - content = tree(repo_path, gitignore, run_command=True) + content = await tree(repo_path, gitignore, run_command=True) except Exception as e: logger.info(f"{e}, using safe mode.") - content = tree(repo_path, gitignore, run_command=False) + content = await tree(repo_path, gitignore, run_command=False) doc = f"## Directory Tree\n```text\n{content}\n```\n---\n\n" return doc @@ -58,33 +67,74 @@ async def _write_dir_tree(repo_path: Path, gitignore: Path) -> str: async def _write_files(repo_path, gitignore_rules) -> str: filenames = list_files(repo_path) markdown = "" + pattern = r"^\..*" # Hidden folders/files for filename in filenames: if gitignore_rules(str(filename)): continue + ignore = False + for i in filename.parts: + if re.match(pattern, i): + ignore = True + break + if ignore: + continue markdown += await _write_file(filename=filename, repo_path=repo_path) return markdown async def _write_file(filename: Path, repo_path: Path) -> str: - relative_path = filename.relative_to(repo_path) - markdown = f"## {relative_path}\n" - - mime_type, _ = mimetypes.guess_type(filename.name) - if not mime_type: - try: - stdout, stderr = await execute(f"file {str(filename)}") - if "text" in stdout.lower(): - mime_type = "text/*" - except Exception as e: - logger.debug(f"file:{filename}, error:{e}") - mime_type = "unknown" - - if "text/" not in mime_type: + is_text, mime_type = await _is_text_file(filename) + if not is_text: logger.info(f"Ignore content: {filename}") - markdown += "\n---\n\n" + return "" + + try: + relative_path = filename.relative_to(repo_path) + markdown = f"## {relative_path}\n" + content = await aread(filename, encoding="utf-8") + content = content.replace("```", "\\`\\`\\`").replace("---", "\\-\\-\\-") + code_block_type = get_markdown_codeblock_type(filename.name) + markdown += f"```{code_block_type}\n{content}\n```\n---\n\n" return markdown - content = await aread(filename, encoding="utf-8") - content = content.replace("```", "\\`\\`\\`").replace("---", "\\-\\-\\-") - code_block_type = get_markdown_codeblock_type(filename.name) - markdown += f"```{code_block_type}\n{content}\n```\n---\n\n" - return markdown + except Exception as e: + logger.error(e) + return "" + + +async def _is_text_file(filename: Path) -> Tuple[bool, str]: + pass_set = { + "application/json", + "application/vnd.chipnuts.karaoke-mmd", + "application/javascript", + "application/xml", + "application/x-sh", + "application/sql", + } + denied_set = { + "application/zlib", + "application/octet-stream", + "image/svg+xml", + "application/pdf", + "application/msword", + "application/vnd.ms-excel", + "audio/x-wav", + "application/x-git", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/zip", + "image/jpeg", + "audio/mpeg", + "video/mp2t", + "inode/x-empty", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "image/png", + "image/vnd.microsoft.icon", + "video/mp4", + } + mime_type = await get_mime_type(filename, force_read=True) + v = "text/" in mime_type or mime_type in pass_set + if v: + return True, mime_type + + if mime_type not in denied_set: + logger.info(mime_type) + return False, mime_type diff --git a/metagpt/utils/tree.py b/metagpt/utils/tree.py index bd7922290..2fcbb5022 100644 --- a/metagpt/utils/tree.py +++ b/metagpt/utils/tree.py @@ -27,14 +27,15 @@ """ from __future__ import annotations -import subprocess from pathlib import Path from typing import Callable, Dict, List from gitignore_parser import parse_gitignore +from metagpt.tools.libs.shell import shell_execute -def tree(root: str | Path, gitignore: str | Path = None, run_command: bool = False) -> str: + +async def tree(root: str | Path, gitignore: str | Path = None, run_command: bool = False) -> str: """ Recursively traverses the directory structure and prints it out in a tree-like format. @@ -80,7 +81,7 @@ def tree(root: str | Path, gitignore: str | Path = None, run_command: bool = Fal """ root = Path(root).resolve() if run_command: - return _execute_tree(root, gitignore) + return await _execute_tree(root, gitignore) git_ignore_rules = parse_gitignore(gitignore) if gitignore else None dir_ = {root.name: _list_children(root=root, git_ignore_rules=git_ignore_rules)} @@ -129,12 +130,7 @@ def _add_line(rows: List[str]) -> List[str]: return rows -def _execute_tree(root: Path, gitignore: str | Path) -> str: +async def _execute_tree(root: Path, gitignore: str | Path) -> str: args = ["--gitfile", str(gitignore)] if gitignore else [] - try: - result = subprocess.run(["tree"] + args + [str(root)], capture_output=True, text=True, check=True) - if result.returncode != 0: - raise ValueError(f"tree exits with code {result.returncode}") - return result.stdout - except subprocess.CalledProcessError as e: - raise e + stdout, _, _ = await shell_execute(["tree"] + args + [str(root)]) + return stdout diff --git a/tests/metagpt/utils/test_repo_to_markdown.py b/tests/metagpt/utils/test_repo_to_markdown.py index 914c50dd7..efd38e8ab 100644 --- a/tests/metagpt/utils/test_repo_to_markdown.py +++ b/tests/metagpt/utils/test_repo_to_markdown.py @@ -10,7 +10,12 @@ from metagpt.utils.repo_to_markdown import repo_to_markdown @pytest.mark.parametrize( ["repo_path", "output"], - [(Path(__file__).parent.parent, Path(__file__).parent.parent.parent / f"workspace/unittest/{uuid.uuid4().hex}.md")], + [ + ( + Path(__file__).parent.parent.parent.parent, + Path(__file__).parent / f"../../../workspace/unittest/{uuid.uuid4().hex}.md", + ), + ], ) @pytest.mark.asyncio async def test_repo_to_markdown(repo_path: Path, output: Path):