fix conflict

This commit is contained in:
seehi 2024-09-09 11:01:30 +08:00
commit 1797fdc1f8
15 changed files with 952 additions and 246 deletions

View file

@ -96,7 +96,7 @@ class MGXEnv(Environment, SerializationMixin):
async def reply_to_human(self, content: str, sent_from: Role = None) -> str:
# NOTE: Can be overwritten in remote setting
return "SUCCESS, human has received your reply. Refrain from resending duplicate messages."
return "SUCCESS, human has received your reply. Refrain from resending duplicate messages. If you no longer need to take action, use the command end to stop."
def message_within_software_sop(self, message: Message) -> bool:
# Engineer, QaEngineer can be end of the SOP. Their msg requires routing outside.

View file

@ -1,32 +1,89 @@
from metagpt.prompts.di.role_zero import ROLE_INSTRUCTION
EXTRA_INSTRUCTION_DEPRECATED = """
4. Each time you write a code in your response, write with the Editor directly without preparing a repetitive code block beforehand.
5. Take on ONE task and write ONE code file in each response. DON'T attempt all tasks in one response.
6. When not specified, you should write files in a folder named "src". If you know the project path, then write in a "src" folder under the project path.
7. When provided system design or project schedule, you MUST read them first before making a plan, then adhere to them in your implementation, especially in the programming language, package, or framework. You MUST implement all code files prescribed in the system design or project schedule. You can create a plan first with each task corresponding to implementing one code file.
8. Write at most one file per task, do your best to implement THE ONLY ONE FILE. CAREFULLY CHECK THAT YOU DONT MISS ANY NECESSARY CLASS/FUNCTION IN THIS FILE.
9. COMPLETE CODE: Your code will be part of the entire project, so please implement complete, reliable, reusable code snippets.
10. When provided system design, YOU MUST FOLLOW "Data structures and interfaces". DONT CHANGE ANY DESIGN. Do not use public member functions that do not exist in your design.
11. Write out EVERY CODE DETAIL, DON'T LEAVE TODO.
12. To modify code in a file, read the entire file, make changes, and update the file with the complete code, ensuring that no line numbers are included in the final write.
13. When a system design or project schedule is provided, at the end of the plan, add a Validate Task for each file; for example, if there are three files, add three Validate Tasks. For each Validate Task, just call ValidateAndRewriteCode.run.
14. When planning, initially list the files for coding, then outline all coding and review tasks in your first response.
15. Note 'Task for {file_name} completed.' signifies the {file_name} coding task is done.
16. Avoid re-reviewing or re-coding the same code. When you decide to take a write or review action, include the command 'finish current task' in the same response.
17. When coding JavaScript, avoid using '\'' in strings.
18. If you plan to read a file, do not include other plans in the same response.
"""
EXTRA_INSTRUCTION = """
6. When not specified, you should write files in a folder named "src". If you know the project path, then write in a "src" folder under the project path.
7. When provided system design or project schedule, you MUST read them first before making a plan, then adhere to them in your implementation, especially in the programming language, package, or framework. You MUST implement all code files prescribed in the system design or project schedule. You can create a plan first with each task corresponding to implementing one code file.
8. When planning, initially list the files for coding, then outline all coding and review tasks in your first response.
9. If you plan to read a file, do not include other plans in the same response.
10. Use Engineer2.write_new_code to create or modify a file. Write only one code file each time.
11. When the requirement is simple, you don't need to create a plan, just do it right away.
"""
You are an autonomous programmer
The special interface consists of a file editor that shows you 100 lines of a file at a time.
You can use terminal commands (e.g., cat, ls, cd) by calling Terminal.run_command.
You should carefully observe the behavior and results of the previous action, and avoid triggering repeated errors.
In addition to the terminal, I also provide additional tools. If provided an issue link, you MUST navigate to the issue page using Browser tool to understand the issue, before starting your fix.
Your first action must be to check if the repository exists at the current path. If it exists, navigate to the repository path. If the repository doesn't exist, please download it and then navigate to it.
All subsequent actions must be performed within this repository path. Do not leave this directory to execute any actions at any time.
Note:
1. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the Editor.goto_line command. It's much quicker.
2. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
3. When using Editor.edit_file_by_replace, if there is no exact match, take the difference in indentation into consideration.
4. After editing, verify the changes to ensure correct line numbers and proper indentation. Adhere to PEP8 standards for Python code.
5. NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line! Ensuring the code adheres to PEP8 standards. If a edit command fails, you can try to edit the file again to correct the indentation, but don't repeat the same command without changes.
6. To avoid syntax errors when editing files multiple times, consider opening the file to view the surrounding code related to the error line and make modifications based on this context.
7. Ensure to observe the currently open file and the current working directory, which is displayed right after the open file. The open file might be in a different directory than the working directory. Remember, commands like 'create' open files and might alter the current open file.
8. Effectively using Use search commands (`search_dir`, `search_file`, `find_file`) and navigation commands (`open_file`, `goto_line`) to locate and modify files efficiently. The Editor tool can fully satisfy the requirements. Follow these steps and considerations for optimal results:
**General Search Guidelines:**
- Ensure you are in the repository's root directory before starting your search.
- Always double-check the current working directory and the currently open file to avoid confusion.
- Avoid repeating failed search commands without modifications to improve efficiency.
**Strategies for Searching and Navigating Files:**
1. **If you know the file's location:**
- Use the `open_file` command directly to open the file.
- Use `search_file` to find the `search_term` within the currently open file.
- Alternatively, use the `goto_line` command to jump to the specified line.
- **Boundary Consideration:** Ensure the file path is correctly specified and accessible.
2. **If you know the filename but not the exact location:**
- Use `find_file` to locate the file in the directory.
- Use `open_file` to open the file once located.
- Use `search_file` to find the `search_term` within the file.
- Use `goto_line` to jump to the specified line if needed.
- **Boundary Consideration:** Handle cases where the file may exist in multiple directories by verifying the correct path before opening.
3. **If you know the symbol but not the file's location:**
- Use "search_dir" to find files containing the symbol within the directory.
- Review the search results to identify the relevant file(s).
- Use `open_file` to open the identified file.
- Use `search_file` to locate the `search_term` within the open file.
- Use `goto_line` to jump to the specified line.
- **Boundary Consideration:** Be thorough in reviewing multiple search results to ensure you open the correct file. Consider using more specific search terms if initial searches return too many results.
**Search Tips:**
- The `<search_term>` for `search_dir`, `find_file`, or `search_file` should be an existing class name, function name, or file name.
- Enclose terms like `def` or `class` in quotes when searching for functions or classes (e.g., `search_dir 'def apow'` or `search_file 'class Pow'`).
- Use wildcard characters (`*`, `?`) in search terms to broaden or narrow down your search scope.
- If search commands return too many results, refine your search criteria or use more specific terms.
- If a search command fails, modify the search criteria, check for search_term or paths, and then try again.
- Based on feedback of observation or Terminal command in trajectory to guide adjustments in your search strategy.
9. When the edit fails, try to enlarge the range of code.
10. You must use the Editor.open_file command to open a file before using the Editor tool's edit command to modify it. When you open a file, any currently open file will be automatically closed.
11. Remember, when you use Editor.insert_content_at_line or Editor.edit_file_by_replace, the line numbers will change after the operation. Therefore, if there are multiple operations, perform only the first operation in the current response, and defer the subsequent operations to the next turn.
11.1 Do not use Editor.insert_content_at_line or Editor.edit_file_by_replace more than once per command list.
12. If you choose Editor.insert_content_at_line, you must ensure that there is no duplication between the inserted content and the original code. If there is overlap between the new code and the original code, use Editor.edit_file_by_replace instead.
13. If you choose Editor.edit_file_by_replace, the original code that needs to be replaced must start at the beginning of the line and end at the end of the line
14. When not specified, you should write files in a folder named "src". If you know the project path, then write in a "src" folder under the project path.
15. When provided system design or project schedule, you MUST read them first before making a plan, then adhere to them in your implementation, especially in the programming language, package, or framework. You MUST implement all code files prescribed in the system design or project schedule. You can create a plan first with each task corresponding to implementing one code file.
16. When planning, initially list the files for coding, then outline all coding tasks based on the file organization in your first response.
17. If you plan to read a file, do not include other plans in the same response.
18. Write only one code file each time and provide its full implementation.
19. When the requirement is simple, you don't need to create a plan, just do it right away.
20. If the code exists, use the Editor tool's open and edit commands to modify it. Since it is not a new code, do not use write_new_code.
21. When using the editor, pay attention to the editor's current directory. When you use editor tools, the paths must be either absolute or relative to the editor's current directory.
"""
# Template describing the current editor and terminal state. It contains three
# str.format placeholders: editor_current_directory, editor_open_file and
# terminal_current_directory.
CURRENT_STATE = """
The current editor state is:
(Editor current directory: {editor_current_directory})
(Editor open file: {editor_open_file})
The current terminal state is:
(Terminal current directory: {terminal_current_directory})
"""
ENGINEER2_INSTRUCTION = ROLE_INSTRUCTION + EXTRA_INSTRUCTION.strip()
WRITE_CODE_SYSTEM_PROMPT = """
@ -35,7 +92,7 @@ You are a world-class engineer, your goal is to write google-style, elegant, mod
Pay attention to the conversation history and the following constraints:
1. When provided system design, YOU MUST FOLLOW "Data structures and interfaces". DONT CHANGE ANY DESIGN. Do not use public member functions that do not exist in your design.
2. When modifying a code, rewrite the full code instead of updating or inserting a snippet.
3. Write out EVERY CODE DETAIL, DON'T LEAVE TODO.
3. Write out EVERY CODE DETAIL, DON'T LEAVE TODO OR PLACEHOLDER.
"""
WRITE_CODE_PROMPT = """

View file

@ -71,6 +71,7 @@ Pay close attention to the Example provided, you can reuse the example for your
You may use any of the available commands to create a plan or update the plan. You may output mutiple commands, they will be executed sequentially.
If you finish current task, you will automatically take the next task in the existing plan, use Plan.finish_task, DON'T append a new task.
Review the latest plan's outcome, focusing on achievements. If your completed task matches the current, consider it finished.
Using Editor.insert_content_at_line and Editor.edit_file_by_replace more than once in the current command list is forbidden. Because the command is mutually exclusive and will change the line number after execution.
In your response, include at least one command.
# Your commands in a json array, in the following output format with correct command_name and args. If there is nothing to do, use the pass or end command:
@ -103,6 +104,7 @@ Fifth, describe if you should terminate, you should use **end** command to termi
REGENERATE_PROMPT = """
Review and reflect on the history carefully, provide a different response.
Describe if you should terminate using **end** command, or use **RoleZero.ask_human** to ask human for help, or try a different approach and output different commands. You are NOT allowed to provide the same commands again.
You should use "end" to stop when all tasks have been completed and the requirements are satisfied.
Your reflection, then the commands in a json array:
"""
ASK_HUMAN_COMMAND = """

View file

@ -24,7 +24,6 @@ Note:
- XL: Social media platform, e-commerce app, real-time multiplayer game
- For XS and S requirements, you don't need the standard software development process, you may directly ask Engineer to write the code. Otherwise, estimate if any part of the standard software development process may contribute to a better final code. If so, assign team members accordingly.
3.1 If the task involves code review (CR) or code checking, you should assign it to Engineer.
3.2. If the requirement is to fix a bug or issue, you should assign it to Issue Solver. However, if the code is written by Engineer, Engineer must maintain the code.
4. If the requirement is a common-sense, logical, or math problem, you should respond directly without assigning any task to team members.
5. If you think the requirement is not clear or ambiguous, you should ask the user for clarification immediately. Assign tasks only after all info is clear.
6. It is helpful for Engineer to have both the system design and the project schedule for writing the code, so include paths of both files (if available) and remind Engineer to definitely read them when publishing message to Engineer.
@ -43,7 +42,6 @@ Sixth, describe the requirements as they pertain to software development, data a
Seventh, describe the technologies you must use.
"""
)
TL_INFO = """
{role_info}
Your team member:

View file

@ -4,8 +4,11 @@ from pathlib import Path
from pydantic import Field
from metagpt.logs import logger
# from metagpt.actions.write_code_review import ValidateAndRewriteCode
from metagpt.prompts.di.engineer2 import (
CURRENT_STATE,
ENGINEER2_INSTRUCTION,
WRITE_CODE_PROMPT,
WRITE_CODE_SYSTEM_PROMPT,
@ -14,6 +17,7 @@ from metagpt.roles.di.role_zero import RoleZero
from metagpt.schema import UserMessage
from metagpt.strategy.experience_retriever import ENGINEER_EXAMPLE
from metagpt.tools.libs.cr import CodeReview
from metagpt.tools.libs.git import git_create_pull
from metagpt.tools.libs.terminal import Terminal
from metagpt.tools.tool_registry import register_tool
from metagpt.utils.common import CodeParser, awrite
@ -26,24 +30,69 @@ class Engineer2(RoleZero):
profile: str = "Engineer"
goal: str = "Take on game, app, and web development."
instruction: str = ENGINEER2_INSTRUCTION
terminal: Terminal = Field(default_factory=Terminal, exclude=True)
tools: list[str] = ["Plan", "Editor:read", "RoleZero", "Terminal:run_command", "Engineer2", "SearchEnhancedQA", "CodeReview"]
tools: list[str] = [
"Plan",
"Editor",
"RoleZero",
"Terminal:run_command",
"Browser:goto,scroll",
"git_create_pull",
"SearchEnhancedQA",
"Engineer2",
"CodeReview",
]
# SWE Agent parameter
run_eval: bool = False
output_diff: str = ""
max_react_loop: int = 40
async def _think(self) -> bool:
    """Rebuild the state text via ``_format_instruction``, then delegate to the parent ``_think``."""
    await self._format_instruction()
    return await super()._think()
async def _format_instruction(self):
    """
    Snapshot the editor and terminal state for the command prompt.

    The open file and working directory are read from ``self.editor``; the
    terminal directory is whatever ``pwd`` returns from ``self.terminal``.
    The rendered ``CURRENT_STATE`` text (stripped) is stored on
    ``self.cmd_prompt_current_state``.
    """
    editor = self.editor
    # Read the editor attributes before awaiting the terminal, as the original did.
    open_file = editor.current_file
    working_dir = editor.working_dir
    terminal_dir = await self.terminal.run_command("pwd")
    rendered = CURRENT_STATE.format(
        editor_open_file=open_file,
        editor_current_directory=working_dir,
        terminal_current_directory=terminal_dir,
    )
    self.cmd_prompt_current_state = rendered.strip()
def _update_tool_execution(self):
# validate = ValidateAndRewriteCode()
cr = CodeReview()
self.tool_execution_map.update(
{
"Terminal.run_command": self.terminal.run_command,
"Engineer2.write_new_code": self.write_new_code,
"CodeReview.review": cr.review,
"CodeReview.fix": cr.fix,
# "ValidateAndRewriteCode.run": validate.run,
# "ValidateAndRewriteCode": validate.run,
}
)
self.exclusive_tool_commands.append("Engineer2.write_new_code")
if self.run_eval is True:
# Evaluation tool map
self.tool_execution_map.update(
{
"git_create_pull": git_create_pull,
"Engineer2.write_new_code": self.write_new_code,
"CodeReview.review": cr.review,
"CodeReview.fix": cr.fix,
"Terminal.run_command": self._eval_terminal_run,
"RoleZero.ask_human": self._end,
"RoleZero.reply_to_human": self._end,
}
)
else:
# Default tool map
self.tool_execution_map.update(
{
"git_create_pull": git_create_pull,
"Engineer2.write_new_code": self.write_new_code,
"CodeReview.review": cr.review,
"CodeReview.fix": cr.fix,
"Terminal.run_command": self.terminal.run_command,
}
)
def _retrieve_experience(self) -> str:
    """Return the fixed ``ENGINEER_EXAMPLE`` text as this role's experience."""
    return ENGINEER_EXAMPLE
@ -82,3 +131,14 @@ class Engineer2(RoleZero):
# TODO: Consider adding line no to be ready for editing.
return f"The file {path} has been successfully created, with content:\n{code}"
async def _eval_terminal_run(self, cmd):
    """Run a terminal command in evaluation mode, intercepting submission commands.

    If ``cmd`` contains "pull", "push" or "commit", the command is NOT executed:
    the attempt is logged and the role state is set to -1 to stop the engineer.
    Any other command is forwarded to ``self.terminal.run_command``.

    Args:
        cmd: The shell command string requested by the agent.

    Returns:
        The output of ``self.terminal.run_command(cmd)``, or an empty string
        when the command was intercepted (previously this path had no defined
        return value).
    """
    if any(keyword in cmd for keyword in ("pull", "push", "commit")):
        # The Engineer2 attempts to submit the repository after fixing the bug,
        # thereby reaching the end of the fixing process.
        # The f-prefix is required so the actual command is interpolated into the log.
        logger.info(f"Engineer2 use cmd:{cmd}\nCurrent test case is finished.")
        # Set the state to -1 to stop the engineer (original note: sets self.rc.todo to None).
        self._set_state(-1)
        return ""
    return await self.terminal.run_command(cmd)

View file

@ -75,8 +75,15 @@ class RoleZero(Role):
tool_recommender: Optional[ToolRecommender] = None
tool_execution_map: Annotated[dict[str, Callable], Field(exclude=True)] = {}
special_tool_commands: list[str] = ["Plan.finish_current_task", "end", "Bash.run"]
# List of exclusive tool commands.
# If multiple instances of these commands appear, only the first occurrence will be retained.
exclusive_tool_commands: list[str] = [
"Editor.edit_file_by_replace",
"Editor.insert_content_at_line",
"Editor.append_file",
]
# Equipped with three basic tools by default for optional use
editor: Editor = Editor()
editor: Editor = Editor(enable_auto_lint=True)
browser: Browser = Browser()
# Experience
@ -152,7 +159,7 @@ class RoleZero(Role):
"scroll_up",
"search_dir",
"search_file",
"set_workdir",
# "set_workdir",
"write",
]
}
@ -233,10 +240,8 @@ class RoleZero(Role):
async with ThoughtReporter(enable_llm_stream=True) as reporter:
await reporter.async_report({"type": "react"})
self.command_rsp = await self.llm_cached_aask(req=req, system_msgs=[system_prompt], state_data=state_data)
self.command_rsp = await self._check_duplicates(req, self.command_rsp)
self._add_memory(AIMessage(content=self.command_rsp))
return True
@exp_cache(context_builder=RoleZeroContextBuilder(), serializer=RoleZeroSerializer())
@ -276,7 +281,8 @@ class RoleZero(Role):
if self.use_fixed_sop:
return await super()._act()
commands, ok = await self._parse_commands(self.command_rsp)
commands, ok, self.command_rsp = await self._parse_commands(self.command_rsp)
self._add_memory(AIMessage(content=self.command_rsp))
if not ok:
error_msg = commands
self._add_memory(UserMessage(content=error_msg))
@ -424,12 +430,27 @@ class RoleZero(Role):
tb = traceback.format_exc()
print(tb)
error_msg = str(e)
return error_msg, False
return error_msg, False, command_rsp
# Be tolerant of LLM output that does not follow the expected format
if isinstance(commands, dict):
commands = commands["commands"] if "commands" in commands else [commands]
return commands, True
# Flag each command: False marks an exclusive command, True a non-exclusive one.
command_flag = [command["command_name"] not in self.exclusive_tool_commands for command in commands]
if command_flag.count(False) > 1:
# Keep only the first exclusive command
index_of_first_exclusive = command_flag.index(False)
commands = [
cmd
for index, cmd in enumerate(commands)
if index == index_of_first_exclusive or cmd["command_name"] not in self.exclusive_tool_commands
]
command_rsp = "```json\n" + json.dumps(commands, indent=4, ensure_ascii=False) + "\n```json"
logger.info(
"exclusive command more than one in current command list. change the command list.\n" + command_rsp
)
return commands, True, command_rsp
async def _run_commands(self, commands) -> str:
outputs = []
@ -473,7 +494,9 @@ class RoleZero(Role):
if cmd["command_name"] == "Plan.finish_current_task":
if not self.planner.plan.is_plan_finished():
self.planner.plan.finish_current_task()
command_output = "Current task is finished. If all tasks are finished, use 'end' to stop."
command_output = (
"Current task is finished. If you no longer need to take action, use the command end to stop."
)
elif cmd["command_name"] == "end":
command_output = await self._end()
@ -488,6 +511,7 @@ class RoleZero(Role):
)
else:
command_output += f"\n[command]: {cmd['args']['cmd']} \n[command output] : {tool_output}"
return command_output
def _get_plan_status(self) -> Tuple[str, str]:
@ -536,7 +560,7 @@ class RoleZero(Role):
return "Not in MGXEnv, command will not be executed."
return await self.rc.env.reply_to_human(content, sent_from=self)
async def _end(self):
async def _end(self, **kwarg):
self._set_state(-1)
memory = self._fetch_memories()
# Ensure reply to the human before the "end" command is executed. Hard code k=5 for checking.

View file

@ -46,7 +46,6 @@ class SWEAgent(RoleZero):
async def _format_instruction(self):
"""
Formats the instruction message for the SWE agent.
Runs the "state" command in the terminal, parses its output as JSON,
and uses it to format the `_instruction` template.
"""
@ -63,10 +62,8 @@ class SWEAgent(RoleZero):
async def _parse_commands_for_eval(self):
"""
Handles actions based on parsed commands.
Parses commands, checks for a "submit" action, and generates a patch using `git diff`.
Stores the cleaned patch in `output_diff`. Logs any exceptions.
This function is specifically added for SWE bench evaluation.
"""
# If todo switches to None, it indicates that this is the final round of reactions, and the Swe-Agent will stop. Use git diff to store any changes made.
@ -79,7 +76,6 @@ class SWEAgent(RoleZero):
logger.info(f"Diff output: \n{clear_diff}")
if clear_diff:
self.output_diff = clear_diff
except Exception as e:
logger.error(f"Error during submission: {e}")

View file

@ -842,7 +842,7 @@ Explanation: I will first need to read the system design document and the projec
## example 2
Consider this example only after you have obtained the content of system design and project schedule documents.
Suppose the system design and project schedule prescribes three files index.html, style.css, script.js, to follow the design and schedule, I will create a plan consisting of three tasks, each corresponding to the creation of one of the required files: `index.html`, `style.css`, and `script.js`. Following the completion of these tasks, I will add a code review task for each file to ensure the implementation aligns with the provided system design and project schedule documents.
Suppose the system design and project schedule prescribes three files index.html, style.css, script.js, to follow the design and schedule, I will create a plan consisting of three tasks, each corresponding to the creation of one of the required files: `index.html`, `style.css`, and `script.js`.
Here's the plan:
@ -901,8 +901,132 @@ Explanation: Take on one task, such as writing a file. Upon completion, finish c
}
]
```
"""
## example 4
I have received a GitHub issue URL.
I will use browser to review the detailed information of this issue in order to understand the problem.
```json
[
{
"command_name": "Browser.goto",
"args": {
"url": "https://github.com/geekan/MetaGPT/issues/1275"
}
}
]
```
## example 6
I need to locating the `openai_api.py` file, so I will search for the `openai_api.py` file.
```json
[
{
"command_name": "Editor.find_file",
"args": {
"file_name": "openai_api.py"
}
}
]
```
## example 7
I have located the openai_api.py file. I want to edit this file, so I will open it first.
```json
[
{
"command_name": "Editor.open_file",
"args": {
"path": "/workspace/MetaGPT/provider/openai_api.py"
}
}
]
```
## example 8
I have opened the openai_api.py file. However, the range of lines shown is from 001 to 100, and I want to see more. Therefore, I want to use the scroll_down command to view additional lines.
```json
[
{
"command_name": "Editor.scroll_down",
"args": {{}}
}
]
```
## example 9
I've found the bug and will start fixing it. I'll pay close attention to the indentation.
Since I only need to modify a few lines in this file, I will use Editor.edit_file_by_replace. The original content will be replaced by the new code.
Editor tool is exclusive. If I use this tool, I cannot use any other commands in the current response.
```json
[
{
"command_name": "Editor.edit_file_by_replace",
"args": {
"file_name":"/workspace/MetaGPT/provider/openai_api.py",
"to_replace": " inv_trig_table = ["asin", "acos", "atan", "acot"]"
"new_content": " inv_trig_table = ["asin", "acos", "atan", "acsc", "asec", "acot"]"
}
}
]
```
## example 10
I only need to add a few lines to the file, so I will use Editor.insert_content_at_line. The new code will not cover the original code.
Note that the Editor command must be executed in a single response, so this step will only involve using the Editor command.
```json
[
{
"command_name": "Editor.insert_content_at_line",
"args": {
"file_name":"/workspace/MetaGPT/provider/openai_api.py"
"line_number":727,
"content": "if hasattr(self, '_print_' + func) and not isinstance(expr.func, UndefinedFunction):\\n return getattr(self, '_print_' + func)(expr, exp)"
}
}
]
```
## example 10.1
To enhance the functionality of the 2048 game, including game end detection and score tracking, we need to add these features to the existing game_2048.py file. First, we will add a score tracking feature, and then we will insert game end detection logic into the game loop.
We will use the Editor.insert_content_at_line command to insert new code into the file for adding score tracking and game end detection.
Since Editor.insert_content_at_line can only be used once per response, this time I will use it to create the variable self.score
```json
[
{
"command_name": "Editor.insert_content_at_line",
"args": {
"file_name": "/home/mgx/mgx/MetaGPT/workspace/2048_game_py/game_2048.py",
"line_number": 4,
"content": " self.score = 0\n"
}
}
]
```
In the next turn, I will try to add another code snippet
## example 11
Create a pull request (Optional): Merge the changes from the new branch into the master branch.
Thought: Now that the changes have been pushed to the remote repository, due to the user's requirement, let's create a pull request to merge the changes into the master branch.
```json
[
{
"command_name": "git_create_pull",
"args": {
"base": "master",
"head": "test-fix",
"base_repo_name": "garylin2099/MetaGPT",
"head_repo_name": "seeker-jie/MetaGPT",
"app_name": "github",
"title": "Fix Issue #1275: produced TypeError: openai.types.completion_usage.CompletionUsage() argument after ** must be a mapping, not NoneType"",
"body": "This pull request addresses issue #1275 by ensuring that chunk.usage is not None before passing it to CompletionUsage."
}
}
]
```
"""
WEB_SCRAPING_EXAMPLE = """
## action 1

View file

@ -3,31 +3,65 @@ This file is borrowed from OpenDevin
You can find the original repository here:
https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/plugins/agent_skills/file_ops/file_ops.py
"""
import base64
import os
import re
import shutil
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Union
from pydantic import BaseModel, ConfigDict
from metagpt.config2 import Config
from metagpt.const import DEFAULT_WORKSPACE_ROOT
from metagpt.logs import logger
from metagpt.tools.libs.index_repo import IndexRepo
from metagpt.tools.libs.linter import Linter
from metagpt.tools.tool_registry import register_tool
from metagpt.utils import read_docx
from metagpt.utils.common import aread, aread_bin, awrite_bin, check_http_endpoint
from metagpt.utils.repo_to_markdown import is_text_file
from metagpt.utils.file import File
from metagpt.utils.report import EditorReporter
# This is also used in unit tests!
MSG_FILE_UPDATED = "[File updated (edited at line {line_number}). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]"
LINTER_ERROR_MSG = "[Your proposed edit has introduced new syntax error(s). Please understand the errors and retry your edit command.]\n"
INDENTATION_INFO = """
The previous line is:
"{pre_line}"
The indentation has {pre_line_indent} spaces.
The error line is:
"{insert_line}"
The indentation has {insert_line_indent} spaces.
Please check the indentation of the code to ensure that it is not causing any errors.
Try using indentation with either {sub_4_space} or {add_4_space} spaces.
"""
ERROR_GUIDANCE = """
{linter_error_msg}
[This is how your edit would have looked if applied]
-------------------------------------------------
{window_after_applied}
-------------------------------------------------
[This is the original code before your edit]
-------------------------------------------------
{window_before_applied}
-------------------------------------------------
Your changes have NOT been applied. Please fix your edit command and try again
{guidance_message}
"""
SUCCESS_EDIT_INFO = """
[File: {file_name} ({n_total_lines} lines total after edit)]
{window_after_applied}
[File updated (edited at line {line_number}). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
"""
class FileBlock(BaseModel):
"""A block of content in a file"""
@ -70,23 +104,12 @@ class Editor(BaseModel):
async def read(self, path: str) -> FileBlock:
"""Read the whole content of a file. Using absolute paths as the argument for specifying the file location."""
is_text, mime_type = await is_text_file(path)
if is_text:
lines = await self._read_text(path)
elif mime_type == "application/pdf":
lines = await self._read_pdf(path)
elif mime_type in {
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.ms-word.document.macroEnabled.12",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
"application/vnd.ms-word.template.macroEnabled.12",
}:
lines = await self._read_docx(path)
else:
content = await File.read_text_file(path)
if not content:
return FileBlock(file_path=str(path), block_content="")
self.resource.report(str(path), "path")
lines = content.splitlines(keepends=True)
lines_with_num = [f"{i + 1:03}|{line}" for i, line in enumerate(lines)]
result = FileBlock(
file_path=str(path),
@ -94,80 +117,6 @@ class Editor(BaseModel):
)
return result
@staticmethod
async def _read_text(path: Union[str, Path]) -> List[str]:
    """Read a file with `aread` and return its content split on newline characters."""
    text = await aread(path)
    return text.split("\n")
@staticmethod
async def _read_pdf(path: Union[str, Path]) -> List[str]:
    """Read a PDF file as a list of text chunks.

    The OmniParse reader is tried first; when it yields nothing, the file is
    loaded with llama_index's `PDFReader` and the text of each loaded item is
    returned.
    """
    parsed = await Editor._omniparse_read_file(path)
    if parsed:
        return parsed

    # Imported lazily so the dependency is only needed on the fallback path.
    from llama_index.readers.file import PDFReader

    documents = PDFReader().load_data(file=Path(path))
    return [doc.text for doc in documents]
@staticmethod
async def _read_docx(path: Union[str, Path]) -> List[str]:
    """Read a Word document, preferring OmniParse and falling back to `read_docx`."""
    parsed = await Editor._omniparse_read_file(path)
    return parsed if parsed else read_docx(str(path))
@staticmethod
async def _omniparse_read_file(path: Union[str, Path]) -> Optional[List[str]]:
    """Parse a document with the OmniParse service when one is configured.

    The base URL and timeout are taken from the "OmniParse" environment
    settings first and from the default config second.

    Args:
        path: The document to parse.

    Returns:
        None when no base URL is configured, the endpoint check fails, or
        parsing raises. Otherwise a list whose first item is the parsed text,
        followed by one markdown image link per extracted image (the images
        are written to a `<file name with dots replaced>_images` directory
        beside the document). When there are no images, `[text]` is returned,
        or None if the text is empty.
    """
    from metagpt.tools.libs import get_env_default
    from metagpt.utils.omniparse_client import OmniParseClient

    env_base_url = await get_env_default(key="base_url", app_name="OmniParse", default_value="")
    env_timeout = await get_env_default(key="timeout", app_name="OmniParse", default_value="")
    conf_base_url, conf_timeout = await Editor._read_omniparse_config()

    base_url = env_base_url or conf_base_url
    if not base_url:
        return None

    api_key = await get_env_default(key="api_key", app_name="OmniParse", default_value="")
    try:
        timeout = int(env_timeout or conf_timeout or 600)
    except ValueError:
        # A non-numeric setting falls back to the default timeout.
        timeout = 600

    try:
        if not await check_http_endpoint(url=base_url):
            logger.warning(f"{base_url}: NOT AVAILABLE")
            return None
        client = OmniParseClient(api_key=api_key, base_url=base_url, max_timeout=timeout)
        file_data = await aread_bin(filename=path)
        ret = await client.parse_document(file_input=file_data, bytes_filename=str(path))
    except Exception as e:
        logger.exception(f"{path}: {e}")
        return None

    if not ret.images:
        return [ret.text] if ret.text else None

    source = Path(path)
    img_dir = source.parent / (source.name.replace(".", "_") + "_images")
    img_dir.mkdir(parents=True, exist_ok=True)
    result = [ret.text]
    for image in ret.images:
        byte_data = base64.b64decode(image.image)
        target = img_dir / image.image_name
        await awrite_bin(filename=target, data=byte_data)
        result.append(f"![{image.image_name}]({str(target)})")
    return result
@staticmethod
async def _read_omniparse_config() -> Tuple[str, int]:
    """Return `(url, timeout)` from the default config, or `("", 0)` when no OmniParse URL is set."""
    omniparse = Config.default().omniparse
    if not omniparse or not omniparse.url:
        return "", 0
    return omniparse.url, omniparse.timeout
@staticmethod
def _is_valid_filename(file_name: str) -> bool:
if not file_name or not file_name.strip():
@ -277,7 +226,7 @@ class Editor(BaseModel):
return ""
return f"[File: {current_file.resolve()} ({total_lines} lines total)]\n"
def set_workdir(self, path: str) -> None:
def _set_workdir(self, path: str) -> None:
"""
Sets the working directory to the given path. eg: repo directory.
You MUST to set it up before open the file.
@ -321,6 +270,7 @@ class Editor(BaseModel):
output = self._cur_file_header(path, total_lines)
output += self._print_window(path, self.current_line, self._clamp(context_lines, 1, 2000))
self.resource.report(path, "path")
return output
def goto_line(self, line_number: int) -> str:
@ -499,6 +449,25 @@ class Editor(BaseModel):
content = "".join(new_lines)
return content, n_added_lines
def _get_indentation_info(self, content, first_line):
    """Describe the indentation around the first edited line.

    Reports the indentation of the first inserted line and of the line just
    before it, along with guidance for the next attempt, by filling in the
    `INDENTATION_INFO` template.

    Args:
        content: The full file content, as one string, to inspect.
        first_line: 1-based number of the first inserted line.

    Returns:
        str: The formatted indentation report.
    """
    content_lines = content.split("\n")

    def line_at(index):
        # Out-of-range positions (including negative ones, which would
        # otherwise wrap around to the end of the file) are reported as an
        # empty line. This method is used while building error messages, so it
        # must not raise IndexError itself.
        return content_lines[index] if 0 <= index < len(content_lines) else ""

    pre_line = line_at(first_line - 2)
    pre_line_indent = len(pre_line) - len(pre_line.lstrip())
    insert_line = line_at(first_line - 1)
    insert_line_indent = len(insert_line) - len(insert_line.lstrip())
    ret_str = INDENTATION_INFO.format(
        pre_line=pre_line,
        pre_line_indent=pre_line_indent,
        insert_line=insert_line,
        insert_line_indent=insert_line_indent,
        sub_4_space=max(insert_line_indent - 4, 0),
        add_4_space=insert_line_indent + 4,
    )
    return ret_str
def _edit_file_impl(
self,
file_name: Path,
@ -518,7 +487,6 @@ class Editor(BaseModel):
is_insert: bool = False: Whether to insert content at the given line number instead of editing.
is_append: bool = False: Whether to append content to the file instead of editing.
"""
ret_str = ""
ERROR_MSG = f"[Error editing file {file_name}. Please confirm the file is correct.]"
ERROR_MSG_SUFFIX = (
@ -568,14 +536,12 @@ class Editor(BaseModel):
try:
content, n_added_lines = self._insert_impl(lines, start, content)
except LineNumberError as e:
ret_str += (f"{ERROR_MSG}\n" f"{e}\n" f"{ERROR_MSG_SUFFIX}") + "\n"
return ret_str
return (f"{ERROR_MSG}\n" f"{e}\n" f"{ERROR_MSG_SUFFIX}") + "\n"
else:
try:
content, n_added_lines = self._edit_impl(lines, start, end, content)
except LineNumberError as e:
ret_str += (f"{ERROR_MSG}\n" f"{e}\n" f"{ERROR_MSG_SUFFIX}") + "\n"
return ret_str
return (f"{ERROR_MSG}\n" f"{e}\n" f"{ERROR_MSG_SUFFIX}") + "\n"
if not content.endswith("\n"):
content += "\n"
@ -622,9 +588,11 @@ class Editor(BaseModel):
first_error_line = None
if lint_error is not None:
if first_error_line is not None:
show_line = int(first_error_line)
elif is_append:
# if first_error_line is not None:
# show_line = int(first_error_line)
# show the first insert line.
if is_append:
# original end-of-file
show_line = len(lines)
# insert OR edit WILL provide meaningful line numbers
@ -633,52 +601,52 @@ class Editor(BaseModel):
else:
raise ValueError("Invalid state. This should never happen.")
ret_str += LINTER_ERROR_MSG
ret_str += lint_error + "\n"
editor_lines = n_added_lines + 20
ret_str += "[This is how your edit would have looked if applied]\n"
ret_str += "-------------------------------------------------\n"
ret_str += self._print_window(file_name, show_line, editor_lines, return_str=True) + "\n"
ret_str += "-------------------------------------------------\n\n"
ret_str += "[This is the original code before your edit]\n"
ret_str += "-------------------------------------------------\n"
ret_str += (
self._print_window(
original_file_backup_path,
show_line,
editor_lines,
)
+ "\n"
)
ret_str += "-------------------------------------------------\n"
ret_str += (
"Your changes have NOT been applied. Please fix your edit command and try again.\n"
guidance_message = self._get_indentation_info(content, start or len(lines))
guidance_message += (
"You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.\n"
"DO NOT re-run the same failed edit command. Running it again will lead to the same error."
)
lint_error_info = ERROR_GUIDANCE.format(
linter_error_msg=LINTER_ERROR_MSG + lint_error,
window_after_applied=self._print_window(file_name, show_line, n_added_lines + 20),
window_before_applied=self._print_window(
original_file_backup_path, show_line, n_added_lines + 20
),
guidance_message=guidance_message,
).strip()
# recover the original file
with original_file_backup_path.open() as fin, file_name.open("w") as fout:
fout.write(fin.read())
original_file_backup_path.unlink()
return ret_str
return lint_error_info
except FileNotFoundError as e:
ret_str += f"File not found: {e}\n"
return f"File not found: {e}\n"
except IOError as e:
ret_str += f"An error occurred while handling the file: {e}\n"
return f"An error occurred while handling the file: {e}\n"
except ValueError as e:
ret_str += f"Invalid input: {e}\n"
return f"Invalid input: {e}\n"
except Exception as e:
guidance_message = self._get_indentation_info(content, start or len(lines))
guidance_message += (
"You either need to 1) Specify the correct start/end line arguments or 2) Enlarge the range of original code.\n"
"DO NOT re-run the same failed edit command. Running it again will lead to the same error."
)
error_info = ERROR_GUIDANCE.format(
linter_error_msg=LINTER_ERROR_MSG + str(e),
window_after_applied=self._print_window(file_name, start or len(lines), 40),
window_before_applied=self._print_window(original_file_backup_path, start or len(lines), 40),
guidance_message=guidance_message,
).strip()
# Clean up the temporary file if an error occurs
with original_file_backup_path.open() as fin, file_name.open("w") as fout:
fout.write(fin.read())
if temp_file_path and Path(temp_file_path).exists():
Path(temp_file_path).unlink()
logger.warning(f"An unexpected error occurred: {e}")
raise e
# logger.warning(f"An unexpected error occurred: {e}")
raise Exception(f"{error_info}") from e
# Update the file information and print the updated content
with file_name.open("r", encoding="utf-8") as file:
@ -690,11 +658,13 @@ class Editor(BaseModel):
self.current_line = max(1, len(lines)) # end of original file
else:
self.current_line = start or n_total_lines or 1
ret_str += f"[File: {file_name.resolve()} ({n_total_lines} lines total after edit)]\n"
CURRENT_FILE = file_name
ret_str += self._print_window(CURRENT_FILE, self.current_line, self.window) + "\n"
ret_str += MSG_FILE_UPDATED.format(line_number=self.current_line)
return ret_str
success_edit_info = SUCCESS_EDIT_INFO.format(
file_name=file_name.resolve(),
n_total_lines=n_total_lines,
window_after_applied=self._print_window(file_name, self.current_line, self.window),
line_number=self.current_line,
).strip()
return success_edit_info
def edit_file_by_replace(self, file_name: str, to_replace: str, new_content: str) -> str:
"""Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`.
@ -741,6 +711,10 @@ class Editor(BaseModel):
file_name: str: The name of the file to edit.
to_replace: str: The content to search for and replace.
new_content: str: The new content to replace the old content with.
NOTE:
This tool is exclusive. If you use this tool, you cannot use any other commands in the current response.
If you need to use it multiple times, wait for the next turn.
"""
# FIXME: support replacing *all* occurrences
if to_replace.strip() == "":
@ -792,6 +766,7 @@ class Editor(BaseModel):
)
# lint_error = bool(LINTER_ERROR_MSG in ret_str)
# TODO: automatically tries to fix linter error (maybe involve some static analysis tools on the location near the edit to figure out indentation)
self.resource.report(file_name, "path")
return ret_str
def insert_content_at_line(self, file_name: str, line_number: int, content: str) -> str:
@ -816,6 +791,9 @@ class Editor(BaseModel):
file_name: str: The name of the file to edit.
line_number: int: The line number (starting from 1) to insert the content after.
content: str: The content to insert.
NOTE:
This tool is exclusive. If you use this tool, you cannot use any other commands in the current response.
If you need to use it multiple times, wait for the next turn.
"""
file_name = self._try_fix_path(file_name)
@ -836,6 +814,9 @@ class Editor(BaseModel):
Args:
file_name: str: The name of the file to edit.
content: str: The content to insert.
NOTE:
This tool is exclusive. If you use this tool, you cannot use any other commands in the current response.
If you need to use it multiple times, wait for the next turn.
"""
file_name = self._try_fix_path(file_name)
@ -914,6 +895,9 @@ class Editor(BaseModel):
res_list.append(f'[End of matches for "{search_term}" in {file_path}]')
else:
res_list.append(f'[No matches found for "{search_term}" in {file_path}]')
extra = {"type": "search", "symbol": search_term, "lines": [i[0] - 1 for i in matches]} if matches else None
self.resource.report(file_path, "path", extra=extra)
return "\n".join(res_list)
def find_file(self, file_name: str, dir_path: str = "./") -> str:
@ -951,3 +935,21 @@ class Editor(BaseModel):
if not path.is_absolute():
path = self.working_dir / path
return path
@staticmethod
async def search_index_repo(query: str, file_or_path: Union[str, Path]) -> List[str]:
    """Search the index repositories covering a file or directory for a query.

    This is a thin wrapper that delegates to `IndexRepo.cross_repo_search`.

    Args:
        query (str): The search query string.
        file_or_path (Union[str, Path]): A file or directory path to search within.

    Returns:
        List[str]: The search results as strings, exactly as returned by
            `IndexRepo.cross_repo_search`.
    """
    hits = await IndexRepo.cross_repo_search(query=query, file_or_path=file_or_path)
    return hits

View file

@ -1,9 +1,10 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import asyncio
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set, Union
from typing import Dict, List, Optional, Set, Tuple, Union
import tiktoken
from llama_index.core.base.embeddings.base import BaseEmbedding
@ -16,7 +17,23 @@ from metagpt.rag.engines import SimpleEngine
from metagpt.rag.factories.embedding import RAGEmbeddingFactory
from metagpt.rag.schema import FAISSIndexConfig, FAISSRetrieverConfig, LLMRankerConfig
from metagpt.utils.common import aread, awrite, generate_fingerprint, list_files
from metagpt.utils.repo_to_markdown import is_text_file
from metagpt.utils.file import File
# Persist path of the index repo for uploaded files; also the default persist path.
UPLOADS_INDEX_ROOT = "/data/.index/uploads"
DEFAULT_INDEX_ROOT = UPLOADS_INDEX_ROOT
# Root directory of the uploaded files; also the default root path of an index repo.
UPLOAD_ROOT = "/data/uploads"
DEFAULT_ROOT = UPLOAD_ROOT
# Per-chat index repos are persisted under CHATS_INDEX_ROOT/<chat_id>, and
# their files live under CHATS_ROOT/<chat_id> (see `find_index_repo_path`).
CHATS_INDEX_ROOT = "/data/.index/chats"
CHATS_ROOT = "/data/chats/"
# Cluster key used by `find_index_repo_path` for files matching no known root.
OTHER_TYPE = "other"
# Default token-count bounds used by `IndexRepo` when deciding whether a file is buildable.
DEFAULT_MIN_TOKEN_COUNT = 10000
DEFAULT_MAX_TOKEN_COUNT = 100000000
class IndexRepoMeta(BaseModel):
    """Token-count bounds of an index repo, as saved to and loaded from its meta file."""

    min_token_count: int  # lower token-count bound stored with the index
    max_token_count: int  # upper token-count bound stored with the index
class TextScore(BaseModel):
@ -26,12 +43,15 @@ class TextScore(BaseModel):
class IndexRepo(BaseModel):
persist_path: str # The persist path of the index repo, {DEFAULT_WORKSPACE_ROOT}/.index/{chat_id or 'uploads'}/
root_path: str # `/data/uploads` or r`/data/chats/\d+`, the root path of files indexed by the index repo.
persist_path: str = DEFAULT_INDEX_ROOT # The persist path of the index repo, `/data/.index/uploads/` or `/data/.index/chats/{chat_id}/`
root_path: str = (
DEFAULT_ROOT # `/data/uploads` or r`/data/chats/\d+`, the root path of files indexed by the index repo.
)
fingerprint_filename: str = "fingerprint.json"
meta_filename: str = "meta.json"
model: Optional[str] = None
min_token_count: int = 10000
max_token_count: int = 100000000
min_token_count: int = DEFAULT_MIN_TOKEN_COUNT
max_token_count: int = DEFAULT_MAX_TOKEN_COUNT
recall_count: int = 5
embedding: Optional[BaseEmbedding] = Field(default=None, exclude=True)
fingerprints: Dict[str, str] = Field(default_factory=dict)
@ -65,16 +85,21 @@ class IndexRepo(BaseModel):
"""
encoding = tiktoken.get_encoding("cl100k_base")
result: List[Union[NodeWithScore, TextScore]] = []
filenames, _ = await self._filter(filenames)
filenames, excludes = await self._filter(filenames)
if not filenames:
raise ValueError(f"Unsupported file types: {[str(i) for i in excludes]}")
filter_filenames = set()
meta = await self._read_meta()
for i in filenames:
content = await aread(filename=i)
content = await File.read_text_file(i)
token_count = len(encoding.encode(content))
if not self._is_buildable(token_count):
if not self._is_buildable(
token_count, min_token_count=meta.min_token_count, max_token_count=meta.max_token_count
):
result.append(TextScore(filename=str(i), text=content))
continue
file_fingerprint = generate_fingerprint(content)
if self.fingerprints.get(str(i)) != file_fingerprint:
if self.fingerprints.get(str(i)) != file_fingerprint and Path(i).suffix.lower() not in {".pdf"}:
logger.error(f'file: "{i}" changed but not indexed')
continue
filter_filenames.add(str(i))
@ -93,6 +118,10 @@ class IndexRepo(BaseModel):
Returns:
List[Union[NodeWithScore, TextScore]]: A list of merged results sorted by similarity.
"""
flat_nodes = [node for indices in indices_list if indices for node in indices if node]
if len(flat_nodes) <= self.recall_count:
return flat_nodes
if not self.embedding:
config = Config.default()
if self.model:
@ -102,7 +131,6 @@ class IndexRepo(BaseModel):
scores = []
query_embedding = await self.embedding.aget_text_embedding(query)
flat_nodes = [node for indices in indices_list for node in indices]
for i in flat_nodes:
text_embedding = await self.embedding.aget_text_embedding(i.text)
similarity = self.embedding.similarity(query_embedding, text_embedding)
@ -121,7 +149,7 @@ class IndexRepo(BaseModel):
filter_filenames = []
delete_filenames = []
for i in filenames:
content = await aread(filename=i)
content = await File.read_text_file(i)
if not self._is_fingerprint_changed(filename=i, content=content):
continue
token_count = len(encoding.encode(content))
@ -169,10 +197,11 @@ class IndexRepo(BaseModel):
logger.debug(f"add docs {filenames}")
engine.persist(persist_dir=self.persist_path)
for i in filenames:
content = await aread(i)
content = await File.read_text_file(i)
fp = generate_fingerprint(content)
self.fingerprints[str(i)] = fp
await awrite(filename=Path(self.persist_path) / self.fingerprint_filename, data=json.dumps(self.fingerprints))
await self._save_meta()
def __str__(self):
"""Return a string representation of the IndexRepo.
@ -182,7 +211,7 @@ class IndexRepo(BaseModel):
"""
return f"{self.persist_path}"
def _is_buildable(self, token_count: int) -> bool:
def _is_buildable(self, token_count: int, min_token_count: int = -1, max_token_count=-1) -> bool:
"""Check if the token count is within the buildable range.
Args:
@ -191,7 +220,9 @@ class IndexRepo(BaseModel):
Returns:
bool: True if buildable, False otherwise.
"""
if token_count < self.min_token_count or token_count > self.max_token_count:
min_token_count = min_token_count if min_token_count >= 0 else self.min_token_count
max_token_count = max_token_count if max_token_count >= 0 else self.max_token_count
if token_count < min_token_count or token_count > max_token_count:
return False
return True
@ -216,13 +247,13 @@ class IndexRepo(BaseModel):
logger.debug(f"{path} not is_relative_to {root_path})")
continue
if not path.is_dir():
is_text, _ = await is_text_file(path)
is_text = await File.is_textual_file(path)
if is_text:
pathnames.append(path)
continue
subfiles = list_files(path)
for j in subfiles:
is_text, _ = await is_text_file(j)
is_text = await File.is_textual_file(j)
if is_text:
pathnames.append(j)
@ -240,7 +271,7 @@ class IndexRepo(BaseModel):
List[NodeWithScore]: A list of nodes with scores matching the query.
"""
if not Path(self.persist_path).exists():
return []
raise ValueError(f"IndexRepo {Path(self.persist_path).name} not exists.")
engine = SimpleEngine.from_index(
index_config=FAISSIndexConfig(persist_path=self.persist_path), retriever_configs=[FAISSRetrieverConfig()]
)
@ -262,3 +293,114 @@ class IndexRepo(BaseModel):
return True
fp = generate_fingerprint(content)
return old_fp != fp
@staticmethod
def find_index_repo_path(files: List[Union[str, Path]]) -> Tuple[Dict[str, Set[Path]], Dict[str, str]]:
    """Map the file paths to their corresponding index repos.

    Every input is converted to an absolute path, and that absolute path is
    used both for classification and for extracting the chat id, so relative
    inputs are classified the same way as their absolute form.

    Args:
        files (List[Union[str, Path]]): A list of file paths or Path objects to be classified.

    Returns:
        Tuple[Dict[str, Set[Path]], Dict[str, str]]:
            - A dictionary mapping the index repo path (or `OTHER_TYPE` for
              files outside the known roots) to the absolute file paths.
            - A dictionary mapping the index repo path to its root directory.
              `OTHER_TYPE` has no entry here.
    """
    mappings = {
        UPLOADS_INDEX_ROOT: re.compile(r"^/data/uploads($|/.*)"),
        CHATS_INDEX_ROOT: re.compile(r"^/data/chats/\d+($|/.*)"),
    }

    clusters: Dict[str, Set[Path]] = {}
    roots: Dict[str, str] = {}
    for i in files:
        path = Path(i).absolute()
        path_str = str(path)
        path_type = OTHER_TYPE
        for type_, pattern in mappings.items():
            if pattern.match(path_str):
                path_type = type_
                break
        if path_type == CHATS_INDEX_ROOT:
            # For "/data/chats/<chat_id>/..." the parts are
            # ("/", "data", "chats", "<chat_id>", ...).
            chat_id = path.parts[3]
            path_type = str(Path(path_type) / chat_id)
            roots[path_type] = str(Path(CHATS_ROOT) / chat_id)
        elif path_type == UPLOADS_INDEX_ROOT:
            roots[path_type] = UPLOAD_ROOT

        clusters.setdefault(path_type, set()).add(path)

    return clusters, roots
async def _save_meta(self):
    """Write the current token-count bounds as JSON to the meta file under the persist path."""
    snapshot = IndexRepoMeta(min_token_count=self.min_token_count, max_token_count=self.max_token_count)
    meta_path = Path(self.persist_path) / self.meta_filename
    await awrite(filename=meta_path, data=snapshot.model_dump_json())
async def _read_meta(self) -> IndexRepoMeta:
    """Load the saved token-count bounds of this index repo.

    Returns:
        IndexRepoMeta: The content of the meta file, or a meta built from this
            instance's own bounds when the file is missing or cannot be
            validated (a warning is logged in the latter case).
    """
    fallback = IndexRepoMeta(min_token_count=self.min_token_count, max_token_count=self.max_token_count)
    meta_path = Path(self.persist_path) / self.meta_filename
    if not meta_path.exists():
        return fallback

    raw = await aread(filename=meta_path)
    try:
        return IndexRepoMeta.model_validate_json(raw)
    except Exception as e:
        logger.warning(f"Load meta error: {e}")
        return fallback
@staticmethod
async def cross_repo_search(query: str, file_or_path: Union[str, Path]) -> List[str]:
    """Search for a query across the index repos that cover a file or directory.

    The files under `file_or_path` are grouped by index repo with
    `find_index_repo_path`. Each group is searched through its own
    `IndexRepo`; files that belong to no index repo are read in full with
    `File.read_text_file` instead. All of these run concurrently, and the
    per-repo results are then combined by `IndexRepo.merge`.

    Args:
        query (str): The search term to look for in the files.
        file_or_path (Union[str, Path]): The path to the file or directory
            where the search should be conducted. This can be a string path
            or a Path object.

    Returns:
        List[str]: The texts of the merged search results, followed by the
            full text of every non-indexed file that was read as a string.

    Raises:
        ValueError: If `file_or_path` is empty or does not exist. Exceptions
            raised by the individual searches or reads are not caught here.
    """
    if not file_or_path or not Path(file_or_path).exists():
        raise ValueError(f'"{str(file_or_path)}" not exists')

    files = [file_or_path] if not Path(file_or_path).is_dir() else list_files(file_or_path)
    clusters, roots = IndexRepo.find_index_repo_path(files)
    futures = []
    others = set()
    for persist_path, filenames in clusters.items():
        if persist_path == OTHER_TYPE:
            # Files outside any index repo are read directly further below.
            others.update(filenames)
            continue
        root = roots[persist_path]
        repo = IndexRepo(persist_path=persist_path, root_path=root)
        futures.append(repo.search(query=query, filenames=list(filenames)))

    for i in others:
        futures.append(File.read_text_file(i))

    futures_results = []
    if futures:
        futures_results = await asyncio.gather(*futures)

    result = []
    v_result = []
    for i in futures_results:
        # Strings are raw file contents; everything else is passed on to merge.
        if isinstance(i, str):
            result.append(i)
        else:
            v_result.append(i)

    repo = IndexRepo()
    merged = await repo.merge(query=query, indices_list=v_result)
    return [i.text for i in merged] + result

View file

@ -6,13 +6,19 @@
@File : file.py
@Describe : General file operations.
"""
import base64
from pathlib import Path
from typing import Optional, Tuple, Union
import aiofiles
from fsspec.implementations.memory import MemoryFileSystem as _MemoryFileSystem
from metagpt.config2 import Config
from metagpt.logs import logger
from metagpt.utils import read_docx
from metagpt.utils.common import aread, aread_bin, awrite_bin, check_http_endpoint
from metagpt.utils.exceptions import handle_exception
from metagpt.utils.repo_to_markdown import is_text_file
class File:
@ -70,6 +76,125 @@ class File:
logger.debug(f"Successfully read file, the path of file: {file_path}")
return content
@staticmethod
async def is_textual_file(filename: Union[str, Path]) -> bool:
    """Tell whether a file can be treated as a textual document.

    A file qualifies when `is_text_file` reports it as text, or when its MIME
    type is PDF or one of the Microsoft Word document/template types.

    Args:
        filename (Union[str, Path]): The path to the file to be checked.

    Returns:
        bool: True if the file is a textual file, False otherwise.
    """
    word_mime_types = {
        "application/msword",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-word.document.macroEnabled.12",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
        "application/vnd.ms-word.template.macroEnabled.12",
    }
    is_text, mime_type = await is_text_file(filename)
    return bool(is_text) or mime_type == "application/pdf" or mime_type in word_mime_types
@staticmethod
async def read_text_file(filename: Union[str, Path]) -> Optional[str]:
    """Read the whole content of a file. Using absolute paths as the argument for specifying the file location.

    Plain-text files are read as-is, PDF files go through the PDF reader and
    Microsoft Word documents/templates through the docx reader.

    Returns:
        Optional[str]: The file content, or None for any other file type.
    """
    word_mime_types = {
        "application/msword",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-word.document.macroEnabled.12",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
        "application/vnd.ms-word.template.macroEnabled.12",
    }
    is_text, mime_type = await is_text_file(filename)
    if is_text:
        reader = File._read_text
    elif mime_type == "application/pdf":
        reader = File._read_pdf
    elif mime_type in word_mime_types:
        reader = File._read_docx
    else:
        return None
    return await reader(filename)
@staticmethod
async def _read_text(path: Union[str, Path]) -> str:
    """Return the content of a text file as read by `aread`."""
    content = await aread(path)
    return content
@staticmethod
async def _read_pdf(path: Union[str, Path]) -> str:
    """Read a PDF file as a single string.

    The OmniParse reader is tried first; when it yields nothing, the file is
    loaded with llama_index's `PDFReader` and the texts of the loaded items
    are joined with newlines.
    """
    parsed = await File._omniparse_read_file(path)
    if parsed:
        return parsed

    # Imported lazily so the dependency is only needed on the fallback path.
    from llama_index.readers.file import PDFReader

    documents = PDFReader().load_data(file=Path(path))
    return "\n".join(doc.text for doc in documents)
@staticmethod
async def _read_docx(path: Union[str, Path]) -> str:
    """Read a Word document as one string, preferring OmniParse and falling back to `read_docx`."""
    parsed = await File._omniparse_read_file(path)
    return parsed if parsed else "\n".join(read_docx(str(path)))
@staticmethod
async def _omniparse_read_file(path: Union[str, Path], auto_save_image: bool = False) -> Optional[str]:
    """Parse a document with the OmniParse service when one is configured.

    The base URL and timeout are taken from the "OmniParse" environment
    settings first and from the default config second.

    Args:
        path: The document to parse.
        auto_save_image: When True and the parse result contains images, the
            images are written to a `<file name with dots replaced>_images`
            directory beside the document and markdown links to them are
            appended to the text.

    Returns:
        None when no base URL is configured, the endpoint check fails, or
        parsing raises; otherwise the parsed text, optionally followed by one
        markdown image link per line.
    """
    from metagpt.tools.libs import get_env_default
    from metagpt.utils.omniparse_client import OmniParseClient

    env_base_url = await get_env_default(key="base_url", app_name="OmniParse", default_value="")
    env_timeout = await get_env_default(key="timeout", app_name="OmniParse", default_value="")
    conf_base_url, conf_timeout = await File._read_omniparse_config()

    base_url = env_base_url or conf_base_url
    if not base_url:
        return None

    api_key = await get_env_default(key="api_key", app_name="OmniParse", default_value="")
    try:
        timeout = int(env_timeout or conf_timeout or 600)
    except ValueError:
        # A non-numeric setting falls back to the default timeout.
        timeout = 600

    try:
        if not await check_http_endpoint(url=base_url):
            logger.warning(f"{base_url}: NOT AVAILABLE")
            return None
        client = OmniParseClient(api_key=api_key, base_url=base_url, max_timeout=timeout)
        file_data = await aread_bin(filename=path)
        ret = await client.parse_document(file_input=file_data, bytes_filename=str(path))
    except Exception as e:
        logger.exception(f"{path}: {e}")
        return None

    if not (ret.images and auto_save_image):
        return ret.text

    source = Path(path)
    img_dir = source.parent / (source.name.replace(".", "_") + "_images")
    img_dir.mkdir(parents=True, exist_ok=True)
    pieces = [ret.text]
    for image in ret.images:
        byte_data = base64.b64decode(image.image)
        target = img_dir / image.image_name
        await awrite_bin(filename=target, data=byte_data)
        pieces.append(f"![{image.image_name}]({str(target)})")
    return "\n".join(pieces)
@staticmethod
async def _read_omniparse_config() -> Tuple[str, int]:
    """Return `(url, timeout)` from the default config, or `("", 0)` when no OmniParse URL is set."""
    omniparse = Config.default().omniparse
    if not omniparse or not omniparse.url:
        return "", 0
    return omniparse.url, omniparse.timeout
class MemoryFileSystem(_MemoryFileSystem):
@classmethod

View file

@ -8,7 +8,6 @@ from metagpt.environment.mgx.mgx_env import MGXEnv
from metagpt.roles import Architect, Engineer, ProductManager, ProjectManager
from metagpt.roles.di.data_analyst import DataAnalyst
from metagpt.roles.di.engineer2 import Engineer2
from metagpt.roles.di.swe_agent import SWEAgent
from metagpt.roles.di.team_leader import TeamLeader
from metagpt.schema import Message
@ -29,7 +28,6 @@ async def main(requirement="", enable_human_input=False, use_fixed_sop=False, al
engineer,
# QaEngineer(),
DataAnalyst(),
SWEAgent(),
]
)

View file

@ -1,16 +1,23 @@
import argparse
import asyncio
import json
import os
import shutil
import sys
from datetime import datetime
from pathlib import Path
from metagpt.config2 import Config
from metagpt.const import DEFAULT_WORKSPACE_ROOT, METAGPT_ROOT
from metagpt.logs import logger
from metagpt.roles.di.swe_agent import SWEAgent
from metagpt.roles.di.engineer2 import Engineer2
from metagpt.tools.libs.editor import Editor
from metagpt.tools.libs.terminal import Terminal
from metagpt.tools.swe_agent_commands.swe_agent_utils import load_hf_dataset
config = Config.default()
# Specify by yourself
GLOBAL_TERMINAL = Terminal()
TEST_REPO_DIR = METAGPT_ROOT / "data" / "test_repo"
DATA_DIR = METAGPT_ROOT / "data/hugging_face"
@ -51,20 +58,61 @@ def check_instance_status(instance, swe_result_dir):
return True
async def run(instance, swe_result_dir):
async def terminal_run_command(cmd):
    """Run a command on the shared `GLOBAL_TERMINAL`, log it with its output, and return the output."""
    output = await GLOBAL_TERMINAL.run_command(cmd)
    logger.info(f"command:{cmd} output:\n {output}")
    return output
async def refresh_repo(instance, test_repo_dir, reclone_existing_repo=False):
    """Prepare a local checkout of the repository that an instance refers to.

    An existing checkout is reset and cleaned, then switched to the remote's
    HEAD branch; otherwise the repository is cloned from GitHub and, when the
    instance provides one, its base commit is checked out.

    Args:
        instance: A mapping with at least the keys "repo", "version" and
            "base_commit".
        test_repo_dir: Directory under which the repositories are kept.
        reclone_existing_repo: When True, an existing checkout is deleted and
            cloned again instead of being reset.

    Returns:
        Path: The path of the prepared repository.
    """
    repo_path = Path(test_repo_dir) / (
        instance["repo"].replace("-", "_").replace("/", "__") + "_" + instance["version"]
    )
    repo_identifier = instance["repo"]
    base_commit = instance["base_commit"]
    if os.path.exists(repo_path) and reclone_existing_repo is True:
        logger.info(f"remove exist repo path:{repo_path}")
        shutil.rmtree(repo_path)

    if os.path.exists(repo_path):
        logger.info(f"reset exist repo path:{repo_path}")
        await terminal_run_command(f"cd {repo_path} && git reset --hard && git clean -n -d && git clean -f -d")
        await terminal_run_command("BRANCH=$(git remote show origin | awk '/HEAD branch/ {print $NF}')")
        await terminal_run_command("echo $BRANCH")
        await terminal_run_command('git checkout "$BRANCH"')
    else:
        logger.info(f"clone repo to path:{repo_path}")
        clone_command = f"git clone 'https://github.com/{repo_identifier}.git' {repo_path}"
        # Always enter the fresh clone so the commands below run inside it,
        # and check out the base commit only when one is given. The commit
        # must be interpolated here: previously this part was a plain string,
        # so git received the literal text "{base_commit}".
        checkout_command = f"cd {repo_path}"
        if base_commit:
            checkout_command += f" && git checkout -f {base_commit}"
        await terminal_run_command(clone_command)
        await terminal_run_command(checkout_command)

    await terminal_run_command("git branch")
    # ignore backup file
    await terminal_run_command("echo '.backup.*' >> .gitignore")
    return repo_path
async def get_git_diff():
    """Stage every change and return the staged diff.

    Returns:
        The output of `git diff --cached`, or an empty string when either git
        command raises (the error is logged).
    """
    try:
        await terminal_run_command("git add -A")
        return await terminal_run_command("git diff --cached")
    except Exception as e:
        logger.error(f"Error during submission: {e}")
        return ""
async def run(instance, swe_result_dir, args):
if not check_instance_status(instance, swe_result_dir):
logger.info(f"Instance {instance['instance_id']} already exists, skipping execution.")
return
repo_path = TEST_REPO_DIR / (instance["repo"].replace("-", "_").replace("/", "__") + "_" + instance["version"])
# 前处理
terminal = Terminal()
await terminal.run_command(f"cd {repo_path} && git reset --hard && git clean -n -d && git clean -f -d")
await terminal.run_command("BRANCH=$(git remote show origin | awk '/HEAD branch/ {print $NF}')")
logger.info(await terminal.run_command("echo $BRANCH"))
logger.info(await terminal.run_command('git checkout "$BRANCH"'))
logger.info(await terminal.run_command("git branch"))
# preparation for the repo
logger.info(f"**** Preparing to run {instance['instance_id']}****")
test_repo_dir = args.test_repo_dir
repo_path = await refresh_repo(instance, test_repo_dir, args.reclone_existing_repo)
user_requirement_and_issue = INSTANCE_TEMPLATE.format(
issue=instance["problem_statement"],
@ -75,18 +123,22 @@ async def run(instance, swe_result_dir):
)
logger.info(f"**** Starting to run {instance['instance_id']}****")
swe_agent = SWEAgent()
swe_agent.run_eval = True
await swe_agent.run(user_requirement_and_issue)
save_predictions(swe_agent, instance, swe_result_dir)
logger.info("User Requirement", user_requirement_and_issue)
try:
engineer = Engineer2(run_eval=True, editor=Editor(enable_auto_lint=True))
await asyncio.wait_for(engineer.run(user_requirement_and_issue), timeout=args.max_wait_time_per_case * 60)
except Exception as e:
logger.warning(f"**** exception lead to end: {instance['instance_id']}****\n\nerror:{e}")
# save the difference of repo
await save_predictions(engineer, instance, swe_result_dir)
logger.info(f"**** Finished running {instance['instance_id']}****")
def save_predictions(swe_agent: SWEAgent, instance, swe_result_dir):
async def save_predictions(engineer, instance, swe_result_dir):
output_file = swe_result_dir / "all_preds.jsonl"
instance["model_name_or_path"] = swe_agent.config.llm.model
instance["model_patch"] = swe_agent.output_diff
instance["model_name_or_path"] = engineer.config.llm.model
instance["model_patch"] = await get_git_diff()
logger.info(f"'model_patch':\n{instance['model_patch']}")
logger.info(f"Preparing to save predictions to {output_file}")
# Save the predictions to a JSONL file
@ -96,19 +148,61 @@ def save_predictions(swe_agent: SWEAgent, instance, swe_result_dir):
logger.info(f"Saved prediction of {instance['instance_id']} to {output_file}")
async def async_main(args):
    """Run every instance of the benchmark dataset, logging each one to its own file.

    Args:
        args: Parsed command-line namespace. ``save_folder`` is read here; the whole
            namespace is forwarded unchanged to ``run``.
    """
    dataset_path = "manna-ai/SWE-bench_Nano"  # "princeton-nlp/SWE-bench_Lite" #"manna-ai/SWE-bench_Nano"
    dataset = load_hf_dataset(dataset_name_or_path=dataset_path, cache_dir=DATA_DIR, split="test")

    swe_result_dir = Path(args.save_folder)
    if swe_result_dir.exists():
        logger.info(f"{swe_result_dir} exists; resuming test from last checkpoint.")
    swe_result_dir.mkdir(parents=True, exist_ok=True)

    for position, instance in enumerate(dataset, start=1):
        # switch to a new logger file: reset the sinks, then log INFO to stderr
        # and DEBUG to a per-instance file under <save_folder>/logs
        logger.remove()
        logger.add(sys.stderr, level="INFO")
        logger.add(swe_result_dir / "logs" / f"{position}_{instance['instance_id']}.log", level="DEBUG")
        await run(instance, swe_result_dir, args)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="the argument of scripts")

    # Defaults: repositories go to TEST_REPO_DIR, results to a timestamped folder
    # under the workspace root named after the configured model.
    swe_result_dir = (
        DEFAULT_WORKSPACE_ROOT
        / f"result_{config.llm.model.replace('/', '_')}_start_time_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}"
    )
    test_repo_dir = TEST_REPO_DIR.absolute()
    swe_result_dir = swe_result_dir.absolute()

    # Add arguments
    parser.add_argument(
        "-rw", "--test_repo_dir", default=test_repo_dir, help="The directory to save temporary repositories", type=str
    )
    parser.add_argument("-s", "--save_folder", default=swe_result_dir, help="Folder to save results and logs", type=str)
    parser.add_argument(
        "-mwtc",
        "--max_wait_time_per_case",
        # Without a default the value is None and the timeout computation
        # (`max_wait_time_per_case * 60`) fails for every case; 10 minutes
        # matches the documented example invocations.
        default=10,
        help="Maximum wait time allowed per test case (in minutes)",
        type=int,
    )
    parser.add_argument(
        "-o",
        "--reclone_existing_repo",
        action="store_true",
        help="If set, the existing repository will be removed and recloned.",
    )

    # Parse command-line arguments
    args = parser.parse_args()
    asyncio.run(async_main(args))
"""
#
python tests/metagpt/roles/di/run_swe_agent_for_benchmark.py \
--test_repo_dir "./data/test_repo" \
--save_folder "./workspace/deepseek_coder_0907" \
--max_wait_time_per_case 10
"""
"""
# 重新克隆仓库
python tests/metagpt/roles/di/run_swe_agent_for_benchmark.py \
--test_repo_dir "./data/test_repo" \
--save_folder "./workspace/deepseek_coder_0907" \
--max_wait_time_per_case 10 \
--reclone_existing_repo
"""

View file

@ -1,7 +1,19 @@
import os
import shutil
from pathlib import Path
import pytest
from metagpt.const import TEST_DATA_PATH
from metagpt.tools.libs.editor import Editor
from metagpt.tools.libs.index_repo import (
CHATS_INDEX_ROOT,
CHATS_ROOT,
UPLOAD_ROOT,
UPLOADS_INDEX_ROOT,
IndexRepo,
)
from metagpt.utils.common import list_files
TEST_FILE_CONTENT = """
# this is line one
@ -645,5 +657,54 @@ def test_append_to_single_empty_line_file():
assert n_added_lines == 1
async def mock_index_repo():
    """Copy the sample requirement documents into the chat and upload roots and index them.

    Returns:
        A tuple ``(chat_path, UPLOAD_ROOT, src_path)``: the chat folder that was
        populated, the upload root, and the original sample-data folder.
    """
    indexable_suffixes = {".md", ".txt", ".json", ".pdf"}

    def indexable_files(root):
        # Keep only the document types the tests feed to the index.
        return [name for name in list_files(root) if Path(name).suffix in indexable_suffixes]

    src_path = TEST_DATA_PATH / "requirements"

    # Chat-scoped repository.
    chat_id = "1"
    chat_path = Path(CHATS_ROOT) / chat_id
    chat_path.mkdir(parents=True, exist_ok=True)
    # Same layout as `cp -rf src dst` with an existing dst: the source folder is
    # placed inside the destination. copytree avoids the shell, so paths with
    # spaces work and a failed copy raises instead of being ignored.
    shutil.copytree(src_path, chat_path / src_path.name, dirs_exist_ok=True)
    chat_files = indexable_files(chat_path)
    chat_repo = IndexRepo(
        persist_path=str(Path(CHATS_INDEX_ROOT) / chat_id), root_path=str(chat_path), min_token_count=0
    )
    await chat_repo.add(chat_files)
    assert chat_files

    # Upload-scoped repository.
    Path(UPLOAD_ROOT).mkdir(parents=True, exist_ok=True)
    shutil.copytree(src_path, Path(UPLOAD_ROOT) / src_path.name, dirs_exist_ok=True)
    uploads_files = indexable_files(UPLOAD_ROOT)
    uploads_repo = IndexRepo(persist_path=UPLOADS_INDEX_ROOT, root_path=UPLOAD_ROOT, min_token_count=0)
    await uploads_repo.add(uploads_files)
    assert uploads_files

    # The source folder itself is only checked for content, not indexed here.
    other_files = indexable_files(src_path)
    assert other_files

    return chat_path, UPLOAD_ROOT, src_path
@pytest.mark.skip
@pytest.mark.asyncio
async def test_index_repo():
    """Search the mocked chat, upload and source folders through the Editor.

    The chat and upload roots are removed afterwards even when an assertion
    fails, so a failed run does not leave mock data behind.
    """
    try:
        # mock data
        chat_path, upload_root, src_path = await mock_index_repo()
        editor = Editor()
        for target in (chat_path, upload_root, src_path):
            rsp = await editor.search_index_repo(query="业务线", file_or_path=target)
            assert rsp
    finally:
        # ignore_errors: the roots may not exist if the mock setup failed early.
        shutil.rmtree(CHATS_ROOT, ignore_errors=True)
        shutil.rmtree(UPLOAD_ROOT, ignore_errors=True)
if __name__ == "__main__":
    # Run this module's tests directly; "-s" is passed through to pytest.
    pytest_args = [__file__, "-s"]
    pytest.main(pytest_args)

View file

@ -1,11 +1,17 @@
import shutil
from pathlib import Path
import pytest
from metagpt.const import DEFAULT_WORKSPACE_ROOT, TEST_DATA_PATH
from metagpt.tools.libs.index_repo import IndexRepo
from metagpt.tools.libs.index_repo import (
CHATS_INDEX_ROOT,
UPLOADS_INDEX_ROOT,
IndexRepo,
)
@pytest.mark.skip
@pytest.mark.asyncio
@pytest.mark.parametrize(("path", "query"), [(TEST_DATA_PATH / "requirements", "业务线")])
async def test_index_repo(path, query):
@ -28,5 +34,22 @@ async def test_index_repo(path, query):
shutil.rmtree(index_path)
@pytest.mark.parametrize(
    ("paths", "path_type", "root"),
    [
        (["/data/uploads"], UPLOADS_INDEX_ROOT, "/data/uploads"),
        (["/data/uploads/"], UPLOADS_INDEX_ROOT, "/data/uploads"),
        (["/data/chats/1/1.txt"], str(Path(CHATS_INDEX_ROOT) / "1"), "/data/chats/1"),
        (["/data/chats/1/2.txt"], str(Path(CHATS_INDEX_ROOT) / "1"), "/data/chats/1"),
        (["/data/chats/2/2.txt", "/data/chats/2/2.txt"], str(Path(CHATS_INDEX_ROOT) / "2"), "/data/chats/2"),
        (["/data/chats.txt"], "other", ""),
    ],
)
def test_classify_path(paths, path_type, root):
    """Check that classify_path reports the expected category key and root for each input."""
    classified, roots_by_type = IndexRepo.classify_path(paths)

    known_types = set(classified.keys())
    assert path_type in known_types

    # A category without a recorded root compares against the empty string.
    actual_root = roots_by_type.get(path_type, "")
    assert root == actual_root
if __name__ == "__main__":
    # Run this module's tests directly; "-s" is passed through to pytest.
    pytest_args = [__file__, "-s"]
    pytest.main(pytest_args)