diff --git a/metagpt/actions/analyze_requirements.py b/metagpt/actions/analyze_requirements.py index d81da3e14..86088d824 100644 --- a/metagpt/actions/analyze_requirements.py +++ b/metagpt/actions/analyze_requirements.py @@ -48,7 +48,7 @@ INSTRUCTIONS = """ You must output in the same language as the Requirements. First, determine the natural language you must respond in; this language should be consistent with the language used in the requirement description. If the requirements specify a special language, follow those instructions. The default language for responses is English. Second, extract the restrictions in the requirements, specifically the steps. Do not include detailed demand descriptions; focus only on the restrictions. -Third, if the requirements is a software development, extract the program language. If If no specific programming language is required, Use HTML (*.html), CSS (*.css), and JavaScript (*.js) +Third, if the requirement is a software development task, extract the programming language. If no specific programming language is required, use HTML (*.html), CSS (*.css), and JavaScript (*.js) Note: 1. if there are no restrictions, requirements_restrictions must be "" diff --git a/metagpt/actions/research.py b/metagpt/actions/research.py index 98edfddb0..99f72b076 100644 --- a/metagpt/actions/research.py +++ b/metagpt/actions/research.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +from datetime import datetime from typing import Any, Callable, Coroutine, Optional, Union from pydantic import TypeAdapter, model_validator @@ -43,9 +44,10 @@ COLLECT_AND_RANKURLS_PROMPT = """### Topic {results} ### Requirements -Please remove irrelevant search results that are not related to the query or topic. Then, sort the remaining search results \ -based on the link credibility. If two results have equal credibility, prioritize them based on the relevance. Provide the -ranked results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words. +Please remove irrelevant search results that are not related to the query or topic. +If the query is time-sensitive or specifies a certain time frame, please also remove search results that are outdated or outside the specified time frame. Notice that the current time is {time_stamp}. +Then, sort the remaining search results based on link credibility. If two results have equal credibility, prioritize them based on relevance. +Provide the ranked results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words.
""" WEB_BROWSE_AND_SUMMARIZE_PROMPT = """### Requirements @@ -165,7 +167,8 @@ class CollectLinks(Action): max_results = max_num_results or max(num_results * 2, 6) results = await self._search_urls(query, max_results=max_results) _results = "\n".join(f"{i}: {j}" for i, j in zip(range(max_results), results)) - prompt = COLLECT_AND_RANKURLS_PROMPT.format(topic=topic, query=query, results=_results) + time_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + prompt = COLLECT_AND_RANKURLS_PROMPT.format(topic=topic, query=query, results=_results, time_stamp=time_stamp) logger.debug(prompt) indices = await self._aask(prompt) try: diff --git a/metagpt/actions/search_enhanced_qa.py b/metagpt/actions/search_enhanced_qa.py index 23f21b2a8..1427f9b19 100644 --- a/metagpt/actions/search_enhanced_qa.py +++ b/metagpt/actions/search_enhanced_qa.py @@ -9,6 +9,7 @@ from pydantic import Field, PrivateAttr, model_validator from metagpt.actions import Action from metagpt.actions.research import CollectLinks, WebBrowseAndSummarize from metagpt.logs import logger +from metagpt.tools.tool_registry import register_tool from metagpt.tools.web_browser_engine import WebBrowserEngine from metagpt.utils.common import CodeParser from metagpt.utils.parse_html import WebPage @@ -57,8 +58,9 @@ Remember, don't blindly repeat the contexts verbatim. And here is the user quest """ +@register_tool(include_functions=["run"]) class SearchEnhancedQA(Action): - """Enhancing question-answering capabilities through search engine augmentation.""" + """Question answering and info searching through search engine.""" name: str = "SearchEnhancedQA" desc: str = "Integrating search engine results to anwser the question." diff --git a/metagpt/base/base_role.py b/metagpt/base/base_role.py index 55b8f6de2..e77819d76 100644 --- a/metagpt/base/base_role.py +++ b/metagpt/base/base_role.py @@ -29,13 +29,13 @@ class BaseRole(BaseSerialization): raise NotImplementedError @abstractmethod - async def react(self) -> Message: + async def react(self) -> "Message": """Entry to one of three strategies by which Role reacts to the observed Message.""" @abstractmethod - async def run(self, with_message: Optional[Union[str, Message, list[str]]] = None) -> Optional[Message]: + async def run(self, with_message: Optional[Union[str, "Message", list[str]]] = None) -> Optional["Message"]: """Observe, and think and act based on the results of the observation.""" @abstractmethod - def get_memories(self, k: int = 0) -> list[Message]: + def get_memories(self, k: int = 0) -> list["Message"]: """Return the most recent k memories of this role.""" diff --git a/metagpt/configs/embedding_config.py b/metagpt/configs/embedding_config.py index 20de47999..f9b41b9dc 100644 --- a/metagpt/configs/embedding_config.py +++ b/metagpt/configs/embedding_config.py @@ -20,11 +20,13 @@ class EmbeddingConfig(YamlModel): --------- api_type: "openai" api_key: "YOU_API_KEY" + dimensions: "YOUR_MODEL_DIMENSIONS" api_type: "azure" api_key: "YOU_API_KEY" base_url: "YOU_BASE_URL" api_version: "YOU_API_VERSION" + dimensions: "YOUR_MODEL_DIMENSIONS" api_type: "gemini" api_key: "YOU_API_KEY" @@ -32,6 +34,7 @@ class EmbeddingConfig(YamlModel): api_type: "ollama" base_url: "YOU_BASE_URL" model: "YOU_MODEL" + dimensions: "YOUR_MODEL_DIMENSIONS" """ api_type: Optional[EmbeddingType] = None @@ -41,6 +44,7 @@ class EmbeddingConfig(YamlModel): model: Optional[str] = None embed_batch_size: Optional[int] = None + dimensions: Optional[int] = None # output dimension of embedding model 
@field_validator("api_type", mode="before") @classmethod diff --git a/metagpt/ext/cr/actions/code_review.py b/metagpt/ext/cr/actions/code_review.py index 5f861c3e3..0235dc2c6 100644 --- a/metagpt/ext/cr/actions/code_review.py +++ b/metagpt/ext/cr/actions/code_review.py @@ -175,13 +175,16 @@ class CodeReview(Action): async def cr_by_points(self, patch: PatchSet, points: list[Point]): comments = [] + valid_patch_count = 0 for patched_file in patch: if not patched_file: continue if patched_file.path.endswith(".py"): points = [p for p in points if p.language == "Python"] + valid_patch_count += 1 elif patched_file.path.endswith(".java"): points = [p for p in points if p.language == "Java"] + valid_patch_count += 1 else: continue group_points = [points[i : i + 3] for i in range(0, len(points), 3)] @@ -198,6 +201,9 @@ class CodeReview(Action): c["commented_file"] = patched_file_path comments.extend(comments_batch) + if valid_patch_count == 0: + raise ValueError("Only code reviews for Python and Java languages are supported.") + return comments async def run(self, patch: PatchSet, points: list[Point], output_file: str): diff --git a/metagpt/prompts/di/data_analyst.py b/metagpt/prompts/di/data_analyst.py index 8e5b888d3..9f943b187 100644 --- a/metagpt/prompts/di/data_analyst.py +++ b/metagpt/prompts/di/data_analyst.py @@ -1,12 +1,12 @@ from metagpt.strategy.task_type import TaskType EXTRA_INSTRUCTION = """ -6. Carefully choose to use or not use the browser tool to assist you in web tasks. - - When no click action is required, no need to use the Browser tool to navigate to the webpage before scraping. - - Write code to view the HTML content rather than using the Browser tool. - - Make sure the command_name are certainly in Available Commands when you use the Browser tool. - - For information searching requirement, you should use the Browser tool instead of web scraping. - - When no link is provided, you should use the Browser tool to search for the information. +6. Carefully consider how you handle web tasks: + - Use SearchEnhancedQA for general information searching, i.e. querying search engines, such as googling news, weather, wiki, etc. Usually, no link is provided. + - Use Browser for reading, navigating, or in-domain searching within a specific website, such as reading a blog, searching products from a given e-commerce web link, or interacting with a web app. + - Use DataAnalyst.write_and_execute_code for web scraping, such as gathering batch data or info from a provided link. + - Write code to view the HTML content rather than using the Browser tool. + - Make sure the command_name is listed in Available Commands when you use the Browser tool. 7. When you are making a plan, it is highly recommended to plan and append all the tasks in the first response at once, except for 7.1. 7.1. When the requirement is inquiring about a pdf, docx, md, or txt document, read the document first through Editor.read WITHOUT a plan. After reading the document, use RoleZero.reply_to_human if the requirement can be answered straightaway, otherwise, make a plan if further calculation is needed. 8. Don't finish_current_task multiple times for the same task. diff --git a/metagpt/prompts/di/role_zero.py b/metagpt/prompts/di/role_zero.py index 3356ab1c0..29266c298 100644 --- a/metagpt/prompts/di/role_zero.py +++ b/metagpt/prompts/di/role_zero.py @@ -79,7 +79,7 @@ Output should adhere to the following format.
```json [ {{ - "command_name": str, + "command_name": "ClassName.method_name" or "function_name", "args": {{"arg_name": arg_value, ...}} }}, ... diff --git a/metagpt/prompts/di/team_leader.py b/metagpt/prompts/di/team_leader.py index 3ba9a8b0d..e5c119dc8 100644 --- a/metagpt/prompts/di/team_leader.py +++ b/metagpt/prompts/di/team_leader.py @@ -14,8 +14,15 @@ Pay close attention to new user message, review the conversation history, use Ro Pay close attention to messages from team members. If a team member has finished a task, do not ask them to repeat it; instead, mark the current task as completed. Note: 1. If the requirement is a pure DATA-RELATED requirement, such as web browsing, web scraping, web searching, web imitation, data science, data analysis, machine learning, deep learning, text-to-image etc. DON'T decompose it, assign a single task with the original user requirement as instruction directly to Data Analyst. -2. If the requirement is developing a software, game, app, or website, excluding the above data-related tasks, you should decompose the requirement into multiple tasks and assign them to different team members based on their expertise. The software default development process has four steps: creating a Product Requirement Document (PRD) by the Product Manager -> writing a System Design by the Architect -> creating tasks by the Project Manager -> and coding by the Engineer. You may choose to execute any of these steps. When publishing message to Product Manager, you should directly copy the full original user requirement. +2. If the requirement is developing software, a game, an app, or a website, excluding the above data-related tasks, you should decompose the requirement into multiple tasks and assign them to different team members based on their expertise. The standard software development process has four steps: creating a Product Requirement Document (PRD) by the Product Manager -> writing a System Design by the Architect -> creating tasks by the Project Manager -> and coding by the Engineer. You may choose to execute any of these steps. When publishing a message to the Product Manager, you should directly copy the full original user requirement. 2.1. If the requirement contains both the DATA-RELATED part mentioned in 1 and the software development part mentioned in 2, you should decompose the software development part and assign them to different team members based on their expertise, and assign the DATA-RELATED part to Data Analyst David directly. +2.2. For software development requirements, estimate the complexity of the requirement before assignment, following the common industry practice of t-shirt sizing: + - XS: snake game, static personal homepage, basic calculator app + - S: Basic photo gallery, basic file upload system, basic feedback form + - M: Offline menu ordering system, news aggregator app + - L: Online booking system, inventory management system + - XL: Social media platform, e-commerce app, real-time multiplayer game + - For XS and S requirements, you don't need the standard software development process; you may directly ask the Engineer to write the code. Otherwise, estimate whether any part of the standard software development process may contribute to better final code. If so, assign team members accordingly. 3.1 If the task involves code review (CR) or code checking, you should assign it to Engineer. 3.2. If the requirement is to fix a bug or issue, you should assign it to Issue Solver. However, if the code is written by Engineer, Engineer must maintain the code. 4.
If the requirement is a common-sense, logical, or math problem, you should respond directly without assigning any task to team members. diff --git a/metagpt/provider/base_llm.py b/metagpt/provider/base_llm.py index 75d8bfe00..f9111ffe0 100644 --- a/metagpt/provider/base_llm.py +++ b/metagpt/provider/base_llm.py @@ -27,7 +27,6 @@ from metagpt.configs.llm_config import LLMConfig from metagpt.const import IMAGES, LLM_API_TIMEOUT, USE_CONFIG_TIMEOUT from metagpt.logs import logger from metagpt.provider.constant import MULTI_MODAL_MODELS -from metagpt.schema import Message from metagpt.utils.common import log_and_reraise from metagpt.utils.cost_manager import CostManager, Costs from metagpt.utils.token_counter import TOKEN_MAX @@ -80,7 +79,7 @@ class BaseLLM(ABC): def support_image_input(self) -> bool: return any([m in self.config.model for m in MULTI_MODAL_MODELS]) - def format_msg(self, messages: Union[str, Message, list[dict], list[Message], list[str]]) -> list[dict]: + def format_msg(self, messages: Union[str, "Message", list[dict], list["Message"], list[str]]) -> list[dict]: """convert messages to list[dict].""" from metagpt.schema import Message @@ -173,7 +172,9 @@ class BaseLLM(ABC): context.append(self._assistant_msg(rsp_text)) return self._extract_assistant_rsp(context) - async def aask_code(self, messages: Union[str, Message, list[dict]], timeout=USE_CONFIG_TIMEOUT, **kwargs) -> dict: + async def aask_code( + self, messages: Union[str, "Message", list[dict]], timeout=USE_CONFIG_TIMEOUT, **kwargs + ) -> dict: raise NotImplementedError @abstractmethod diff --git a/metagpt/provider/google_gemini_api.py b/metagpt/provider/google_gemini_api.py index e4b3a3f17..5c1b92503 100644 --- a/metagpt/provider/google_gemini_api.py +++ b/metagpt/provider/google_gemini_api.py @@ -22,7 +22,6 @@ from metagpt.const import USE_CONFIG_TIMEOUT from metagpt.logs import log_llm_stream, logger from metagpt.provider.base_llm import BaseLLM from metagpt.provider.llm_provider_registry import register_provider -from metagpt.schema import Message class GeminiGenerativeModel(GenerativeModel): @@ -73,7 +72,7 @@ class GeminiLLM(BaseLLM): def _system_msg(self, msg: str) -> dict[str, str]: return {"role": "user", "parts": [msg]} - def format_msg(self, messages: Union[str, Message, list[dict], list[Message], list[str]]) -> list[dict]: + def format_msg(self, messages: Union[str, "Message", list[dict], list["Message"], list[str]]) -> list[dict]: """convert messages to list[dict].""" from metagpt.schema import Message diff --git a/metagpt/rag/engines/simple.py b/metagpt/rag/engines/simple.py index e48decdab..8d78fcad7 100644 --- a/metagpt/rag/engines/simple.py +++ b/metagpt/rag/engines/simple.py @@ -2,7 +2,8 @@ import json import os -from typing import Any, Optional, Union +from pathlib import Path +from typing import Any, List, Optional, Set, Union import fsspec from llama_index.core import SimpleDirectoryReader @@ -78,6 +79,7 @@ class SimpleEngine(RetrieverQueryEngine): callback_manager=callback_manager, ) self._transformations = transformations or self._default_transformations() + self._filenames = set() @classmethod def from_docs( @@ -192,11 +194,11 @@ class SimpleEngine(RetrieverQueryEngine): self._try_reconstruct_obj(nodes) return nodes - def add_docs(self, input_files: list[str]): + def add_docs(self, input_files: List[Union[str, Path]]): """Add docs to retriever. 
retriever must have add_nodes func.""" self._ensure_retriever_modifiable() - documents = SimpleDirectoryReader(input_files=input_files).load_data() + documents = SimpleDirectoryReader(input_files=[str(i) for i in input_files]).load_data() self._fix_document_metadata(documents) nodes = run_transformations(documents, transformations=self._transformations) @@ -227,6 +229,21 @@ class SimpleEngine(RetrieverQueryEngine): return self.retriever.clear(**kwargs) + def delete_docs(self, input_files: List[Union[str, Path]]): + """Delete documents from the index and document store. + + Args: + input_files (List[Union[str, Path]]): A list of file paths or file names to be deleted. + """ + matched_doc_ids = set() + filenames = {str(i) for i in input_files} + for doc_id, info in self.retriever._index.ref_doc_info.items(): + if info.metadata.get("file_path") in filenames: + matched_doc_ids.add(doc_id) + + for doc_id in matched_doc_ids: + self.retriever._index.delete_ref_doc(doc_id, delete_from_docstore=True) + @staticmethod def get_obj_nodes(objs: Optional[list[RAGObject]] = None) -> list[ObjectNode]: """Converts a list of RAGObjects to a list of ObjectNodes.""" @@ -333,3 +353,7 @@ class SimpleEngine(RetrieverQueryEngine): @staticmethod def _default_transformations(): return [SentenceSplitter()] + + @property + def filenames(self) -> Set[str]: + return self._filenames diff --git a/metagpt/rag/factories/embedding.py b/metagpt/rag/factories/embedding.py index d647883bd..19b8b36f6 100644 --- a/metagpt/rag/factories/embedding.py +++ b/metagpt/rag/factories/embedding.py @@ -5,9 +5,6 @@ from typing import Any, Optional from llama_index.core.embeddings import BaseEmbedding from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding -from llama_index.embeddings.gemini import GeminiEmbedding -from llama_index.embeddings.ollama import OllamaEmbedding -from llama_index.embeddings.openai import OpenAIEmbedding from metagpt.config2 import Config from metagpt.configs.embedding_config import EmbeddingType @@ -49,7 +46,9 @@ class RAGEmbeddingFactory(GenericFactory): raise TypeError("To use RAG, please set your embedding in config2.yaml.") - def _create_openai(self) -> OpenAIEmbedding: + def _create_openai(self) -> "OpenAIEmbedding": + from llama_index.embeddings.openai import OpenAIEmbedding + params = dict( api_key=self.config.embedding.api_key or self.config.llm.api_key, api_base=self.config.embedding.base_url or self.config.llm.base_url, @@ -70,7 +69,9 @@ class RAGEmbeddingFactory(GenericFactory): return AzureOpenAIEmbedding(**params) - def _create_gemini(self) -> GeminiEmbedding: + def _create_gemini(self) -> "GeminiEmbedding": + from llama_index.embeddings.gemini import GeminiEmbedding + params = dict( api_key=self.config.embedding.api_key, api_base=self.config.embedding.base_url, @@ -80,7 +81,9 @@ class RAGEmbeddingFactory(GenericFactory): return GeminiEmbedding(**params) - def _create_ollama(self) -> OllamaEmbedding: + def _create_ollama(self) -> "OllamaEmbedding": + from llama_index.embeddings.ollama import OllamaEmbedding + params = dict( base_url=self.config.embedding.base_url, ) diff --git a/metagpt/rag/factories/llm.py b/metagpt/rag/factories/llm.py index 59f6db4d9..bd252771a 100644 --- a/metagpt/rag/factories/llm.py +++ b/metagpt/rag/factories/llm.py @@ -13,7 +13,6 @@ from llama_index.core.llms.callbacks import llm_completion_callback from pydantic import Field from metagpt.config2 import Config -from metagpt.llm import LLM from
metagpt.provider.base_llm import BaseLLM from metagpt.utils.async_helper import NestAsyncio from metagpt.utils.token_counter import TOKEN_MAX @@ -79,4 +78,6 @@ class RAGLLM(CustomLLM): def get_rag_llm(model_infer: BaseLLM = None) -> RAGLLM: """Get llm that can be used by LlamaIndex.""" + from metagpt.llm import LLM + return RAGLLM(model_infer=model_infer or LLM()) diff --git a/metagpt/roles/di/data_analyst.py b/metagpt/roles/di/data_analyst.py index 329b3c45d..f9bead1ac 100644 --- a/metagpt/roles/di/data_analyst.py +++ b/metagpt/roles/di/data_analyst.py @@ -30,8 +30,8 @@ class DataAnalyst(RoleZero): instruction: str = ROLE_INSTRUCTION + EXTRA_INSTRUCTION task_type_desc: str = TASK_TYPE_DESC - tools: list[str] = ["Plan", "DataAnalyst", "RoleZero", "Browser", "Editor:write,read"] - custom_tools: list[str] = ["web scraping", "Terminal"] + tools: list[str] = ["Plan", "DataAnalyst", "RoleZero", "Browser", "Editor:write,read", "SearchEnhancedQA"] + custom_tools: list[str] = ["web scraping", "Terminal", "Editor:write,read"] custom_tool_recommender: ToolRecommender = None experience_retriever: Annotated[ExpRetriever, Field(exclude=True)] = KeywordExpRetriever() diff --git a/metagpt/roles/di/engineer2.py b/metagpt/roles/di/engineer2.py index 4600e3254..bcd5f5738 100644 --- a/metagpt/roles/di/engineer2.py +++ b/metagpt/roles/di/engineer2.py @@ -13,6 +13,7 @@ from metagpt.prompts.di.engineer2 import ( from metagpt.roles.di.role_zero import RoleZero from metagpt.schema import UserMessage from metagpt.strategy.experience_retriever import ENGINEER_EXAMPLE +from metagpt.tools.libs.cr import CodeReview from metagpt.tools.libs.terminal import Terminal from metagpt.tools.tool_registry import register_tool from metagpt.utils.common import CodeParser, awrite @@ -28,14 +29,17 @@ class Engineer2(RoleZero): terminal: Terminal = Field(default_factory=Terminal, exclude=True) - tools: list[str] = ["Plan", "Editor:read", "RoleZero", "Terminal:run_command", "Engineer2"] + tools: list[str] = ["Plan", "Editor:read", "RoleZero", "Terminal:run_command", "Engineer2", "SearchEnhancedQA", "CodeReview"] def _update_tool_execution(self): # validate = ValidateAndRewriteCode() + cr = CodeReview() self.tool_execution_map.update( { "Terminal.run_command": self.terminal.run_command, "Engineer2.write_new_code": self.write_new_code, + "CodeReview.review": cr.review, + "CodeReview.fix": cr.fix, # "ValidateAndRewriteCode.run": validate.run, # "ValidateAndRewriteCode": validate.run, } diff --git a/metagpt/roles/di/role_zero.py b/metagpt/roles/di/role_zero.py index 738de2244..7f3d81b9d 100644 --- a/metagpt/roles/di/role_zero.py +++ b/metagpt/roles/di/role_zero.py @@ -32,7 +32,6 @@ from metagpt.prompts.di.role_zero import ( ROLE_INSTRUCTION, SUMMARY_PROMPT, SYSTEM_PROMPT, - THOUGHT_GUIDANCE, ) from metagpt.roles import Role from metagpt.schema import AIMessage, LongTermMemoryItem, Message, UserMessage @@ -64,7 +63,6 @@ class RoleZero(Role): system_prompt: str = SYSTEM_PROMPT # Use None to conform to the default value at llm.aask cmd_prompt: str = CMD_PROMPT cmd_prompt_current_state: str = "" - thought_guidance: str = THOUGHT_GUIDANCE instruction: str = ROLE_INSTRUCTION task_type_desc: Optional[str] = None @@ -87,7 +85,7 @@ class RoleZero(Role): # Others command_rsp: str = "" # the raw string containing the commands commands: list[dict] = [] # commands to be executed - memory_k: int = 20 # number of memories (messages) to use as historical context + memory_k: int = 200 # number of memories (messages) to use as historical context 
enable_longterm_memory: bool = True # whether to use longterm memory longterm_memory: RoleZeroLongTermMemory = None use_fixed_sop: bool = False @@ -117,11 +115,9 @@ class RoleZero(Role): "Plan.append_task": self.planner.plan.append_task, "Plan.reset_task": self.planner.plan.reset_task, "Plan.replace_task": self.planner.plan.replace_task, - "Editor.write": self.editor.write, - "Editor.write_content": self.editor.write_content, - "Editor.read": self.editor.read, "RoleZero.ask_human": self.ask_human, "RoleZero.reply_to_human": self.reply_to_human, + "SearchEnhancedQA.run": SearchEnhancedQA().run, } self.tool_execution_map.update( { @@ -140,6 +136,27 @@ class RoleZero(Role): ] } ) + self.tool_execution_map.update( + { + f"Editor.{i}": getattr(self.editor, i) + for i in [ + "append_file", + "create_file", + "edit_file_by_replace", + "find_file", + "goto_line", + "insert_content_at_line", + "open_file", + "read", + "scroll_down", + "scroll_up", + "search_dir", + "search_file", + "set_workdir", + "write", + ] + } + ) # can be updated by subclass self._update_tool_execution() return self diff --git a/metagpt/tools/libs/cr.py b/metagpt/tools/libs/cr.py index e3509591b..0a53dd194 100644 --- a/metagpt/tools/libs/cr.py +++ b/metagpt/tools/libs/cr.py @@ -45,12 +45,15 @@ class CodeReview: """ patch = await self._get_patch_content(patch_path) point_file = point_file if point_file else Path(metagpt.ext.cr.__file__).parent / "points.json" + await EditorReporter().async_report(str(point_file), "path") async with aiofiles.open(point_file, "rb") as f: cr_point_content = await f.read() cr_points = [Point(**i) for i in json.loads(cr_point_content)] - - comments = await CodeReview_().run(patch, cr_points, output_file) - return f"The number of defects: {len(comments)} and the comments are stored in {output_file}" + try: + comments = await CodeReview_().run(patch, cr_points, output_file) + except ValueError as e: + return str(e) + return f"The number of defects: {len(comments)}, the comments are stored in {output_file}, and the checkpoints are stored in {str(point_file)}" async def fix( self, diff --git a/metagpt/tools/libs/editor.py b/metagpt/tools/libs/editor.py index 478903d9c..8013b99c9 100644 --- a/metagpt/tools/libs/editor.py +++ b/metagpt/tools/libs/editor.py @@ -1,20 +1,32 @@ +""" +This file is borrowed from OpenDevin +You can find the original repository here: +https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/plugins/agent_skills/file_ops/file_ops.py +""" import base64 import os +import re import shutil -import subprocess +import tempfile from pathlib import Path from typing import List, Optional, Tuple, Union from pydantic import BaseModel, ConfigDict from metagpt.config2 import Config +from metagpt.const import DEFAULT_WORKSPACE_ROOT from metagpt.logs import logger +from metagpt.tools.libs.linter import Linter from metagpt.tools.tool_registry import register_tool from metagpt.utils import read_docx from metagpt.utils.common import aread, aread_bin, awrite_bin, check_http_endpoint from metagpt.utils.repo_to_markdown import is_text_file from metagpt.utils.report import EditorReporter +# This is also used in unit tests! +MSG_FILE_UPDATED = "[File updated (edited at line {line_number}). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]" +LINTER_ERROR_MSG = "[Your proposed edit has introduced new syntax error(s). 
Please understand the errors and retry your edit command.]\n" + class FileBlock(BaseModel): """A block of content in a file""" @@ -23,6 +35,10 @@ class FileBlock(BaseModel): block_content: str +class LineNumberError(Exception): + pass + + @register_tool() class Editor(BaseModel): """ @@ -31,8 +47,12 @@ class Editor(BaseModel): """ model_config = ConfigDict(arbitrary_types_allowed=True) - resource: EditorReporter = EditorReporter() + current_file: Optional[Path] = None + current_line: int = 1 + window: int = 100 + enable_auto_lint: bool = False + working_dir: Path = DEFAULT_WORKSPACE_ROOT def write(self, path: str, content: str): """Write the whole content to a file. When used, make sure content arg contains the full content of the file.""" @@ -74,153 +94,6 @@ class Editor(BaseModel): ) return result - def search_content(self, symbol: str, root_path: str = ".", window: int = 50) -> FileBlock: - """ - Search symbol in all files under root_path, return the context of symbol with window size - Useful for locating class or function in a large codebase. Example symbol can be "def some_function", "class SomeClass", etc. - In searching, attempt different symbols of different granualities, e.g. "def some_function", "class SomeClass", a certain line of code, etc. - - Args: - symbol (str): The symbol to search. - root_path (str, optional): The root path to search in, the path can be a folder or a file. If not provided, search in the current directory. Defaults to ".". - window (int, optional): The window size to return. Defaults to 20. - - Returns: - FileBlock: The block containing the symbol, a pydantic BaseModel with the schema below. - class FileBlock(BaseModel): - file_path: str - block_content: str - """ - if not os.path.exists(root_path): - print(f"Currently at {os.getcwd()} containing: {os.listdir()}. Path {root_path} does not exist.") - return None - not_found_msg = ( - "symbol not found, you may try searching another one, or break down your search term to search a part of it" - ) - if os.path.isfile(root_path): - result = self._search_content_in_file(symbol, root_path, window) - if not result: - print(not_found_msg) - return result - for root, _, files in os.walk(root_path or "."): - for file in files: - file_path = os.path.join(root, file) - result = self._search_content_in_file(symbol, file_path, window) - if result: - # FIXME: This returns the first found result, not all results. - return result - print(not_found_msg) - return None - - def _search_content_in_file(self, symbol: str, file_path: str, window: int = 50) -> FileBlock: - print("search in", file_path) - if not file_path.endswith(".py"): - return None - with open(file_path, "r", encoding="utf-8") as f: - try: - lines = f.readlines() - except Exception: - return None - for i, line in enumerate(lines): - if symbol in line: - start = max(i - window, 0) - end = min(i + window, len(lines) - 1) - for row_num in range(start, end + 1): - lines[row_num] = f"{(row_num + 1):03}|{lines[row_num]}" - block_content = "".join(lines[start : end + 1]) - result = FileBlock( - file_path=file_path, - block_content=block_content, - ) - self.resource.report(result.file_path, "path", extra={"type": "search", "line": i, "symbol": symbol}) - return result - return None - - def write_content(self, file_path: str, start_line: int, end_line: int, new_block_content: str = "") -> str: - """ - Write a new block of content into a file. Use this method to update a block of code in a file. There are three cases: - 1. 
If the new block content is empty, the original block will be deleted. - 2. If the new block content is not empty and end_line < start_line (e.g. set end_line = -1) the new block content will be inserted at start_line. - 3. If the new block content is not empty and end_line >= start_line, the original block from start_line to end_line (both inclusively) will be replaced by the new block content. - This function can sometimes be used given a FileBlock upstream. You should carefully review its row number. Determine the start_line and end_line based on the row number of the FileBlock. - The file content from start_line to end_line will be replaced by your new_block_content. DON'T replace more than you intend to. - - Args: - file_path (str): The file path to write the new block content. - start_line (int): start line of the original block to be updated (inclusive). - end_line (int): end line of the original block to be updated (inclusive). - new_block_content (str): The new block content to write. Don't include row number in the content. - - Returns: - str: A message indicating the status of the write operation. - """ - # Create a temporary copy of the file - temp_file_path = file_path + ".temp" - shutil.copy(file_path, temp_file_path) - - try: - # Modify the temporary file with the new content - self._write_content(temp_file_path, start_line, end_line, new_block_content) - - # Lint the modified temporary file - lint_passed, lint_message = self._lint_file(temp_file_path) - # if not lint_passed: - # return f"Linting the content at a temp file, failed with:\n{lint_message}" - - # If linting passes, overwrite the original file with the temporary file - shutil.move(temp_file_path, file_path) - - new_file_block = FileBlock( - file_path=file_path, - block_content=new_block_content, - ) - self.resource.report(new_file_block.file_path, "path") - - return f"Content written successfully to {file_path}" - - finally: - # Clean up: Ensure the temporary file is removed if it still exists - if os.path.exists(temp_file_path): - os.remove(temp_file_path) - - def _write_content(self, file_path: str, start_line: int, end_line: int, new_block_content: str = ""): - """start_line and end_line are both 1-based indices and inclusive.""" - with open(file_path, "r") as file: - lines = file.readlines() - - start_line_index = start_line - 1 # Adjusting because list indices start at 0 - end_line_index = end_line - - if new_block_content: - # Split the new_block_content by newline and ensure each line ends with a newline character - new_content_lines = new_block_content.splitlines( - keepends=True - ) # FIXME: This will split \n within a line, such as ab\ncd - if end_line >= start_line: - # This replaces the block between start_line and end_line with new_block_content - # irrespective of the length difference between the original and new content. 
- lines[start_line_index:end_line_index] = new_content_lines - else: - lines.insert(start_line_index, "".join(new_content_lines)) - else: - del lines[start_line_index:end_line_index] - - with open(file_path, "w") as file: - file.writelines(lines) - - @classmethod - def _lint_file(cls, file_path: str) -> (bool, str): - """Lints an entire Python file using pylint, returns True if linting passes, along with pylint's output.""" - result = subprocess.run( - ["pylint", file_path, "--disable=all", "--enable=E"], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) - lint_passed = result.returncode == 0 - lint_message = result.stdout - return lint_passed, lint_message - @staticmethod async def _read_text(path: Union[str, Path]) -> List[str]: content = await aread(path) @@ -294,3 +167,787 @@ class Editor(BaseModel): if config.omniparse and config.omniparse.url: return config.omniparse.url, config.omniparse.timeout return "", 0 + + @staticmethod + def _is_valid_filename(file_name: str) -> bool: + if not file_name or not file_name.strip(): + return False + invalid_chars = '<>:"/\\|?*' + if os.name == "nt": # Windows + invalid_chars = '<>:"/\\|?*' + elif os.name == "posix": # Unix-like systems + invalid_chars = "\0" + + for char in invalid_chars: + if char in file_name: + return False + return True + + @staticmethod + def _is_valid_path(path: Path) -> bool: + try: + return path.exists() + except PermissionError: + return False + + @staticmethod + def _create_paths(file_path: Path) -> bool: + try: + if file_path.parent: + file_path.parent.mkdir(parents=True, exist_ok=True) + return True + except PermissionError: + return False + + def _check_current_file(self, file_path: Optional[Path] = None) -> bool: + if file_path is None: + file_path = self.current_file + if not file_path or not file_path.is_file(): + raise ValueError("No file open. Use the open_file function first.") + return True + + @staticmethod + def _clamp(value, min_value, max_value): + return max(min_value, min(value, max_value)) + + def _lint_file(self, file_path: Path) -> tuple[Optional[str], Optional[int]]: + """Lint the file at the given path and return a tuple with the lint error message, if any, + and the line number of the first error. + + Returns: + tuple[str | None, int | None]: (lint_error, first_error_line_number) + """ + + linter = Linter(root=self.working_dir) + lint_error = linter.lint(str(file_path)) + if not lint_error: + # Linting successful. No issues found.
+ return None, None + return "ERRORS:\n" + lint_error.text, lint_error.lines[0] + + def _print_window(self, file_path: Path, targeted_line: int, window: int): + self._check_current_file(file_path) + with file_path.open() as file: + content = file.read() + + # Ensure the content ends with a newline character + if not content.endswith("\n"): + content += "\n" + + lines = content.splitlines(True) # Keep all line ending characters + total_lines = len(lines) + + # cover edge cases + self.current_line = self._clamp(targeted_line, 1, total_lines) + half_window = max(1, window // 2) + + # Ensure at least one line above and below the targeted line + start = max(1, self.current_line - half_window) + end = min(total_lines, self.current_line + half_window) + + # Adjust start and end to ensure at least one line above and below + if start == 1: + end = min(total_lines, start + window - 1) + if end == total_lines: + start = max(1, end - window + 1) + + output = "" + + # only display this when there's at least one line above + if start > 1: + output += f"({start - 1} more lines above)\n" + else: + output += "(this is the beginning of the file)\n" + for i in range(start, end + 1): + _new_line = f"{i}|{lines[i - 1]}" + if not _new_line.endswith("\n"): + _new_line += "\n" + output += _new_line + if end < total_lines: + output += f"({total_lines - end} more lines below)\n" + else: + output += "(this is the end of the file)\n" + output = output.rstrip() + + return output + + @staticmethod + def _cur_file_header(current_file: Path, total_lines: int) -> str: + if not current_file: + return "" + return f"[File: {current_file.resolve()} ({total_lines} lines total)]\n" + + def set_workdir(self, path: str) -> None: + """ + Sets the working directory to the given path, e.g. the repo directory. + You MUST set it up before opening a file. + + Args: + path: str: The path to set as the working directory. + """ + self.working_dir = Path(path) + + def open_file( + self, path: Union[Path, str], line_number: Optional[int] = 1, context_lines: Optional[int] = None + ) -> str: + """Opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line. + It only shows the first 100 lines by default! The max `context_lines` supported is 2000; use `scroll_down`/`scroll_up` + to view more of the file. + + Args: + path: str: The path to the file to open, preferred absolute path. + line_number: int | None = 1: The line number to move to. Defaults to 1. + context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100. + """ + if context_lines is None: + context_lines = self.window + + path = self._try_fix_path(path) + + if not path.is_file(): + raise FileNotFoundError(f"File {path} not found") + + self.current_file = path + with path.open() as file: + total_lines = max(1, sum(1 for _ in file)) + + if not isinstance(line_number, int) or line_number < 1 or line_number > total_lines: + raise ValueError(f"Line number must be between 1 and {total_lines}") + self.current_line = line_number + + # Override WINDOW with context_lines + if context_lines is None or context_lines < 1: + context_lines = self.window + + output = self._cur_file_header(path, total_lines) + output += self._print_window(path, self.current_line, self._clamp(context_lines, 1, 2000)) + return output + + def goto_line(self, line_number: int) -> str: + """Moves the window to show the specified line number.
+ + Args: + line_number: int: The line number to move to. + """ + self._check_current_file() + + with self.current_file.open() as file: + total_lines = max(1, sum(1 for _ in file)) + if not isinstance(line_number, int) or line_number < 1 or line_number > total_lines: + raise ValueError(f"Line number must be between 1 and {total_lines}") + + self.current_line = self._clamp(line_number, 1, total_lines) + + output = self._cur_file_header(self.current_file, total_lines) + output += self._print_window(self.current_file, self.current_line, self.window) + return output + + def scroll_down(self) -> str: + """Moves the window down by 100 lines.""" + self._check_current_file() + + with self.current_file.open() as file: + total_lines = max(1, sum(1 for _ in file)) + self.current_line = self._clamp(self.current_line + self.window, 1, total_lines) + output = self._cur_file_header(self.current_file, total_lines) + output += self._print_window(self.current_file, self.current_line, self.window) + return output + + def scroll_up(self) -> str: + """Moves the window up by 100 lines.""" + self._check_current_file() + + with self.current_file.open() as file: + total_lines = max(1, sum(1 for _ in file)) + self.current_line = self._clamp(self.current_line - self.window, 1, total_lines) + output = self._cur_file_header(self.current_file, total_lines) + output += self._print_window(self.current_file, self.current_line, self.window) + return output + + def create_file(self, filename: str) -> str: + """Creates and opens a new file with the given name. + + Args: + filename: str: The name of the file to create. + """ + filename = self._try_fix_path(filename) + + if filename.exists(): + raise FileExistsError(f"File '{filename}' already exists.") + + with filename.open("w") as file: + file.write("\n") + + self.open_file(filename) + return f"[File {filename} created.]" + + @staticmethod + def _append_impl(lines, content): + """Internal method to handle appending to a file. + + Args: + lines: list[str]: The lines in the original file. + content: str: The content to append to the file. + + Returns: + content: str: The new content of the file. + n_added_lines: int: The number of lines added to the file. + """ + content_lines = content.splitlines(keepends=True) + n_added_lines = len(content_lines) + if lines and not (len(lines) == 1 and lines[0].strip() == ""): + # file is not empty + if not lines[-1].endswith("\n"): + lines[-1] += "\n" + new_lines = lines + content_lines + content = "".join(new_lines) + else: + # file is empty + content = "".join(content_lines) + + return content, n_added_lines + + @staticmethod + def _insert_impl(lines, start, content): + """Internal method to handle inserting to a file. + + Args: + lines: list[str]: The lines in the original file. + start: int: The start line number for inserting. + content: str: The content to insert to the file. + + Returns: + content: str: The new content of the file. + n_added_lines: int: The number of lines added to the file. + + Raises: + LineNumberError: If the start line number is invalid. + """ + inserted_lines = [content + "\n" if not content.endswith("\n") else content] + if len(lines) == 0: + new_lines = inserted_lines + elif start is not None: + if len(lines) == 1 and lines[0].strip() == "": + # if the file with only 1 line and that line is empty + lines = [] + + if len(lines) == 0: + new_lines = inserted_lines + else: + new_lines = lines[: start - 1] + inserted_lines + lines[start - 1 :] + else: + raise LineNumberError( + f"Invalid line number: {start}. 
Line numbers must be between 1 and {len(lines)} (inclusive)." + ) + + content = "".join(new_lines) + n_added_lines = len(inserted_lines) + return content, n_added_lines + + @staticmethod + def _edit_impl(lines, start, end, content): + """Internal method to handle editing a file. + + REQUIRES (should be checked by caller): + start <= end + start and end are between 1 and len(lines) (inclusive) + content ends with a newline + + Args: + lines: list[str]: The lines in the original file. + start: int: The start line number for editing. + end: int: The end line number for editing. + content: str: The content to replace the lines with. + + Returns: + content: str: The new content of the file. + n_added_lines: int: The number of lines added to the file. + """ + # Handle cases where start or end are None + if start is None: + start = 1 # Default to the beginning + if end is None: + end = len(lines) # Default to the end + # Check arguments + if not (1 <= start <= len(lines)): + raise LineNumberError( + f"Invalid start line number: {start}. Line numbers must be between 1 and {len(lines)} (inclusive)." + ) + if not (1 <= end <= len(lines)): + raise LineNumberError( + f"Invalid end line number: {end}. Line numbers must be between 1 and {len(lines)} (inclusive)." + ) + if start > end: + raise LineNumberError(f"Invalid line range: {start}-{end}. Start must be less than or equal to end.") + + # Split content into lines and ensure it ends with a newline + if not content.endswith("\n"): + content += "\n" + content_lines = content.splitlines(True) + + # Calculate the number of lines to be added + n_added_lines = len(content_lines) + + # Remove the specified range of lines and insert the new content + new_lines = lines[: start - 1] + content_lines + lines[end:] + + # Handle the case where the original lines are empty + if len(lines) == 0: + new_lines = content_lines + + # Join the lines to create the new content + content = "".join(new_lines) + return content, n_added_lines + + def _edit_file_impl( + self, + file_name: Path, + start: Optional[int] = None, + end: Optional[int] = None, + content: str = "", + is_insert: bool = False, + is_append: bool = False, + ) -> str: + """Internal method to handle common logic for edit_/append_file methods. + + Args: + file_name: Path: The name of the file to edit or append to. + start: int | None = None: The start line number for editing. Ignored if is_append is True. + end: int | None = None: The end line number for editing. Ignored if is_append is True. + content: str: The content to replace the lines with or to append. + is_insert: bool = False: Whether to insert content at the given line number instead of editing. + is_append: bool = False: Whether to append content to the file instead of editing. + """ + ret_str = "" + + ERROR_MSG = f"[Error editing file {file_name}. Please confirm the file is correct.]" + ERROR_MSG_SUFFIX = ( + "Your changes have NOT been applied. Please fix your edit command and try again.\n" + "You either need to 1) Open the correct file and try again or 2) Specify the correct line number arguments.\n" + "DO NOT re-run the same failed edit command. Running it again will lead to the same error." 
+ ) + + if not self._is_valid_filename(file_name.name): + raise FileNotFoundError("Invalid file name.") + + if not self._is_valid_path(file_name): + raise FileNotFoundError("Invalid path or file name.") + + if not self._create_paths(file_name): + raise PermissionError("Could not access or create directories.") + + if not file_name.is_file(): + raise FileNotFoundError(f"File {file_name} not found.") + + if is_insert and is_append: + raise ValueError("Cannot insert and append at the same time.") + + # Use a temporary file to write changes + content = str(content or "") + temp_file_path = "" + src_abs_path = file_name.resolve() + first_error_line = None + + try: + # lint the original file + # enable_auto_lint = os.getenv("ENABLE_AUTO_LINT", "false").lower() == "true" + if self.enable_auto_lint: + original_lint_error, _ = self._lint_file(file_name) + + # Create a temporary file + with tempfile.NamedTemporaryFile("w", delete=False) as temp_file: + temp_file_path = temp_file.name + + # Read the original file and check if empty and for a trailing newline + with file_name.open() as original_file: + lines = original_file.readlines() + + if is_append: + content, n_added_lines = self._append_impl(lines, content) + elif is_insert: + try: + content, n_added_lines = self._insert_impl(lines, start, content) + except LineNumberError as e: + ret_str += (f"{ERROR_MSG}\n" f"{e}\n" f"{ERROR_MSG_SUFFIX}") + "\n" + return ret_str + else: + try: + content, n_added_lines = self._edit_impl(lines, start, end, content) + except LineNumberError as e: + ret_str += (f"{ERROR_MSG}\n" f"{e}\n" f"{ERROR_MSG_SUFFIX}") + "\n" + return ret_str + + if not content.endswith("\n"): + content += "\n" + + # Write the new content to the temporary file + temp_file.write(content) + + # Replace the original file with the temporary file atomically + shutil.move(temp_file_path, src_abs_path) + + # Handle linting + # NOTE: we need to get env var inside this function + # because the env var will be set AFTER the agentskills is imported + if self.enable_auto_lint: + # BACKUP the original file + original_file_backup_path = file_name.parent / f".backup.{file_name.name}" + with original_file_backup_path.open("w") as f: + f.writelines(lines) + + lint_error, first_error_line = self._lint_file(file_name) + + # Select the errors caused by the modification + def extract_last_part(line): + parts = line.split(":") + if len(parts) > 1: + return parts[-1].strip() + return line.strip() + + def subtract_strings(str1, str2) -> str: + lines1 = str1.splitlines() + lines2 = str2.splitlines() + + last_parts1 = [extract_last_part(line) for line in lines1] + + remaining_lines = [line for line in lines2 if extract_last_part(line) not in last_parts1] + + result = "\n".join(remaining_lines) + return result + + if original_lint_error and lint_error: + lint_error = subtract_strings(original_lint_error, lint_error) + if lint_error == "": + lint_error = None + first_error_line = None + + if lint_error is not None: + if first_error_line is not None: + show_line = int(first_error_line) + elif is_append: + # original end-of-file + show_line = len(lines) + # insert OR edit WILL provide meaningful line numbers + elif start is not None and end is not None: + show_line = int((start + end) / 2) + else: + raise ValueError("Invalid state. 
This should never happen.") + + ret_str += LINTER_ERROR_MSG + ret_str += lint_error + "\n" + + editor_lines = n_added_lines + 20 + + ret_str += "[This is how your edit would have looked if applied]\n" + ret_str += "-------------------------------------------------\n" + ret_str += self._print_window(file_name, show_line, editor_lines, return_str=True) + "\n" + ret_str += "-------------------------------------------------\n\n" + + ret_str += "[This is the original code before your edit]\n" + ret_str += "-------------------------------------------------\n" + ret_str += ( + self._print_window( + original_file_backup_path, + show_line, + editor_lines, + ) + + "\n" + ) + ret_str += "-------------------------------------------------\n" + + ret_str += ( + "Your changes have NOT been applied. Please fix your edit command and try again.\n" + "You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.\n" + "DO NOT re-run the same failed edit command. Running it again will lead to the same error." + ) + + # recover the original file + with original_file_backup_path.open() as fin, file_name.open("w") as fout: + fout.write(fin.read()) + original_file_backup_path.unlink() + return ret_str + + except FileNotFoundError as e: + ret_str += f"File not found: {e}\n" + except IOError as e: + ret_str += f"An error occurred while handling the file: {e}\n" + except ValueError as e: + ret_str += f"Invalid input: {e}\n" + except Exception as e: + # Clean up the temporary file if an error occurs + if temp_file_path and Path(temp_file_path).exists(): + Path(temp_file_path).unlink() + logger.warning(f"An unexpected error occurred: {e}") + raise e + + # Update the file information and print the updated content + with file_name.open("r", encoding="utf-8") as file: + n_total_lines = max(1, len(file.readlines())) + if first_error_line is not None and int(first_error_line) > 0: + self.current_line = first_error_line + else: + if is_append: + self.current_line = max(1, len(lines)) # end of original file + else: + self.current_line = start or n_total_lines or 1 + ret_str += f"[File: {file_name.resolve()} ({n_total_lines} lines total after edit)]\n" + CURRENT_FILE = file_name + ret_str += self._print_window(CURRENT_FILE, self.current_line, self.window) + "\n" + ret_str += MSG_FILE_UPDATED.format(line_number=self.current_line) + return ret_str + + def edit_file_by_replace(self, file_name: str, to_replace: str, new_content: str) -> str: + """Edit a file. This will search for `to_replace` in the given file and replace it with `new_content`. + + Every *to_replace* must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc. + + Include enough lines to make code in `to_replace` unique. `to_replace` should NOT be empty. + + For example, given a file "/workspace/example.txt" with the following content: + ``` + line 1 + line 2 + line 2 + line 3 + ``` + + EDITING: If you want to replace the second occurrence of "line 2", you can make `to_replace` unique: + + edit_file_by_replace( + '/workspace/example.txt', + to_replace='line 2\nline 3', + new_content='new line\nline 3', + ) + + This will replace only the second "line 2" with "new line". The first "line 2" will remain unchanged. 
+ + The resulting file will be: + ``` + line 1 + line 2 + new line + line 3 + ``` + + REMOVAL: If you want to remove "line 2" and "line 3", you can set `new_content` to an empty string: + + edit_file_by_replace( + '/workspace/example.txt', + to_replace='line 2\nline 3', + new_content='', + ) + + Args: + file_name: str: The name of the file to edit. + to_replace: str: The content to search for and replace. + new_content: str: The new content to replace the old content with. + """ + # FIXME: support replacing *all* occurrences + if to_replace.strip() == "": + raise ValueError("`to_replace` must not be empty.") + + if to_replace == new_content: + raise ValueError("`to_replace` and `new_content` must be different.") + + # search for `to_replace` in the file + # if found, replace it with `new_content` + # if not found, perform a fuzzy search to find the closest match and replace it with `new_content` + file_name = self._try_fix_path(file_name) + with file_name.open("r") as file: + file_content = file.read() + + if file_content.count(to_replace) > 1: + raise ValueError( + "`to_replace` appears more than once, please include enough lines to make code in `to_replace` unique." + ) + + start = file_content.find(to_replace) + if start != -1: + # Convert start from index to line number + start_line_number = file_content[:start].count("\n") + 1 + end_line_number = start_line_number + len(to_replace.splitlines()) - 1 + else: + + def _fuzzy_transform(s: str) -> str: + # remove all space except newline + return re.sub(r"[^\S\n]+", "", s) + + # perform a fuzzy search (remove all spaces except newlines) + to_replace_fuzzy = _fuzzy_transform(to_replace) + file_content_fuzzy = _fuzzy_transform(file_content) + # find the closest match + start = file_content_fuzzy.find(to_replace_fuzzy) + if start == -1: + return f"[No exact match found in {file_name} for\n```\n{to_replace}\n```\n]" + # Convert start from index to line number for fuzzy match + start_line_number = file_content_fuzzy[:start].count("\n") + 1 + end_line_number = start_line_number + len(to_replace.splitlines()) - 1 + + ret_str = self._edit_file_impl( + file_name, + start=start_line_number, + end=end_line_number, + content=new_content, + is_insert=False, + ) + # lint_error = bool(LINTER_ERROR_MSG in ret_str) + # TODO: automatically tries to fix linter error (maybe involve some static analysis tools on the location near the edit to figure out indentation) + return ret_str + + def insert_content_at_line(self, file_name: str, line_number: int, content: str) -> str: + """Insert content at the given line number in a file. + This will NOT modify the content of the lines before OR after the given line number. + + For example, if the file has the following content: + ``` + line 1 + line 2 + line 3 + ``` + and you call `insert_content_at_line('file.txt', 2, 'new line')`, the file will be updated to: + ``` + line 1 + new line + line 2 + line 3 + ``` + + Args: + file_name: str: The name of the file to edit. + line_number: int: The line number (starting from 1) at which to insert the content. + content: str: The content to insert. + """ + file_name = self._try_fix_path(file_name) + + ret_str = self._edit_file_impl( + file_name, + start=line_number, + end=line_number, + content=content, + is_insert=True, + is_append=False, + ) + return ret_str + + def append_file(self, file_name: str, content: str) -> str: + """Append content to the given file. + It appends text `content` to the end of the specified file. + + Args: + file_name: str: The name of the file to edit.
content: str: The content to append. + """ + file_name = self._try_fix_path(file_name) + + ret_str = self._edit_file_impl( + file_name, + start=None, + end=None, + content=content, + is_insert=False, + is_append=True, + ) + return ret_str + + def search_dir(self, search_term: str, dir_path: str = "./") -> str: + """Searches for search_term in all files in dir. If dir is not provided, searches in the current directory. + + Args: + search_term: str: The term to search for. + dir_path: str: The path to the directory to search. + """ + dir_path = self._try_fix_path(dir_path) + if not dir_path.is_dir(): + raise FileNotFoundError(f"Directory {dir_path} not found") + matches = [] + for root, _, files in os.walk(dir_path): + for file in files: + if file.startswith("."): + continue + file_path = Path(root) / file + with file_path.open("r", errors="ignore") as f: + for line_num, line in enumerate(f, 1): + if search_term in line: + matches.append((file_path, line_num, line.strip())) + + if not matches: + return f'No matches found for "{search_term}" in {dir_path}' + + num_matches = len(matches) + num_files = len(set(match[0] for match in matches)) + + if num_files > 100: + return f'More than 100 files matched for "{search_term}" in {dir_path}. Please narrow your search.' + + res_list = [f'[Found {num_matches} matches for "{search_term}" in {dir_path}]'] + for file_path, line_num, line in matches: + res_list.append(f"{file_path} (Line {line_num}): {line}") + res_list.append(f'[End of matches for "{search_term}" in {dir_path}]') + return "\n".join(res_list) + + def search_file(self, search_term: str, file_path: Optional[str] = None) -> str: + """Searches for search_term in file. If file is not provided, searches in the current open file. + + Args: + search_term: str: The term to search for. + file_path: str | None: The path to the file to search. + """ + if file_path is None: + file_path = self.current_file + else: + file_path = self._try_fix_path(file_path) + if file_path is None: + raise FileNotFoundError("No file specified or open. Use the open_file function first.") + if not file_path.is_file(): + raise FileNotFoundError(f"File {file_path} not found") + + matches = [] + with file_path.open() as file: + for i, line in enumerate(file, 1): + if search_term in line: + matches.append((i, line.strip())) + res_list = [] + if matches: + res_list.append(f'[Found {len(matches)} matches for "{search_term}" in {file_path}]') + for match in matches: + res_list.append(f"Line {match[0]}: {match[1]}") + res_list.append(f'[End of matches for "{search_term}" in {file_path}]') + else: + res_list.append(f'[No matches found for "{search_term}" in {file_path}]') + return "\n".join(res_list) + + def find_file(self, file_name: str, dir_path: str = "./") -> str: + """Finds all files with the given name in the specified directory. + + Args: + file_name: str: The name of the file to find. + dir_path: str: The path to the directory to search.
+        """
+        file_name = self._try_fix_path(file_name)
+        dir_path = self._try_fix_path(dir_path)
+        if not dir_path.is_dir():
+            raise FileNotFoundError(f"Directory {dir_path} not found")
+
+        matches = []
+        for root, _, files in os.walk(dir_path):
+            for file in files:
+                if file_name.name in file:  # compare base names: file_name was resolved to an absolute path above
+                    matches.append(Path(root) / file)
+
+        res_list = []
+        if matches:
+            res_list.append(f'[Found {len(matches)} matches for "{file_name}" in {dir_path}]')
+            for match in matches:
+                res_list.append(f"{match}")
+            res_list.append(f'[End of matches for "{file_name}" in {dir_path}]')
+        else:
+            res_list.append(f'[No matches found for "{file_name}" in {dir_path}]')
+        return "\n".join(res_list)
+
+    def _try_fix_path(self, path: Union[Path, str]) -> Path:
+        """Tries to fix the path if it is not absolute."""
+        if not isinstance(path, Path):
+            path = Path(path)
+        if not path.is_absolute():
+            path = self.working_dir / path
+        return path
diff --git a/metagpt/tools/libs/index_repo.py b/metagpt/tools/libs/index_repo.py
new file mode 100644
index 000000000..fadc11522
--- /dev/null
+++ b/metagpt/tools/libs/index_repo.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import json
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+import tiktoken
+from llama_index.core.base.embeddings.base import BaseEmbedding
+from llama_index.core.schema import NodeWithScore
+from pydantic import BaseModel, Field, model_validator
+
+from metagpt.config2 import Config
+from metagpt.logs import logger
+from metagpt.rag.engines import SimpleEngine
+from metagpt.rag.factories.embedding import RAGEmbeddingFactory
+from metagpt.rag.schema import FAISSIndexConfig, FAISSRetrieverConfig, LLMRankerConfig
+from metagpt.utils.common import aread, awrite, generate_fingerprint, list_files
+from metagpt.utils.repo_to_markdown import is_text_file
+
+
+class TextScore(BaseModel):
+    filename: str
+    text: str
+    score: Optional[float] = None
+
+
+class IndexRepo(BaseModel):
+    persist_path: str  # The persist path of the index repo, {DEFAULT_WORKSPACE_ROOT}/.index/{chat_id or 'uploads'}/
+    root_path: str  # `/data/uploads` or r`/data/chats/\d+`, the root path of files indexed by the index repo.
+    fingerprint_filename: str = "fingerprint.json"
+    model: Optional[str] = None
+    min_token_count: int = 10000
+    max_token_count: int = 100000000
+    recall_count: int = 5
+    embedding: Optional[BaseEmbedding] = Field(default=None, exclude=True)
+    fingerprints: Dict[str, str] = Field(default_factory=dict)
+
+    @model_validator(mode="after")
+    def _update_fingerprints(self) -> "IndexRepo":
+        """Load fingerprints from the fingerprint file if not already loaded.
+
+        Returns:
+            IndexRepo: The updated IndexRepo instance.
+        """
+        if not self.fingerprints:
+            filename = Path(self.persist_path) / self.fingerprint_filename
+            if not filename.exists():
+                return self
+            with open(str(filename), "r") as reader:
+                self.fingerprints = json.load(reader)
+        return self
+
+    async def search(
+        self, query: str, filenames: Optional[List[Path]] = None
+    ) -> Optional[List[Union[NodeWithScore, TextScore]]]:
+        """Search for documents related to the given query.
+
+        Args:
+            query (str): The search query.
+            filenames (Optional[List[Path]]): A list of filenames to filter the search.
+
+        Returns:
+            Optional[List[Union[NodeWithScore, TextScore]]]: A list of search results containing NodeWithScore or TextScore.
+ """ + encoding = tiktoken.get_encoding("cl100k_base") + result: List[Union[NodeWithScore, TextScore]] = [] + filenames, _ = await self._filter(filenames) + filter_filenames = set() + for i in filenames: + content = await aread(filename=i) + token_count = len(encoding.encode(content)) + if not self._is_buildable(token_count): + result.append(TextScore(filename=str(i), text=content)) + continue + file_fingerprint = generate_fingerprint(content) + if self.fingerprints.get(str(i)) != file_fingerprint: + logger.error(f'file: "{i}" changed but not indexed') + continue + filter_filenames.add(str(i)) + nodes = await self._search(query=query, filters=filter_filenames) + return result + nodes + + async def merge( + self, query: str, indices_list: List[List[Union[NodeWithScore, TextScore]]] + ) -> List[Union[NodeWithScore, TextScore]]: + """Merge results from multiple indices based on the query. + + Args: + query (str): The search query. + indices_list (List[List[Union[NodeWithScore, TextScore]]]): A list of result lists from different indices. + + Returns: + List[Union[NodeWithScore, TextScore]]: A list of merged results sorted by similarity. + """ + if not self.embedding: + config = Config.default() + if self.model: + config.embedding.model = self.model + factory = RAGEmbeddingFactory(config) + self.embedding = factory.get_rag_embedding() + + scores = [] + query_embedding = await self.embedding.aget_text_embedding(query) + flat_nodes = [node for indices in indices_list for node in indices] + for i in flat_nodes: + text_embedding = await self.embedding.aget_text_embedding(i.text) + similarity = self.embedding.similarity(query_embedding, text_embedding) + scores.append((similarity, i)) + scores.sort(key=lambda x: x[0], reverse=True) + return [i[1] for i in scores][: self.recall_count] + + async def add(self, paths: List[Path]): + """Add new documents to the index. + + Args: + paths (List[Path]): A list of paths to the documents to be added. + """ + encoding = tiktoken.get_encoding("cl100k_base") + filenames, _ = await self._filter(paths) + filter_filenames = [] + delete_filenames = [] + for i in filenames: + content = await aread(filename=i) + if not self._is_fingerprint_changed(filename=i, content=content): + continue + token_count = len(encoding.encode(content)) + if self._is_buildable(token_count): + filter_filenames.append(i) + logger.debug(f"{i} is_buildable: {token_count}, {self.min_token_count}~{self.max_token_count}") + else: + delete_filenames.append(i) + logger.debug(f"{i} not is_buildable: {token_count}, {self.min_token_count}~{self.max_token_count}") + await self._add_batch(filenames=filter_filenames, delete_filenames=delete_filenames) + + async def _add_batch(self, filenames: List[Union[str, Path]], delete_filenames: List[Union[str, Path]]): + """Add and remove documents in a batch operation. + + Args: + filenames (List[Union[str, Path]]): List of filenames to add. + delete_filenames (List[Union[str, Path]]): List of filenames to delete. 
+        """
+        if not filenames:
+            return
+        logger.info(f"update index repo, add {filenames}, remove {delete_filenames}")
+        engine = None
+        if Path(self.persist_path).exists():
+            logger.debug(f"load index from {self.persist_path}")
+            engine = SimpleEngine.from_index(
+                index_config=FAISSIndexConfig(persist_path=self.persist_path),
+                retriever_configs=[FAISSRetrieverConfig()],
+            )
+            try:
+                engine.delete_docs(filenames + delete_filenames)
+                logger.debug(f"delete docs {filenames + delete_filenames}")
+                engine.add_docs(input_files=filenames)
+                logger.debug(f"add docs {filenames}")
+            except NotImplementedError as e:
+                logger.debug(f"{e}")
+                filenames = list(set([str(i) for i in filenames] + list(self.fingerprints.keys())))
+                engine = None
+                logger.info(f"{e}. Rebuild all.")
+        if not engine:
+            engine = SimpleEngine.from_docs(
+                input_files=[str(i) for i in filenames],
+                retriever_configs=[FAISSRetrieverConfig()],
+                ranker_configs=[LLMRankerConfig()],
+            )
+            logger.debug(f"add docs {filenames}")
+        engine.persist(persist_dir=self.persist_path)
+        for i in filenames:
+            content = await aread(i)
+            fp = generate_fingerprint(content)
+            self.fingerprints[str(i)] = fp
+        await awrite(filename=Path(self.persist_path) / self.fingerprint_filename, data=json.dumps(self.fingerprints))
+
+    def __str__(self):
+        """Return a string representation of the IndexRepo.
+
+        Returns:
+            str: The persist path of the index repository.
+        """
+        return f"{self.persist_path}"
+
+    def _is_buildable(self, token_count: int) -> bool:
+        """Check if the token count is within the buildable range.
+
+        Args:
+            token_count (int): The number of tokens in the content.
+
+        Returns:
+            bool: True if buildable, False otherwise.
+        """
+        if token_count < self.min_token_count or token_count > self.max_token_count:
+            return False
+        return True
+
+    async def _filter(self, filenames: Optional[List[Union[str, Path]]] = None) -> Tuple[List[Path], List[Path]]:
+        """Filter the provided filenames to only include valid text files.
+
+        Args:
+            filenames (Optional[List[Union[str, Path]]]): List of filenames to filter.
+
+        Returns:
+            Tuple[List[Path], List[Path]]: A tuple containing a list of valid pathnames and a list of excluded paths.
+        """
+        root_path = Path(self.root_path).absolute()
+        if not filenames:
+            filenames = [root_path]
+        pathnames = []
+        excludes = []
+        for i in filenames:
+            path = Path(i).absolute()
+            if not path.is_relative_to(root_path):
+                excludes.append(path)
+                logger.debug(f"{path} is not relative to {root_path}")
+                continue
+            if not path.is_dir():
+                is_text, _ = await is_text_file(path)
+                if is_text:
+                    pathnames.append(path)
+                continue
+            subfiles = list_files(path)
+            for j in subfiles:
+                is_text, _ = await is_text_file(j)
+                if is_text:
+                    pathnames.append(j)
+
+        logger.debug(f"{pathnames}, excludes: {excludes}")
+        return pathnames, excludes
+
+    async def _search(self, query: str, filters: Set[str]) -> List[NodeWithScore]:
+        """Perform a search for the given query using the index.
+
+        Args:
+            query (str): The search query.
+            filters (Set[str]): A set of filenames to filter the search results.
+
+        Returns:
+            List[NodeWithScore]: A list of nodes with scores matching the query.
+ """ + if not Path(self.persist_path).exists(): + return [] + engine = SimpleEngine.from_index( + index_config=FAISSIndexConfig(persist_path=self.persist_path), retriever_configs=[FAISSRetrieverConfig()] + ) + rsp = await engine.aretrieve(query) + return [i for i in rsp if i.metadata.get("file_path") in filters] + + def _is_fingerprint_changed(self, filename: Union[str, Path], content: str) -> bool: + """Check if the fingerprint of the given document content has changed. + + Args: + filename (Union[str, Path]): The filename of the document. + content (str): The content of the document. + + Returns: + bool: True if the fingerprint has changed, False otherwise. + """ + old_fp = self.fingerprints.get(str(filename)) + if not old_fp: + return True + fp = generate_fingerprint(content) + return old_fp != fp diff --git a/metagpt/tools/libs/linter.py b/metagpt/tools/libs/linter.py new file mode 100644 index 000000000..c8760a53b --- /dev/null +++ b/metagpt/tools/libs/linter.py @@ -0,0 +1,226 @@ +""" +This file is borrowed from OpenDevin +You can find the original repository here: +https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/plugins/agent_skills/utils/aider/linter.py +""" +import os +import subprocess +import sys +import traceback +import warnings +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +from grep_ast import TreeContext, filename_to_lang +from tree_sitter_languages import get_parser # noqa: E402 + +# tree_sitter is throwing a FutureWarning +warnings.simplefilter("ignore", category=FutureWarning) + + +@dataclass +class LintResult: + text: str + lines: list + + +class Linter: + def __init__(self, encoding="utf-8", root=None): + self.encoding = encoding + self.root = root + + self.languages = dict( + python=self.py_lint, + ) + self.all_lint_cmd = None + + def set_linter(self, lang, cmd): + if lang: + self.languages[lang] = cmd + return + + self.all_lint_cmd = cmd + + def get_rel_fname(self, fname): + if self.root: + return os.path.relpath(fname, self.root) + else: + return fname + + def run_cmd(self, cmd, rel_fname, code): + cmd += " " + rel_fname + cmd = cmd.split() + process = subprocess.Popen(cmd, cwd=self.root, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout, _ = process.communicate() + errors = stdout.decode().strip() + self.returncode = process.returncode + if self.returncode == 0: + return # zero exit status + + cmd = " ".join(cmd) + res = "" + res += errors + line_num = extract_error_line_from(res) + return LintResult(text=res, lines=[line_num]) + + def get_abs_fname(self, fname): + if os.path.isabs(fname): + return fname + elif os.path.isfile(fname): + rel_fname = self.get_rel_fname(fname) + return os.path.abspath(rel_fname) + else: # if a temp file + return self.get_rel_fname(fname) + + def lint(self, fname, cmd=None) -> Optional[LintResult]: + code = Path(fname).read_text(self.encoding) + absolute_fname = self.get_abs_fname(fname) + if cmd: + cmd = cmd.strip() + if not cmd: + lang = filename_to_lang(fname) + if not lang: + return None + if self.all_lint_cmd: + cmd = self.all_lint_cmd + else: + cmd = self.languages.get(lang) + if callable(cmd): + linkres = cmd(fname, absolute_fname, code) + elif cmd: + linkres = self.run_cmd(cmd, absolute_fname, code) + else: + linkres = basic_lint(absolute_fname, code) + return linkres + + def flake_lint(self, rel_fname, code): + fatal = "F821,F822,F831,E112,E113,E999,E902" + flake8 = f"flake8 --select={fatal} --isolated" + + try: + flake_res = self.run_cmd(flake8, 
rel_fname, code)
+        except FileNotFoundError:
+            flake_res = None
+        return flake_res
+
+    def py_lint(self, fname, rel_fname, code):
+        error = self.flake_lint(rel_fname, code)
+        if not error:
+            error = lint_python_compile(fname, code)
+        if not error:
+            error = basic_lint(rel_fname, code)
+        return error
+
+
+def lint_python_compile(fname, code):
+    try:
+        compile(code, fname, "exec")  # USE TRACEBACK BELOW HERE
+        return
+    except IndentationError as err:
+        end_lineno = getattr(err, "end_lineno", err.lineno)
+        if isinstance(end_lineno, int):
+            line_numbers = list(range(end_lineno - 1, end_lineno))
+        else:
+            line_numbers = []
+
+        tb_lines = traceback.format_exception(type(err), err, err.__traceback__)
+        last_file_i = 0
+
+        # the marker is built in two pieces so the only source line containing it
+        # verbatim is the compile(...) call above, i.e. the traceback frame to cut at
+        target = "# USE TRACEBACK"
+        target += " BELOW HERE"
+        for i in range(len(tb_lines)):
+            if target in tb_lines[i]:
+                last_file_i = i
+                break
+        tb_lines = tb_lines[:1] + tb_lines[last_file_i + 1 :]
+
+        res = "".join(tb_lines)
+        return LintResult(text=res, lines=line_numbers)
+
+
+def basic_lint(fname, code):
+    """
+    Use tree-sitter to look for syntax errors, display them with tree context.
+    """
+
+    lang = filename_to_lang(fname)
+    if not lang:
+        return
+
+    parser = get_parser(lang)
+    tree = parser.parse(bytes(code, "utf-8"))
+
+    errors = traverse_tree(tree.root_node)
+    if not errors:
+        return
+    return LintResult(text=f"{fname}:{errors[0]}", lines=errors)
+
+
+def extract_error_line_from(lint_error):
+    # moved from openhands.agentskills#_lint_file
+    first_error_line = None
+    for line in lint_error.splitlines(True):
+        if line.strip():
+            # the expected error format is "<filename>:<line>:<column>: <message>"
+            parts = line.split(":")
+            if len(parts) >= 2:
+                try:
+                    first_error_line = int(parts[1])
+                    break
+                except ValueError:
+                    continue
+    return first_error_line
+
+
+def tree_context(fname, code, line_nums):
+    context = TreeContext(
+        fname,
+        code,
+        color=False,
+        line_number=True,
+        child_context=False,
+        last_line=False,
+        margin=0,
+        mark_lois=True,
+        loi_pad=3,
+        # header_max=30,
+        show_top_of_file_parent_scope=False,
+    )
+    line_nums = set(line_nums)
+    context.add_lines_of_interest(line_nums)
+    context.add_context()
+    output = context.format()
+
+    return output
+
+
+# Traverse the tree to find errors
+def traverse_tree(node):
+    errors = []
+    if node.type == "ERROR" or node.is_missing:
+        line_no = node.start_point[0] + 1
+        errors.append(line_no)
+
+    for child in node.children:
+        errors += traverse_tree(child)
+
+    return errors
+
+
+def main():
+    """
+    Main function to parse files provided as command line arguments.
+ """ + if len(sys.argv) < 2: + print("Usage: python linter.py ...") + sys.exit(1) + + linter = Linter(root=os.getcwd()) + for file_path in sys.argv[1:]: + errors = linter.lint(file_path) + if errors: + print(errors) + + +if __name__ == "__main__": + main() diff --git a/metagpt/utils/common.py b/metagpt/utils/common.py index 42a872c76..90f13da23 100644 --- a/metagpt/utils/common.py +++ b/metagpt/utils/common.py @@ -16,6 +16,7 @@ import base64 import contextlib import csv import functools +import hashlib import importlib import inspect import json @@ -889,7 +890,7 @@ async def get_mime_type(filename: str | Path, force_read: bool = False) -> str: } try: - stdout, stderr, _ = await shell_execute(f"file --mime-type {str(filename)}") + stdout, stderr, _ = await shell_execute(f"file --mime-type '{str(filename)}'") if stderr: logger.debug(f"file:{filename}, error:{stderr}") return guess_mime_type @@ -1175,3 +1176,23 @@ def rectify_pathname(path: Union[str, Path], default_filename: str) -> Path: else: output_pathname.parent.mkdir(parents=True, exist_ok=True) return output_pathname + + +def generate_fingerprint(text: str) -> str: + """ + Generate a fingerprint for the given text + + Args: + text (str): The text for which the fingerprint needs to be generated + + Returns: + str: The fingerprint value of the text + """ + text_bytes = text.encode("utf-8") + + # calculate SHA-256 hash + sha256 = hashlib.sha256() + sha256.update(text_bytes) + fingerprint = sha256.hexdigest() + + return fingerprint diff --git a/metagpt/utils/report.py b/metagpt/utils/report.py index 5d1cd93e4..427f401ab 100644 --- a/metagpt/utils/report.py +++ b/metagpt/utils/report.py @@ -306,7 +306,7 @@ class DocsReporter(FileReporter): class EditorReporter(FileReporter): - """Equivalent to FileReporter(block=BlockType.Editor).""" + """Equivalent to FileReporter(block=BlockType.EDITOR).""" block: Literal[BlockType.EDITOR] = BlockType.EDITOR diff --git a/requirements.txt b/requirements.txt index 23806eb63..ed8965b46 100644 --- a/requirements.txt +++ b/requirements.txt @@ -74,3 +74,5 @@ pylint~=3.0.3 pygithub~=2.3 htmlmin fsspec +grep-ast~=0.3.3 # linter +tree-sitter~=0.21.3 # linter \ No newline at end of file diff --git a/tests/data/embedding/2.answer.md b/tests/data/embedding/2.answer.md new file mode 100644 index 000000000..3807f03c1 --- /dev/null +++ b/tests/data/embedding/2.answer.md @@ -0,0 +1,2 @@ +检索结果 +法务查询者可根据国际小超人钉钉小程序UI上的滚筒切换业务线 这张图片展示了一个移动应用的界面,界面标题为“法律意见详情”。用户可以根据具体情况切换业务线。界面中有多个字段,包括“国家名称”、“国家情况描述”、“业务线”、“产品法规分析”和“签约主体”。第一张截图显示了详细的法律情报信息,包含区域名称、区域情况描述、业务线和产品法规概述等字段。第二张截图显示了“法律意见详情”界面,其中列出了国家名称、国家情况描述、业务线、产品法规分析和签约主体。第三张截图与第二张相似,但显示了选项的可选择状态。最下方有“取消”和“确定”的按钮。 法务查询者从国家详情中的业务线名列表中选出要查看的业务线。 \ No newline at end of file diff --git a/tests/data/embedding/2.knowledge.md b/tests/data/embedding/2.knowledge.md new file mode 100644 index 000000000..615614098 --- /dev/null +++ b/tests/data/embedding/2.knowledge.md @@ -0,0 +1,25 @@ +## Textual User Requirements + +### 3.2. 
首页 + +首页有两个分区,上面部分是法律意见检索栏。 + +法务查询者第一次进入国际小超人钉钉小程序展示引导页,以后进入不再展示,点击「我知道了」引导页消失。 + +#### 首页 +![首页](1.png) +这是一个名为“法务小超人”的移动应用程序的界面截图。界面顶部显示了应用名称和一个可切换语言的按钮“English”。在界面中间部分,有一个标题“法律意见查询”,以及一个搜索框,提示输入国家名称以查询法律意见。下方显示已收录法律意见8394篇。界面下半部分是“法务 Q&A”部分,列出了一些法律相关的选项,例如“国际法务接入口人”、“国内法务接入口人”、“国际法律协议合同办理指引”和“国内法律协议合同办理指引”。界面底部有三个导航按钮,分别是“首页”、“模板”和“我的”。 + +#### 按国家名维度搜索 +法务查询者在国际小超人钉钉小程序的搜索框中进行检索时采用typeahead,只能下拉选择法务中台中有的国家名称。 +![按国家名维度搜索](2.png) +在这张图像中,用户正在一个名为“法律意见查询”的应用中进行国家名称的搜索。用户在搜索框中输入国家名称时,系统会提供下拉建议。这些建议基于 typeahead 功能,从法务中台中筛选出匹配的国家名称供用户选择。目前,搜索结果包含了“中国”和“菲律宾”两个具体的国家名称,其它显示为“国家名”。用户可以通过下拉菜单快速选择所需的国家名称。 + +#### 检索结果 +法务查询者可根据国际小超人钉钉小程序UI上的滚筒切换业务线 +![检索结果](3.png) +这张图片展示了一个移动应用的界面,界面标题为“法律意见详情”。用户可以根据具体情况切换业务线。界面中有多个字段,包括“国家名称”、“国家情况描述”、“业务线”、“产品法规分析”和“签约主体”。第一张截图显示了详细的法律情报信息,包含区域名称、区域情况描述、业务线和产品法规概述等字段。第二张截图显示了“法律意见详情”界面,其中列出了国家名称、国家情况描述、业务线、产品法规分析和签约主体。第三张截图与第二张相似,但显示了选项的可选择状态。最下方有“取消”和“确定”的按钮。 +法务查询者从国家详情中的业务线名列表中选出要查看的业务线。 + +#### 查看法律意见详情 +国际小超人钉钉小程序用国家代码和业务代码做参数,查询法律意见详情,然后将法律意见详情展示给法务查询者。 \ No newline at end of file diff --git a/tests/data/embedding/2.query.md b/tests/data/embedding/2.query.md new file mode 100644 index 000000000..ba470b8bd --- /dev/null +++ b/tests/data/embedding/2.query.md @@ -0,0 +1 @@ +业务线UI有哪些操作? \ No newline at end of file diff --git a/tests/data/embedding/3.answer.md b/tests/data/embedding/3.answer.md new file mode 100644 index 000000000..35b0c6899 --- /dev/null +++ b/tests/data/embedding/3.answer.md @@ -0,0 +1,7 @@ +国家/区域导游详情 & 法律意见详情 查询 +Description:根据国家code查询国家/区域导游信息详情 +ID: 8 +HTTP METHOD: GET +Endpoint: /contract/country/navigate.json +Input Parameters: |名称|描述|类型(长度)|必选|备注| | :- | :- | :-: | :- | :- | |countryCode|国家code|string|√|| +Returns: |名称|描述|类型(长度)|必选|备注| | :- | :- | :-: | :- | :- | |success|业务处理成功true,否则false|boolean|√|只判断这个属性即可| |message|错误信息,可以用来提示|string|√|| |code|返回状态码|string|√|| |data|国家/区域导游详情|object|√|| |-> country||||| |-> -> id|id|integer|√|| |-> -> country|国家code|string|√|| |-> -> countryName|国家中文名称|string|√|| |-> -> countryNameEn|国家英文名称|string|√|| |-> -> content|国家导游中文详情json数组,具体格式见下示例|list of object|√|| |-> -> -> title|标题|object|√|| |-> -> -> -> title|中文标题|string||| |-> -> -> -> titleEn|英文标题|string||| |-> -> -> contentList|标题下面的文字描述列表|list of object|√|| |-> -> -> -> detail|内容中文详情|string|√|| |-> -> -> -> detailEn|内容英文详情|string|√|| |-> -> -> -> url|超链接|string||| |-> legal|法务信息|object||| |-> -> country|国家code|string|√|| |-> -> businessList|业务线列表|list of object||| |-> -> -> id|id|integer||新增时不传,修改时传递| |-> -> -> business|业务线code|string|√|| |-> -> -> businessName|业务线中文名称|string|√|| |-> -> -> businessNameEn|业务线英文名称|string|√|| |-> -> -> content|业务线json,具体如下|object|√|| |-> -> -> -> detailEn|具体的详情英文内容|string|√|| |-> -> -> -> detail|具体的详情内容|string|√|| \ No newline at end of file diff --git a/tests/data/embedding/3.knowledge.md b/tests/data/embedding/3.knowledge.md new file mode 100644 index 000000000..61de5f4b8 --- /dev/null +++ b/tests/data/embedding/3.knowledge.md @@ -0,0 +1,189 @@ +## Interfaces +- 用户登录 + - Description: 用户从小程序/微应用发起请求,需要验证用户的合法身份才能正常处理。 + - ID: 1 + - HTTP METHOD: GET + - Endpoint: `/sup/login.json` + - Input Parameters: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |authCode|用户临时免登授权码|String(64)|√|| + |loginTypeEnum|登录类型|String(20)|√|| + |authCorpId|用户所在企业/组织id|String(64)||微应用免登时传递| + |app|应用标识|String(3)|√|| + - Returns: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |success|业务处理成功与否,成功true,否则false|boolean|√|只判断这个属性即可| + |message|错误信息,可以用来提示|string|√|| + |code|返回状态码|string|√|| + |data|用户的sessionId|string|√|| +- 
根据sessionId查询用户详细信息 + - Description: 查询当前用户的详细信息,如 staffId,unionId,name,avatar等信息 + - ID: 2 + - HTTP METHOD: GET + - Endpoint: `/sup/user.json` + - Input Parameters: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |NDA_SESSION|用户sessionId|String(64)|√|| + - Returns: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |success|业务处理成功与否,成功true,否则false|boolean|√|只判断这个属性即可| + |message|错误信息,可以用来提示|string|√|| + |code|返回状态码|string|√|| + |data|用户的详细信息|object|√|| + |-> corpId|当前用户企业 钉钉ID(小程序端会拿不到该信息)|string|√|| + |-> corpName|当前用户企业名称(小程序端会拿不到该信息)|string|√|| + |-> staffId|员工在当前企业内的唯一标识,也称staffId(小程序端会拿不到该信息)|string|√|| + |-> unionId|员工在当前开发者企业账号范围内的唯一标识,系统生成,固定值,不会改变。|string|√|| + |-> name|当前用户的名称(小程序端会拿不到该信息)|string|√|| + |-> avatar|头像图片URL|string|√|| +- 查询国家情况描述 + - Description: 根据国家code查询国家情况描述 + - ID: 3 + - HTTP METHOD: GET + - Endpoint: `/sup/country/detail.json` + - Input Parameters: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |countryCode|国家code|string|√|| + - Returns: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |success|业务处理成功true,否则false|boolean|√|只判断这个属性即可| + |message|错误信息,可以用来提示|string|√|| + |code|返回状态码|string|√|| + |data|国家情况描述|object|√|| + |-> id|id|integer|√|| + |-> countryName|国家名称|string|√|| + |-> countryCode|国家code|string|√|| + |-> detail|产品法规分析|string|√|| +- 查询产品法规分析(法律意见详情) + - Description: 根据国家和业务线查询产品法规分析 + - ID: 4 + - HTTP METHOD: GET + - Endpoint: `/sup/legal/detail.json` + - Input Parameters: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |countryCode|国家code|string|√|| + |businessCode|业务线code|string|√|| + - Returns: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |success|业务处理成功true,否则false|boolean|√|只判断这个属性即可| + |message|错误信息,可以用来提示|string|√|| + |code|返回状态码|string|√|| + |data|法律意见详情|object|√|| + |-> id|id|integer|√|| + |-> countryName|国家名称|string|√|| + |-> countryCode|国家code|string|√|| + |-> businessLine|业务线|string|√|| + |-> businessCode|业务线code|string|√|| + |-> detail|产品法规分析|string|√|| + |-> signEntity|签约主体|string|√|| +- 查询法律意见总数 + - Description: 法律意见总数查询 + - ID: 5 + - HTTP METHOD: GET + - Endpoint: `/sup/legal/count.json` + - Input Parameters: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + - Returns: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |success|业务处理成功true,否则false|boolean|√|只判断这个属性即可| + |message|错误信息,可以用来提示|string|√|| + |code|返回状态码|string|√|| + |data|总数|integer|√|| +- 查询所有国家和业务线信息列表 + - Description: 查询所有国家和业务线信息列表 + - ID: 6 + - HTTP METHOD: GET + - Endpoint: `/sup/legal/country/list.json` + - Input Parameters: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + - Returns: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |success|业务处理成功true,否则false|boolean|√|只判断这个属性即可| + |message|错误信息,可以用来提示|string|√|| + |code|返回状态码|string|√|| + |data|所有数据列表|list of object|√|| + |-> country|国家code|string|√|| + |-> business|业务线code|string|√|| + |-> dataType|数据类型|string|√|| + |-> businessName|业务线名|string|√|| + |-> countryName|国家名|string|√|| + |-> businessNameEn|业务线名(英文)|string|√|| +- 调用法务中台antlaw接口 + - ID: 7 +- 国家/区域导游详情 & 法律意见详情 查询 + - Description:根据国家code查询国家/区域导游信息详情 + - ID: 8 + - HTTP METHOD: GET + - Endpoint: `/contract/country/navigate.json` + - Input Parameters: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |countryCode|国家code|string|√|| + - Returns: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |success|业务处理成功true,否则false|boolean|√|只判断这个属性即可| + |message|错误信息,可以用来提示|string|√|| + |code|返回状态码|string|√|| + |data|国家/区域导游详情|object|√|| + |-> country||||| + |-> -> id|id|integer|√|| + |-> -> 
country|国家code|string|√|| + |-> -> countryName|国家中文名称|string|√|| + |-> -> countryNameEn|国家英文名称|string|√|| + |-> -> content|国家导游中文详情json数组,具体格式见下示例|list of object|√|| + |-> -> -> title|标题|object|√|| + |-> -> -> -> title|中文标题|string||| + |-> -> -> -> titleEn|英文标题|string||| + |-> -> -> contentList|标题下面的文字描述列表|list of object|√|| + |-> -> -> -> detail|内容中文详情|string|√|| + |-> -> -> -> detailEn|内容英文详情|string|√|| + |-> -> -> -> url|超链接|string||| + |-> legal|法务信息|object||| + |-> -> country|国家code|string|√|| + |-> -> businessList|业务线列表|list of object||| + |-> -> -> id|id|integer||新增时不传,修改时传递| + |-> -> -> business|业务线code|string|√|| + |-> -> -> businessName|业务线中文名称|string|√|| + |-> -> -> businessNameEn|业务线英文名称|string|√|| + |-> -> -> content|业务线json,具体如下|object|√|| + |-> -> -> -> detailEn|具体的详情英文内容|string|√|| + |-> -> -> -> detail|具体的详情内容|string|√|| +- 国家/区域导游列表分页查询 + - Description: 分页查询国家/区域列表 + - ID: 9 + - HTTP METHOD: GET + - Endpoint: `/contract/country/list.json` + - Input Parameters: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |pageSize|分页大小|integer|√|>=1| + |pageNum|分页大小|integer|√|>=1| + |country|国家code|string||| + |business|业务线code|string||| + - Returns: + |名称|描述|类型(长度)|必选|备注| + | :- | :- | :-: | :- | :- | + |success|业务处理成功true,否则false|boolean|√|只判断这个属性即可| + |message|错误信息,可以用来提示|string|√|| + |code|返回状态码|string|√|| + |data|国家/区域导游详情|list of object|√|| + |-> id|id|integer|√|| + |-> country|国家code|string|√|| + |-> countryName|国家中文名称|string|√|| + |-> countryNameEn|国家英文名称|string|√|| + |-> gmtCreate|创建时间|string|√|| + |-> gmtModified|更新时间|string|√|| + |total|数据总量|integer|√|| diff --git a/tests/data/embedding/3.query.md b/tests/data/embedding/3.query.md new file mode 100644 index 000000000..6026899d7 --- /dev/null +++ b/tests/data/embedding/3.query.md @@ -0,0 +1 @@ +根据国家code查询国家业务线列表 \ No newline at end of file diff --git a/tests/metagpt/rag/__init__.py b/tests/metagpt/rag/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/metagpt/rag/test_large_pdf.py b/tests/metagpt/rag/test_large_pdf.py new file mode 100644 index 000000000..4f343aa87 --- /dev/null +++ b/tests/metagpt/rag/test_large_pdf.py @@ -0,0 +1,55 @@ +import pytest + +from metagpt.config2 import Config +from metagpt.const import TEST_DATA_PATH +from metagpt.rag.engines import SimpleEngine +from metagpt.rag.factories.embedding import RAGEmbeddingFactory +from metagpt.utils.common import aread + + +@pytest.mark.skip +@pytest.mark.parametrize( + ("knowledge_filename", "query_filename", "answer_filename"), + [ + ( + TEST_DATA_PATH / "embedding/2.knowledge.md", + TEST_DATA_PATH / "embedding/2.query.md", + TEST_DATA_PATH / "embedding/2.answer.md", + ), + ( + TEST_DATA_PATH / "embedding/3.knowledge.md", + TEST_DATA_PATH / "embedding/3.query.md", + TEST_DATA_PATH / "embedding/3.answer.md", + ), + ], +) +@pytest.mark.asyncio +async def test_large_pdf(knowledge_filename, query_filename, answer_filename): + Config.default(reload=True) # `config.embedding.model = "text-embedding-ada-002"` changes the cache. 
+ + engine = SimpleEngine.from_docs( + input_files=[knowledge_filename], + ) + + query = await aread(filename=query_filename) + rsp = await engine.aretrieve(query) + assert rsp + + config = Config.default() + config.embedding.model = "text-embedding-ada-002" + factory = RAGEmbeddingFactory(config) + embedding = factory.get_rag_embedding() + answer = await aread(filename=answer_filename) + answer_embedding = await embedding.aget_text_embedding(answer) + similarity = 0 + for i in rsp: + rsp_embedding = await embedding.aget_query_embedding(i.text) + v = embedding.similarity(answer_embedding, rsp_embedding) + similarity = max(similarity, v) + + print(similarity) + assert similarity > 0.9 + + +if __name__ == "__main__": + pytest.main([__file__, "-s"]) diff --git a/tests/metagpt/tools/libs/test_editor.py b/tests/metagpt/tools/libs/test_editor.py index 64149fdb7..bcef2b74e 100644 --- a/tests/metagpt/tools/libs/test_editor.py +++ b/tests/metagpt/tools/libs/test_editor.py @@ -1,7 +1,7 @@ import pytest from metagpt.const import TEST_DATA_PATH -from metagpt.tools.libs.editor import Editor, FileBlock +from metagpt.tools.libs.editor import Editor TEST_FILE_CONTENT = """ # this is line one @@ -13,31 +13,24 @@ def test_function_for_fm(): # this is the 7th line """.strip() -TEST_FILE_PATH = TEST_DATA_PATH / "tools/test_script_for_editor.py" +WINDOW = 100 @pytest.fixture -def test_file(): - with open(TEST_FILE_PATH, "w") as f: - f.write(TEST_FILE_CONTENT) - yield - with open(TEST_FILE_PATH, "w") as f: - f.write("") +def temp_file_path(tmp_path): + assert tmp_path is not None + temp_file_path = tmp_path / "a.txt" + yield temp_file_path + temp_file_path.unlink() -EXPECTED_SEARCHED_BLOCK = FileBlock( - file_path=str(TEST_FILE_PATH), - block_content='001|# this is line one\n002|def test_function_for_fm():\n003| "some docstring"\n004| a = 1\n005| b = 2\n', - block_start_line=1, - block_end_line=5, - symbol="def test_function_for_fm", - symbol_line=2, -) - - -def test_search_content(test_file): - block = Editor().search_content("def test_function_for_fm", root_path=TEST_DATA_PATH, window=3) - assert block == EXPECTED_SEARCHED_BLOCK +@pytest.fixture +def temp_py_file(tmp_path): + assert tmp_path is not None + temp_file_path = tmp_path / "test_script_for_editor.py" + temp_file_path.write_text(TEST_FILE_CONTENT) + yield temp_file_path + temp_file_path.unlink() EXPECTED_CONTENT_AFTER_REPLACE = """ @@ -50,32 +43,43 @@ def test_function_for_fm(): """.strip() -@pytest.mark.skip -def test_replace_content(test_file): - Editor().write_content( - file_path=str(TEST_FILE_PATH), - start_line=3, - end_line=5, - new_block_content=" # This is the new line A replacing lines 3 to 5.\n # This is the new line B.", +def test_replace_content(temp_py_file): + editor = Editor() + editor._edit_file_impl( + file_name=temp_py_file, + start=3, + end=5, + content=" # This is the new line A replacing lines 3 to 5.\n # This is the new line B.", + is_insert=False, + is_append=False, ) - with open(TEST_FILE_PATH, "r") as f: + with open(temp_py_file, "r") as f: new_content = f.read() - assert new_content == EXPECTED_CONTENT_AFTER_REPLACE + assert new_content.strip() == EXPECTED_CONTENT_AFTER_REPLACE.strip() EXPECTED_CONTENT_AFTER_DELETE = """ # this is line one def test_function_for_fm(): + c = 3 # this is the 7th line """.strip() -def test_delete_content(test_file): - Editor().write_content(file_path=str(TEST_FILE_PATH), start_line=3, end_line=5) - with open(TEST_FILE_PATH, "r") as f: +def test_delete_content(temp_py_file): + editor = Editor() 
+ editor._edit_file_impl( + file_name=temp_py_file, + start=3, + end=5, + content="", + is_insert=False, + is_append=False, + ) + with open(temp_py_file, "r") as f: new_content = f.read() - assert new_content == EXPECTED_CONTENT_AFTER_DELETE + assert new_content.strip() == EXPECTED_CONTENT_AFTER_DELETE.strip() EXPECTED_CONTENT_AFTER_INSERT = """ @@ -90,39 +94,16 @@ def test_function_for_fm(): """.strip() -@pytest.mark.skip -def test_insert_content(test_file): - Editor().write_content( - file_path=str(TEST_FILE_PATH), - start_line=3, - end_line=-1, - new_block_content=" # This is the new line to be inserted, at line 3", +def test_insert_content(temp_py_file): + editor = Editor(enable_auto_lint=True) + editor.insert_content_at_line( + file_name=temp_py_file, + line_number=3, + content=" # This is the new line to be inserted, at line 3", ) - with open(TEST_FILE_PATH, "r") as f: + with open(temp_py_file, "r") as f: new_content = f.read() - assert new_content == EXPECTED_CONTENT_AFTER_INSERT - - -@pytest.mark.skip -def test_new_content_wrong_indentation(test_file): - msg = Editor().write_content( - file_path=str(TEST_FILE_PATH), - start_line=3, - end_line=-1, - new_block_content=" This is the new line to be inserted, at line 3", # omit # should throw a syntax error - ) - assert "failed" in msg - - -@pytest.mark.skip -def test_new_content_format_issue(test_file): - msg = Editor().write_content( - file_path=str(TEST_FILE_PATH), - start_line=3, - end_line=-1, - new_block_content=" # This is the new line to be inserted, at line 3 ", # trailing spaces are format issue only, and should not throw an error - ) - assert "failed" not in msg + assert new_content.strip() == EXPECTED_CONTENT_AFTER_INSERT.strip() @pytest.mark.parametrize( @@ -151,5 +132,518 @@ async def test_read_files(filename): assert file_block.block_content +def _numbered_test_lines(start, end) -> str: + return ("\n".join(f"{i}|" for i in range(start, end + 1))) + "\n" + + +def _generate_test_file_with_lines(temp_path, num_lines) -> str: + file_path = temp_path / "test_file.py" + file_path.write_text("\n" * num_lines) + return file_path + + +def _generate_ruby_test_file_with_lines(temp_path, num_lines) -> str: + file_path = temp_path / "test_file.rb" + file_path.write_text("\n" * num_lines) + return file_path + + +def _calculate_window_bounds(current_line, total_lines, window_size): + half_window = window_size // 2 + if current_line - half_window < 0: + start = 1 + end = window_size + else: + start = current_line - half_window + end = current_line + half_window + return start, end + + +def test_open_file_unexist_path(): + editor = Editor() + with pytest.raises(FileNotFoundError): + editor.open_file("/unexist/path/a.txt") + + +def test_open_file(temp_file_path): + editor = Editor() + temp_file_path.write_text("Line 1\nLine 2\nLine 3\nLine 4\nLine 5") + + result = editor.open_file(str(temp_file_path)) + + assert result is not None + expected = ( + f"[File: {temp_file_path} (5 lines total)]\n" + "(this is the beginning of the file)\n" + "1|Line 1\n" + "2|Line 2\n" + "3|Line 3\n" + "4|Line 4\n" + "5|Line 5\n" + "(this is the end of the file)" + ) + assert result.split("\n") == expected.split("\n") + + +def test_open_file_with_indentation(temp_file_path): + editor = Editor() + temp_file_path.write_text("Line 1\n Line 2\nLine 3\nLine 4\nLine 5") + + result = editor.open_file(str(temp_file_path)) + assert result is not None + expected = ( + f"[File: {temp_file_path} (5 lines total)]\n" + "(this is the beginning of the file)\n" + "1|Line 1\n" + 
"2| Line 2\n" + "3|Line 3\n" + "4|Line 4\n" + "5|Line 5\n" + "(this is the end of the file)" + ) + assert result.split("\n") == expected.split("\n") + + +def test_open_file_long(temp_file_path): + editor = Editor() + content = "\n".join([f"Line {i}" for i in range(1, 1001)]) + temp_file_path.write_text(content) + + result = editor.open_file(str(temp_file_path), 1, 50) + assert result is not None + expected = f"[File: {temp_file_path} (1000 lines total)]\n" + expected += "(this is the beginning of the file)\n" + for i in range(1, 51): + expected += f"{i}|Line {i}\n" + expected += "(950 more lines below)" + assert result.split("\n") == expected.split("\n") + + +def test_open_file_long_with_lineno(temp_file_path): + editor = Editor() + content = "\n".join([f"Line {i}" for i in range(1, 1001)]) + temp_file_path.write_text(content) + + cur_line = 100 + + result = editor.open_file(str(temp_file_path), cur_line) + assert result is not None + expected = f"[File: {temp_file_path} (1000 lines total)]\n" + start, end = _calculate_window_bounds(cur_line, 1000, WINDOW) + if start == 1: + expected += "(this is the beginning of the file)\n" + else: + expected += f"({start - 1} more lines above)\n" + for i in range(start, end + 1): + expected += f"{i}|Line {i}\n" + if end == 1000: + expected += "(this is the end of the file)\n" + else: + expected += f"({1000 - end} more lines below)" + assert result.split("\n") == expected.split("\n") + + +def test_create_file_unexist_path(): + editor = Editor() + with pytest.raises(FileNotFoundError): + editor.create_file("/unexist/path/a.txt") + + +def test_create_file(temp_file_path): + editor = Editor() + result = editor.create_file(str(temp_file_path)) + + expected = f"[File {temp_file_path} created.]" + assert result.split("\n") == expected.split("\n") + + +def test_goto_line(temp_file_path): + editor = Editor() + total_lines = 1000 + content = "\n".join([f"Line {i}" for i in range(1, total_lines + 1)]) + temp_file_path.write_text(content) + + result = editor.open_file(str(temp_file_path)) + assert result is not None + + expected = f"[File: {temp_file_path} ({total_lines} lines total)]\n" + expected += "(this is the beginning of the file)\n" + for i in range(1, WINDOW + 1): + expected += f"{i}|Line {i}\n" + expected += f"({total_lines - WINDOW} more lines below)" + assert result.split("\n") == expected.split("\n") + + result = editor.goto_line(500) + + assert result is not None + + cur_line = 500 + expected = f"[File: {temp_file_path} ({total_lines} lines total)]\n" + start, end = _calculate_window_bounds(cur_line, total_lines, WINDOW) + if start == 1: + expected += "(this is the beginning of the file)\n" + else: + expected += f"({start - 1} more lines above)\n" + for i in range(start, end + 1): + expected += f"{i}|Line {i}\n" + if end == total_lines: + expected += "(this is the end of the file)\n" + else: + expected += f"({total_lines - end} more lines below)" + assert result.split("\n") == expected.split("\n") + + +def test_goto_line_negative(temp_file_path): + editor = Editor() + content = "\n".join([f"Line {i}" for i in range(1, 5)]) + temp_file_path.write_text(content) + + editor.open_file(str(temp_file_path)) + with pytest.raises(ValueError): + editor.goto_line(-1) + + +def test_goto_line_out_of_bound(temp_file_path): + editor = Editor() + content = "\n".join([f"Line {i}" for i in range(1, 5)]) + temp_file_path.write_text(content) + + editor.open_file(str(temp_file_path)) + with pytest.raises(ValueError): + editor.goto_line(100) + + +def 
test_scroll_down(temp_file_path): + editor = Editor() + total_lines = 1000 + content = "\n".join([f"Line {i}" for i in range(1, total_lines + 1)]) + temp_file_path.write_text(content) + result = editor.open_file(str(temp_file_path)) + assert result is not None + + expected = f"[File: {temp_file_path} ({total_lines} lines total)]\n" + start, end = _calculate_window_bounds(1, total_lines, WINDOW) + if start == 1: + expected += "(this is the beginning of the file)\n" + else: + expected += f"({start - 1} more lines above)\n" + for i in range(start, end + 1): + expected += f"{i}|Line {i}\n" + if end == total_lines: + expected += "(this is the end of the file)" + else: + expected += f"({total_lines - end} more lines below)" + assert result.split("\n") == expected.split("\n") + + result = editor.scroll_down() + + assert result is not None + + expected = f"[File: {temp_file_path} ({total_lines} lines total)]\n" + start, end = _calculate_window_bounds(WINDOW + 1, total_lines, WINDOW) + if start == 1: + expected += "(this is the beginning of the file)\n" + else: + expected += f"({start - 1} more lines above)\n" + for i in range(start, end + 1): + expected += f"{i}|Line {i}\n" + if end == total_lines: + expected += "(this is the end of the file)\n" + else: + expected += f"({total_lines - end} more lines below)" + assert result.split("\n") == expected.split("\n") + + +def test_scroll_up(temp_file_path): + editor = Editor() + total_lines = 1000 + content = "\n".join([f"Line {i}" for i in range(1, total_lines + 1)]) + temp_file_path.write_text(content) + + cur_line = 300 + + result = editor.open_file(str(temp_file_path), cur_line) + assert result is not None + + expected = f"[File: {temp_file_path} ({total_lines} lines total)]\n" + start, end = _calculate_window_bounds(cur_line, total_lines, WINDOW) + if start == 1: + expected += "(this is the beginning of the file)\n" + else: + expected += f"({start - 1} more lines above)\n" + for i in range(start, end + 1): + expected += f"{i}|Line {i}\n" + if end == total_lines: + expected += "(this is the end of the file)\n" + else: + expected += f"({total_lines - end} more lines below)" + assert result.split("\n") == expected.split("\n") + result = editor.scroll_up() + assert result is not None + + cur_line = cur_line - WINDOW + + expected = f"[File: {temp_file_path} ({total_lines} lines total)]\n" + start, end = _calculate_window_bounds(cur_line, total_lines, WINDOW) + if start == 1: + expected += "(this is the beginning of the file)\n" + else: + expected += f"({start - 1} more lines above)\n" + for i in range(start, end + 1): + expected += f"{i}|Line {i}\n" + if end == total_lines: + expected += "(this is the end of the file)\n" + else: + expected += f"({total_lines - end} more lines below)" + assert result.split("\n") == expected.split("\n") + + +def test_scroll_down_edge(temp_file_path): + editor = Editor() + content = "\n".join([f"Line {i}" for i in range(1, 10)]) + temp_file_path.write_text(content) + + result = editor.open_file(str(temp_file_path)) + assert result is not None + + expected = f"[File: {temp_file_path} (9 lines total)]\n" + expected += "(this is the beginning of the file)\n" + for i in range(1, 10): + expected += f"{i}|Line {i}\n" + expected += "(this is the end of the file)" + + result = editor.scroll_down() + assert result is not None + + assert result.split("\n") == expected.split("\n") + + +def test_print_window_internal(temp_file_path): + editor = Editor() + editor.create_file(str(temp_file_path)) + with open(temp_file_path, "w") as file: 
+        for i in range(1, 101):
+            file.write(f"Line `{i}`\n")
+
+    current_line = 50
+    window = 2
+
+    result = editor._print_window(temp_file_path, current_line, window)
+    expected = "(48 more lines above)\n" "49|Line `49`\n" "50|Line `50`\n" "51|Line `51`\n" "(49 more lines below)"
+    assert result == expected
+
+
+def test_open_file_large_line_number(temp_file_path):
+    editor = Editor()
+    editor.create_file(str(temp_file_path))
+    with open(temp_file_path, "w") as file:
+        for i in range(1, 1000):
+            file.write(f"Line `{i}`\n")
+
+    current_line = 800
+    window = 100
+
+    result = editor.open_file(str(temp_file_path), current_line, window)
+
+    expected = f"[File: {temp_file_path} (999 lines total)]\n"
+    expected += "(749 more lines above)\n"
+    for i in range(750, 850 + 1):
+        expected += f"{i}|Line `{i}`\n"
+    expected += "(149 more lines below)"
+    assert result == expected
+
+
+def test_open_file_large_line_number_consecutive_diff_window(temp_file_path):
+    editor = Editor()
+    editor.create_file(str(temp_file_path))
+    total_lines = 1000
+    with open(temp_file_path, "w") as file:
+        for i in range(1, total_lines + 1):
+            file.write(f"Line `{i}`\n")
+
+    current_line = 800
+    cur_window = 300
+
+    result = editor.open_file(str(temp_file_path), current_line, cur_window)
+
+    expected = f"[File: {temp_file_path} ({total_lines} lines total)]\n"
+    start, end = _calculate_window_bounds(current_line, total_lines, cur_window)
+    if start == 1:
+        expected += "(this is the beginning of the file)\n"
+    else:
+        expected += f"({start - 1} more lines above)\n"
+    for i in range(current_line - cur_window // 2, current_line + cur_window // 2 + 1):
+        expected += f"{i}|Line `{i}`\n"
+    if end == total_lines:
+        expected += "(this is the end of the file)\n"
+    else:
+        expected += f"({total_lines - end} more lines below)"
+    assert result == expected
+
+    current_line = current_line - WINDOW
+
+    result = editor.scroll_up()
+
+    expected = f"[File: {temp_file_path} ({total_lines} lines total)]\n"
+    start, end = _calculate_window_bounds(current_line, total_lines, WINDOW)
+    if start == 1:
+        expected += "(this is the beginning of the file)\n"
+    else:
+        expected += f"({start - 1} more lines above)\n"
+    for i in range(start, end + 1):
+        expected += f"{i}|Line `{i}`\n"
+    if end == total_lines:
+        expected += "(this is the end of the file)\n"
+    else:
+        expected += f"({total_lines - end} more lines below)"
+    assert result.split("\n") == expected.split("\n")
+
+
+EXPECTED_CONTENT_AFTER_REPLACE_TEXT = """
+# this is line one
+def test_function_for_fm():
+    "some docstring"
+    a = 1
+    b = 9
+    c = 3
+    # this is the 7th line
+""".strip()
+
+
+def test_edit_file_by_replace(temp_py_file):
+    editor = Editor()
+    editor.edit_file_by_replace(file_name=str(temp_py_file), to_replace="    b = 2", new_content="    b = 9")
+    with open(temp_py_file, "r") as f:
+        new_content = f.read()
+    assert new_content.strip() == EXPECTED_CONTENT_AFTER_REPLACE_TEXT.strip()
+
+
+def test_append_file(temp_file_path):
+    editor = Editor()
+    # write the initial content
+    initial_content = "Line 1\nLine 2\nLine 3\n"
+    temp_file_path.write_text(initial_content)
+
+    # append content to the file
+    append_content = "Line 4\nLine 5\n"
+
+    result = editor.append_file(str(temp_file_path), append_content)
+
+    # expected file content
+    expected_content = initial_content + append_content
+
+    # read the file back and assert it matches the expectation
+    with open(temp_file_path, "r") as f:
+        new_content = f.read()
+    assert new_content == expected_content
+
+    # expected return value
+    expected_output = (
+        f"[File: {temp_file_path.resolve()} (5 lines total after edit)]\n"
+        "(this is the beginning of the file)\n"
+
"1|Line 1\n" + "2|Line 2\n" + "3|Line 3\n" + "4|Line 4\n" + "5|Line 5\n" + "(this is the end of the file)\n" + "[File updated (edited at line 3). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]" + ) + + assert result.split("\n") == expected_output.split("\n") + + +def test_search_dir(tmp_path): + editor = Editor() + dir_path = tmp_path / "test_dir" + dir_path.mkdir() + + # Create some files with specific content + (dir_path / "file1.txt").write_text("This is a test file with some content.") + (dir_path / "file2.txt").write_text("Another file with different content.") + sub_dir = dir_path / "sub_dir" + sub_dir.mkdir() + (sub_dir / "file3.txt").write_text("This file is inside a sub directory with some content.") + + search_term = "some content" + + result = editor.search_dir(search_term, str(dir_path)) + + assert "file1.txt" in result + assert "file3.txt" in result + assert "Another file with different content." not in result + + +def test_search_file(temp_file_path): + editor = Editor() + file_path = temp_file_path + file_path.write_text("This is a test file with some content.\nAnother line with more content.") + + search_term = "some content" + + result = editor.search_file(search_term, str(file_path)) + + assert "Line 1: This is a test file with some content." in result + assert "Line 2: Another line with more content." not in result + + +def test_find_file(tmp_path): + editor = Editor() + dir_path = tmp_path / "test_dir" + dir_path.mkdir() + + # Create some files with specific names + (dir_path / "file1.txt").write_text("Content of file 1.") + (dir_path / "file2.txt").write_text("Content of file 2.") + sub_dir = dir_path / "sub_dir" + sub_dir.mkdir() + (sub_dir / "file3.txt").write_text("Content of file 3.") + + file_name = "file1.txt" + + result = editor.find_file(file_name, str(dir_path)) + + assert "file1.txt" in result + assert "file2.txt" not in result + assert "file3.txt" not in result + + +# Test data for _append_impl method +TEST_LINES = ["First line\n", "Second line\n", "Third line\n"] + +NEW_CONTENT = "Appended line\n" + +EXPECTED_APPEND_NON_EMPTY_FILE = ["First line\n", "Second line\n", "Third line\n", "Appended line\n"] + +EXPECTED_APPEND_EMPTY_FILE = ["Appended line\n"] + + +def test_append_non_empty_file(): + editor = Editor() + lines = TEST_LINES.copy() + content, n_added_lines = editor._append_impl(lines, NEW_CONTENT) + + assert content.splitlines(keepends=True) == EXPECTED_APPEND_NON_EMPTY_FILE + assert n_added_lines == 1 + + +def test_append_empty_file(): + editor = Editor() + lines = [] + content, n_added_lines = editor._append_impl(lines, NEW_CONTENT) + + assert content.splitlines(keepends=True) == EXPECTED_APPEND_EMPTY_FILE + assert n_added_lines == 1 + + +def test_append_to_single_empty_line_file(): + editor = Editor() + lines = [""] + content, n_added_lines = editor._append_impl(lines, NEW_CONTENT) + + assert content.splitlines(keepends=True) == EXPECTED_APPEND_EMPTY_FILE + assert n_added_lines == 1 + + if __name__ == "__main__": pytest.main([__file__, "-s"]) diff --git a/tests/metagpt/tools/libs/test_index_repo.py b/tests/metagpt/tools/libs/test_index_repo.py new file mode 100644 index 000000000..3cc8ad406 --- /dev/null +++ b/tests/metagpt/tools/libs/test_index_repo.py @@ -0,0 +1,32 @@ +import shutil + +import pytest + +from metagpt.const import DEFAULT_WORKSPACE_ROOT, TEST_DATA_PATH +from metagpt.tools.libs.index_repo import IndexRepo + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize(("path", "query"), [(TEST_DATA_PATH / "requirements", "业务线")]) +async def test_index_repo(path, query): + index_path = DEFAULT_WORKSPACE_ROOT / ".index" + repo = IndexRepo(persist_path=str(index_path), root_path=str(path), min_token_count=0) + await repo.add([path]) + await repo.add([path]) + assert index_path.exists() + + rsp = await repo.search(query) + assert rsp + + repo2 = IndexRepo(persist_path=str(index_path), root_path=str(path), min_token_count=0) + rsp2 = await repo2.search(query) + assert rsp2 + + merged_rsp = await repo.merge(query=query, indices_list=[rsp, rsp2]) + assert merged_rsp + + shutil.rmtree(index_path) + + +if __name__ == "__main__": + pytest.main([__file__, "-s"])
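
Reviewer note on `Editor.edit_file_by_replace` (metagpt/tools/libs/editor.py above): when no exact match is found, the method falls back to a whitespace-insensitive search that strips every whitespace character except newlines from both the file and `to_replace`, so the surviving newlines still map the match back to line numbers. A minimal, self-contained sketch of that fallback; `fuzzy_find_line_range` is a hypothetical helper name, not part of the patch:

```python
import re


def fuzzy_find_line_range(file_content: str, to_replace: str):
    """Whitespace-insensitive lookup, mirroring the fallback path in the patch.

    Returns the 1-based (start_line, end_line) of the match, or None.
    """

    def _fuzzy(s: str) -> str:
        # drop every whitespace character except newlines
        return re.sub(r"[^\S\n]+", "", s)

    haystack = _fuzzy(file_content)
    start = haystack.find(_fuzzy(to_replace))
    if start == -1:
        return None
    # newlines survive the transform, so counting them still yields line numbers
    start_line = haystack[:start].count("\n") + 1
    return start_line, start_line + len(to_replace.splitlines()) - 1


content = "def f():\n    a = 1\n    b = 2\n"
# indentation differs from the file, yet the fuzzy pass still locates lines 2-3
assert fuzzy_find_line_range(content, "a = 1\nb = 2") == (2, 3)
```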
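Reviewer note on `IndexRepo` (metagpt/tools/libs/index_repo.py above): `add` re-embeds a file only when its SHA-256 fingerprint differs from the one persisted in `fingerprint.json`, so unchanged documents never trigger reindexing. A rough sketch of that gate, reusing the same hashing scheme as `metagpt.utils.common.generate_fingerprint`; `files_needing_reindex` is illustrative only and omits the `min_token_count`/`max_token_count` check the real `add` also applies:

```python
import hashlib
import json
from pathlib import Path
from typing import Iterable, List


def generate_fingerprint(text: str) -> str:
    # same scheme as metagpt.utils.common.generate_fingerprint: SHA-256 hex digest
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def files_needing_reindex(paths: Iterable[Path], fingerprint_file: Path) -> List[Path]:
    # hypothetical helper: keep only files whose stored fingerprint is missing or stale
    known = json.loads(fingerprint_file.read_text()) if fingerprint_file.exists() else {}
    return [p for p in paths if known.get(str(p)) != generate_fingerprint(p.read_text())]
```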
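Reviewer note on `IndexRepo.merge`: candidates from several indices are flattened, each candidate text is embedded, and similarity against the query embedding picks the final top-`recall_count` ordering. A toy re-ranking with plain lists standing in for `BaseEmbedding` vectors, assuming the cosine similarity that llama_index embeddings default to; `rerank` is a hypothetical name:

```python
from math import sqrt
from typing import List, Sequence, Tuple


def cosine(a: Sequence[float], b: Sequence[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (sqrt(sum(x * x for x in a)) * sqrt(sum(y * y for y in b)))


def rerank(
    query_vec: Sequence[float],
    candidates: List[Tuple[Sequence[float], str]],
    recall_count: int = 5,
) -> List[str]:
    # candidates: (embedding, payload) pairs gathered from every index
    scored = sorted(candidates, key=lambda c: cosine(query_vec, c[0]), reverse=True)
    return [payload for _, payload in scored[:recall_count]]


# the closer vector wins the top slot
assert rerank([1.0, 0.0], [([0.0, 1.0], "b"), ([0.9, 0.1], "a")]) == ["a", "b"]
```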