diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py
index cf806a986..c6e504b9e 100644
--- a/metagpt/actions/write_analysis_code.py
+++ b/metagpt/actions/write_analysis_code.py
@@ -22,7 +22,8 @@ from metagpt.prompts.ml_engineer import (
     TOOL_USAGE_PROMPT,
 )
 from metagpt.schema import Message, Plan
-from metagpt.tools.tool_registry import TOOL_REGISTRY
+from metagpt.tools import TOOL_REGISTRY
+from metagpt.tools.tool_registry import validate_tool_names
 from metagpt.utils.common import create_func_config, remove_comments
 
@@ -90,30 +91,29 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode):
 
 class WriteCodeWithTools(BaseWriteAnalysisCode):
     """Write code with help of local available tools. Choose tools first, then generate code to use the tools"""
 
-    available_tools: dict = {}
+    # selected tools to choose from, listed by their names. An empty list means selection from all tools.
+    selected_tools: list[str] = []
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def _parse_recommend_tools(self, recommend_tools: list) -> dict:
+    def _get_tools_by_type(self, tool_type: str) -> dict:
         """
-        Parses and validates a list of recommended tools, and retrieves their schema from registry.
+        Retrieve tools by tool type from the registry, filtered by the pre-selected tool list
 
         Args:
-            recommend_tools (list): A list of recommended tools.
+            tool_type (str): Tool type to retrieve from the registry
 
         Returns:
-            dict: A dict of valid tool schemas.
+            dict: A dict of tool name to Tool object, representing available tools under the type
         """
-        valid_tools = []
-        for tool_name in recommend_tools:
-            if TOOL_REGISTRY.has_tool(tool_name):
-                valid_tools.append(TOOL_REGISTRY.get_tool(tool_name))
+        candidate_tools = TOOL_REGISTRY.get_tools_by_type(tool_type)
+        if self.selected_tools:
+            candidate_tools = {
+                tool_name: candidate_tools[tool_name]
+                for tool_name in self.selected_tools
+                if tool_name in candidate_tools
+            }
+        return candidate_tools
 
-        tool_catalog = {tool.name: tool.schemas for tool in valid_tools}
-        return tool_catalog
-
-    async def _tool_recommendation(
+    async def _recommend_tool(
         self,
         task: str,
         code_steps: str,
@@ -128,7 +128,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
             available_tools (dict): the available tools description
 
         Returns:
-            list: recommended tools for the specified task
+            dict: schemas of recommended tools for the specified task
         """
         prompt = TOOL_RECOMMENDATION_PROMPT.format(
             current_task=task,
@@ -138,42 +138,62 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
         tool_config = create_func_config(SELECT_FUNCTION_TOOLS)
         rsp = await self.llm.aask_code(prompt, **tool_config)
         recommend_tools = rsp["recommend_tools"]
-        return recommend_tools
+        logger.info(f"Recommended tools: \n{recommend_tools}")
+
+        # Parse and validate the recommended tools, since the LLM might hallucinate and recommend non-existent tools
+        valid_tools = validate_tool_names(recommend_tools, return_tool_object=True)
+
+        tool_schemas = {tool.name: tool.schemas for tool in valid_tools}
+
+        return tool_schemas
+
+    async def _prepare_tools(self, plan: Plan) -> Tuple[dict, str]:
+        """Prepare tool schemas and usage instructions according to the current task
+
+        Args:
+            plan (Plan): The overall plan containing task information.
+
+        Returns:
+            Tuple[dict, str]: Tool schemas ({tool_name: tool_schema_dict}) and a usage prompt for the selected tool type
+        """
+        # find tool type from task type through exact match, can extend to retrieval in the future
+        tool_type = plan.current_task.task_type
+
+        # prepare tool-type-specific instruction
+        tool_type_usage_prompt = (
+            TOOL_REGISTRY.get_tool_type(tool_type).usage_prompt if TOOL_REGISTRY.has_tool_type(tool_type) else ""
+        )
+
+        # prepare schemas of available tools
+        tool_schemas = {}
+        available_tools = self._get_tools_by_type(tool_type)
+        if available_tools:
+            available_tools = {tool_name: tool.schemas["description"] for tool_name, tool in available_tools.items()}
+            code_steps = plan.current_task.code_steps
+            tool_schemas = await self._recommend_tool(plan.current_task.instruction, code_steps, available_tools)
+
+        return tool_schemas, tool_type_usage_prompt
 
     async def run(
         self,
         context: List[Message],
-        plan: Plan = None,
+        plan: Plan,
         **kwargs,
     ) -> str:
-        tool_type = (
-            plan.current_task.task_type
-        )  # find tool type from task type through exact match, can extend to retrieval in the future
-        available_tools = TOOL_REGISTRY.get_tools_by_type(tool_type)
-        special_prompt = (
-            TOOL_REGISTRY.get_tool_type(tool_type).usage_prompt if TOOL_REGISTRY.has_tool_type(tool_type) else ""
+        # prepare tool schemas and tool-type-specific instruction
+        tool_schemas, tool_type_usage_prompt = await self._prepare_tools(plan=plan)
+
+        # form a complete tool usage instruction and include it as a message in the context
+        tools_instruction = TOOL_USAGE_PROMPT.format(
+            tool_schemas=tool_schemas, tool_type_usage_prompt=tool_type_usage_prompt
         )
-        code_steps = plan.current_task.code_steps
-
-        tool_catalog = {}
-
-        if available_tools:
-            available_tools = {tool_name: tool.schemas["description"] for tool_name, tool in available_tools.items()}
-
-            recommend_tools = await self._tool_recommendation(
-                plan.current_task.instruction, code_steps, available_tools
-            )
-            tool_catalog = self._parse_recommend_tools(recommend_tools)
-            logger.info(f"Recommended tools: \n{recommend_tools}")
-
-        tools_instruction = TOOL_USAGE_PROMPT.format(special_prompt=special_prompt, tool_catalog=tool_catalog)
         context.append(Message(content=tools_instruction, role="user"))
 
+        # prepare prompt & LLM call
         prompt = self.process_msg(context)
-
         tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS)
         rsp = await self.llm.aask_code(prompt, **tool_config)
+
         return rsp
 
@@ -185,36 +205,25 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
         column_info: str = "",
         **kwargs,
     ) -> Tuple[List[Message], str]:
-        tool_type = (
-            plan.current_task.task_type
-        )  # find tool type from task type through exact match, can extend to retrieval in the future
-        available_tools = TOOL_REGISTRY.get_tools_by_type(tool_type)
-        special_prompt = (
-            TOOL_REGISTRY.get_tool_type(tool_type).usage_prompt if TOOL_REGISTRY.has_tool_type(tool_type) else ""
-        )
-        code_steps = plan.current_task.code_steps
+        # prepare tool schemas and tool-type-specific instruction
+        tool_schemas, tool_type_usage_prompt = await self._prepare_tools(plan=plan)
 
+        # ML-specific variables to be used in the prompt
+        code_steps = plan.current_task.code_steps
         finished_tasks = plan.get_finished_tasks()
         code_context = [remove_comments(task.code) for task in finished_tasks]
         code_context = "\n\n".join(code_context)
 
-        if available_tools:
-            available_tools = {tool_name: tool.schemas["description"] for tool_name, tool in available_tools.items()}
-
-            recommend_tools = await
self._tool_recommendation( - plan.current_task.instruction, code_steps, available_tools - ) - tool_catalog = self._parse_recommend_tools(recommend_tools) - logger.info(f"Recommended tools: \n{recommend_tools}") - + # prepare prompt depending on tool availability & LLM call + if tool_schemas: prompt = ML_TOOL_USAGE_PROMPT.format( user_requirement=plan.goal, history_code=code_context, current_task=plan.current_task.instruction, column_info=column_info, - special_prompt=special_prompt, + tool_type_usage_prompt=tool_type_usage_prompt, code_steps=code_steps, - tool_catalog=tool_catalog, + tool_schemas=tool_schemas, ) else: @@ -223,13 +232,15 @@ class WriteCodeWithToolsML(WriteCodeWithTools): history_code=code_context, current_task=plan.current_task.instruction, column_info=column_info, - special_prompt=special_prompt, + tool_type_usage_prompt=tool_type_usage_prompt, code_steps=code_steps, ) - tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS) rsp = await self.llm.aask_code(prompt, **tool_config) + + # Extra output to be used for potential debugging context = [Message(content=prompt, role="user")] + return context, rsp diff --git a/metagpt/prompts/ml_engineer.py b/metagpt/prompts/ml_engineer.py index 3fd895e6e..ac95e14bd 100644 --- a/metagpt/prompts/ml_engineer.py +++ b/metagpt/prompts/ml_engineer.py @@ -161,7 +161,7 @@ Latest data info after previous tasks: # Task Write complete code for 'Current Task'. And avoid duplicating code from 'Done Tasks', such as repeated import of packages, reading data, etc. -Specifically, {special_prompt} +Specifically, {tool_type_usage_prompt} # Code Steps: Strictly follow steps below when you writing code if it's convenient. @@ -192,7 +192,7 @@ model.fit(train, y_train) TOOL_USAGE_PROMPT = """ # Instruction Write complete code for 'Current Task'. And avoid duplicating code from finished tasks, such as repeated import of packages, reading data, etc. -Specifically, {special_prompt} +Specifically, {tool_type_usage_prompt} # Capabilities - You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python Class. @@ -200,7 +200,7 @@ Specifically, {special_prompt} # Available Tools (can be empty): Each Class tool is described in JSON format. When you call a tool, import the tool first. -{tool_catalog} +{tool_schemas} # Constraints: - Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed. @@ -225,7 +225,7 @@ Latest data info after previous tasks: # Task Write complete code for 'Current Task'. And avoid duplicating code from 'Done Tasks', such as repeated import of packages, reading data, etc. -Specifically, {special_prompt} +Specifically, {tool_type_usage_prompt} # Code Steps: Strictly follow steps below when you writing code if it's convenient. @@ -237,7 +237,7 @@ Strictly follow steps below when you writing code if it's convenient. # Available Tools: Each Class tool is described in JSON format. When you call a tool, import the tool from its path first. 
-{tool_catalog} +{tool_schemas} # Output Example: when current task is "do data preprocess, like fill missing value, handle outliers, etc.", and their are two steps in 'Code Steps', the code be like: diff --git a/metagpt/roles/code_interpreter.py b/metagpt/roles/code_interpreter.py index f972e72e2..11ede6068 100644 --- a/metagpt/roles/code_interpreter.py +++ b/metagpt/roles/code_interpreter.py @@ -19,6 +19,7 @@ class CodeInterpreter(Role): make_udfs: bool = False # whether to save user-defined functions use_code_steps: bool = False execute_code: ExecutePyCode = Field(default_factory=ExecutePyCode, exclude=True) + tools: list[str] = [] def __init__( self, @@ -27,13 +28,20 @@ class CodeInterpreter(Role): goal="", auto_run=True, use_tools=False, - make_udfs=False, + tools=[], **kwargs, ): super().__init__( - name=name, profile=profile, goal=goal, auto_run=auto_run, use_tools=use_tools, make_udfs=make_udfs, **kwargs + name=name, profile=profile, goal=goal, auto_run=auto_run, use_tools=use_tools, tools=tools, **kwargs ) self._set_react_mode(react_mode="plan_and_act", auto_run=auto_run, use_tools=use_tools) + if use_tools and tools: + from metagpt.tools.tool_registry import ( + validate_tool_names, # import upon use + ) + + self.tools = validate_tool_names(tools) + logger.info(f"will only use {self.tools} as tools") @property def working_memory(self): @@ -92,7 +100,7 @@ class CodeInterpreter(Role): return code["code"], result, success async def _write_code(self): - todo = WriteCodeByGenerate() if not self.use_tools else WriteCodeWithTools() + todo = WriteCodeByGenerate() if not self.use_tools else WriteCodeWithTools(selected_tools=self.tools) logger.info(f"ready to {todo.name}") context = self.planner.get_useful_memories() diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py index 6b671f9c2..d1a22b9d3 100644 --- a/metagpt/roles/ml_engineer.py +++ b/metagpt/roles/ml_engineer.py @@ -27,7 +27,7 @@ class MLEngineer(CodeInterpreter): column_info = await self._update_data_columns() logger.info("Write code with tools") - tool_context, code = await WriteCodeWithToolsML().run( + tool_context, code = await WriteCodeWithToolsML(selected_tools=self.tools).run( context=[], # context assembled inside the Action plan=self.planner.plan, column_info=column_info, diff --git a/metagpt/roles/role.py b/metagpt/roles/role.py index a2f2f2e9d..21e48a127 100644 --- a/metagpt/roles/role.py +++ b/metagpt/roles/role.py @@ -477,7 +477,7 @@ class Role(SerializationMixin, is_polymorphic_base=True): else: # update plan according to user's feedback and to take on changed tasks - await self.planner.update_plan(review) + await self.planner.update_plan() completed_plan_memory = self.planner.get_useful_memories() # completed plan as a outcome diff --git a/metagpt/tools/libs/__init__.py b/metagpt/tools/libs/__init__.py index 442f57149..c9767c1e5 100644 --- a/metagpt/tools/libs/__init__.py +++ b/metagpt/tools/libs/__init__.py @@ -9,7 +9,7 @@ from metagpt.tools.libs import ( feature_engineering, sd_engine, gpt_v_generator, - web_scrapping, + web_scraping, ) -_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scrapping # Avoid pre-commit error +_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scraping # Avoid pre-commit error diff --git a/metagpt/tools/libs/data_preprocess.py b/metagpt/tools/libs/data_preprocess.py index 3891f9df0..0480e71a7 100644 --- a/metagpt/tools/libs/data_preprocess.py +++ b/metagpt/tools/libs/data_preprocess.py @@ -26,31 +26,64 @@ class 
MLProcess(object): def transform(self, df): raise NotImplementedError - def fit_transform(self, df): + def fit_transform(self, df) -> pd.DataFrame: + """ + Fit and transform the input DataFrame. + + Args: + df (pd.DataFrame): The input DataFrame. + + Returns: + pd.DataFrame: The transformed DataFrame. + """ self.fit(df) return self.transform(df) @register_tool(tool_type=TOOL_TYPE) class FillMissingValue(MLProcess): - def __init__( - self, - features: list, - strategy: str = "mean", - fill_value=None, - ): + """ + Completing missing values with simple strategies. + """ + + def __init__(self, features: list, strategy: str = "mean", fill_value=None): + """ + Initialize self. + + Args: + features (list): Columns to be processed. + strategy (str, optional): The imputation strategy, notice 'mean' and 'median' can only + be used for numeric features. Enum: ['mean', 'median', 'most_frequent', 'constant']. Defaults to 'mean'. + fill_value (int, optional): Fill_value is used to replace all occurrences of missing_values. + Defaults to None. + """ self.features = features self.strategy = strategy self.fill_value = fill_value self.si = None def fit(self, df: pd.DataFrame): + """ + Fit the FillMissingValue model. + + Args: + df (pd.DataFrame): The input DataFrame. + """ if len(self.features) == 0: return self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value) self.si.fit(df[self.features]) - def transform(self, df: pd.DataFrame): + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input DataFrame with the fitted model. + + Args: + df (pd.DataFrame): The input DataFrame. + + Returns: + pd.DataFrame: The transformed DataFrame. + """ if len(self.features) == 0: return df new_df = df.copy() @@ -60,18 +93,40 @@ class FillMissingValue(MLProcess): @register_tool(tool_type=TOOL_TYPE) class MinMaxScale(MLProcess): - def __init__( - self, - features: list, - ): + """ + Transform features by scaling each feature to a range, which is (0, 1). + """ + + def __init__(self, features: list): + """ + Initialize self. + + Args: + features (list): Columns to be processed. + """ self.features = features self.mms = None def fit(self, df: pd.DataFrame): + """ + Fit the MinMaxScale model. + + Args: + df (pd.DataFrame): The input DataFrame. + """ self.mms = MinMaxScaler() self.mms.fit(df[self.features]) - def transform(self, df: pd.DataFrame): + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input DataFrame with the fitted model. + + Args: + df (pd.DataFrame): The input DataFrame. + + Returns: + pd.DataFrame: The transformed DataFrame. + """ new_df = df.copy() new_df[self.features] = self.mms.transform(new_df[self.features]) return new_df @@ -79,18 +134,40 @@ class MinMaxScale(MLProcess): @register_tool(tool_type=TOOL_TYPE) class StandardScale(MLProcess): - def __init__( - self, - features: list, - ): + """ + Standardize features by removing the mean and scaling to unit variance. + """ + + def __init__(self, features: list): + """ + Initialize self. + + Args: + features (list): Columns to be processed. + """ self.features = features self.ss = None def fit(self, df: pd.DataFrame): + """ + Fit the StandardScale model. + + Args: + df (pd.DataFrame): The input DataFrame. + """ self.ss = StandardScaler() self.ss.fit(df[self.features]) - def transform(self, df: pd.DataFrame): + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input DataFrame with the fitted model. + + Args: + df (pd.DataFrame): The input DataFrame. 
+ + Returns: + pd.DataFrame: The transformed DataFrame. + """ new_df = df.copy() new_df[self.features] = self.ss.transform(new_df[self.features]) return new_df @@ -98,18 +175,40 @@ class StandardScale(MLProcess): @register_tool(tool_type=TOOL_TYPE) class MaxAbsScale(MLProcess): - def __init__( - self, - features: list, - ): + """ + Scale each feature by its maximum absolute value. + """ + + def __init__(self, features: list): + """ + Initialize self. + + Args: + features (list): Columns to be processed. + """ self.features = features self.mas = None def fit(self, df: pd.DataFrame): + """ + Fit the MaxAbsScale model. + + Args: + df (pd.DataFrame): The input DataFrame. + """ self.mas = MaxAbsScaler() self.mas.fit(df[self.features]) - def transform(self, df: pd.DataFrame): + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input DataFrame with the fitted model. + + Args: + df (pd.DataFrame): The input DataFrame. + + Returns: + pd.DataFrame: The transformed DataFrame. + """ new_df = df.copy() new_df[self.features] = self.mas.transform(new_df[self.features]) return new_df @@ -117,18 +216,40 @@ class MaxAbsScale(MLProcess): @register_tool(tool_type=TOOL_TYPE) class RobustScale(MLProcess): - def __init__( - self, - features: list, - ): + """ + Apply the RobustScaler to scale features using statistics that are robust to outliers. + """ + + def __init__(self, features: list): + """ + Initialize the RobustScale instance with feature names. + + Args: + features (list): List of feature names to be scaled. + """ self.features = features self.rs = None def fit(self, df: pd.DataFrame): + """ + Compute the median and IQR for scaling. + + Args: + df (pd.DataFrame): Dataframe containing the features. + """ self.rs = RobustScaler() self.rs.fit(df[self.features]) def transform(self, df: pd.DataFrame): + """ + Scale features using the previously computed median and IQR. + + Args: + df (pd.DataFrame): Dataframe containing the features to be scaled. + + Returns: + pd.DataFrame: A new dataframe with scaled features. + """ new_df = df.copy() new_df[self.features] = self.rs.transform(new_df[self.features]) return new_df @@ -136,18 +257,40 @@ class RobustScale(MLProcess): @register_tool(tool_type=TOOL_TYPE) class OrdinalEncode(MLProcess): - def __init__( - self, - features: list, - ): + """ + Encode categorical features as ordinal integers. + """ + + def __init__(self, features: list): + """ + Initialize the OrdinalEncode instance with feature names. + + Args: + features (list): List of categorical feature names to be encoded. + """ self.features = features self.oe = None def fit(self, df: pd.DataFrame): + """ + Learn the ordinal encodings for the features. + + Args: + df (pd.DataFrame): Dataframe containing the categorical features. + """ self.oe = OrdinalEncoder() self.oe.fit(df[self.features]) def transform(self, df: pd.DataFrame): + """ + Convert the categorical features to ordinal integers. + + Args: + df (pd.DataFrame): Dataframe containing the categorical features to be encoded. + + Returns: + pd.DataFrame: A new dataframe with the encoded features. + """ new_df = df.copy() new_df[self.features] = self.oe.transform(new_df[self.features]) return new_df @@ -155,18 +298,40 @@ class OrdinalEncode(MLProcess): @register_tool(tool_type=TOOL_TYPE) class OneHotEncode(MLProcess): - def __init__( - self, - features: list, - ): + """ + Apply one-hot encoding to specified categorical columns, the original columns will be dropped. 
+ """ + + def __init__(self, features: list): + """ + Initialize self. + + Args: + features (list): Categorical columns to be one-hot encoded and dropped. + """ self.features = features self.ohe = None def fit(self, df: pd.DataFrame): + """ + Fit the OneHotEncoding model. + + Args: + df (pd.DataFrame): The input DataFrame. + """ self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False) self.ohe.fit(df[self.features]) - def transform(self, df: pd.DataFrame): + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input DataFrame with the fitted model. + + Args: + df (pd.DataFrame): The input DataFrame. + + Returns: + pd.DataFrame: The transformed DataFrame. + """ ts_data = self.ohe.transform(df[self.features]) new_columns = self.ohe.get_feature_names_out(self.features) ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index) @@ -177,21 +342,43 @@ class OneHotEncode(MLProcess): @register_tool(tool_type=TOOL_TYPE) class LabelEncode(MLProcess): - def __init__( - self, - features: list, - ): + """ + Apply label encoding to specified categorical columns in-place. + """ + + def __init__(self, features: list): + """ + Initialize self. + + Args: + features (list): Categorical columns to be label encoded. + """ self.features = features self.le_encoders = [] def fit(self, df: pd.DataFrame): + """ + Fit the LabelEncode model. + + Args: + df (pd.DataFrame): The input DataFrame. + """ if len(self.features) == 0: return for col in self.features: le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ["unknown"]) self.le_encoders.append(le) - def transform(self, df: pd.DataFrame): + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input DataFrame with the fitted model. + + Args: + df (pd.DataFrame): The input DataFrame. + + Returns: + pd.DataFrame: The transformed DataFrame. + """ if len(self.features) == 0: return df new_df = df.copy() @@ -204,8 +391,17 @@ class LabelEncode(MLProcess): return new_df -@register_tool(tool_type=TOOL_TYPE) def get_column_info(df: pd.DataFrame) -> dict: + """ + Analyzes a DataFrame and categorizes its columns based on data types. + + Args: + df (pd.DataFrame): The DataFrame to be analyzed. + + Returns: + dict: A dictionary with four keys ('Category', 'Numeric', 'Datetime', 'Others'). + Each key corresponds to a list of column names belonging to that category. 
+ """ column_info = { "Category": [], "Numeric": [], diff --git a/metagpt/tools/libs/feature_engineering.py b/metagpt/tools/libs/feature_engineering.py index 308150f9b..79e1c1b07 100644 --- a/metagpt/tools/libs/feature_engineering.py +++ b/metagpt/tools/libs/feature_engineering.py @@ -184,7 +184,7 @@ class SplitBins(MLProcess): return new_df -@register_tool(tool_type=TOOL_TYPE) +# @register_tool(tool_type=TOOL_TYPE) class ExtractTimeComps(MLProcess): def __init__(self, time_col: str, time_comps: list): self.time_col = time_col @@ -242,6 +242,7 @@ class GeneralSelection(MLProcess): # skip for now because lgb is needed +# @register_tool(tool_type=TOOL_TYPE) class TreeBasedSelection(MLProcess): def __init__(self, label_col: str, task_type: str): self.label_col = label_col diff --git a/metagpt/tools/libs/web_scrapping.py b/metagpt/tools/libs/web_scraping.py similarity index 100% rename from metagpt/tools/libs/web_scrapping.py rename to metagpt/tools/libs/web_scraping.py diff --git a/metagpt/tools/schemas/data_preprocess/OrdinalEncode.yml b/metagpt/tools/schemas/data_preprocess/OrdinalEncode.yml new file mode 100644 index 000000000..79ebaf37c --- /dev/null +++ b/metagpt/tools/schemas/data_preprocess/OrdinalEncode.yml @@ -0,0 +1,46 @@ +OrdinalEncode: + type: class + description: Encode categorical features as ordinal integers. + methods: + __init__: + description: 'Initialize the OrdinalEncode instance with feature names. ' + parameters: + properties: + features: + type: list + description: List of categorical feature names to be encoded. + required: + - features + fit: + description: 'Learn the ordinal encodings for the features. ' + parameters: + properties: + df: + type: pd.DataFrame + description: Dataframe containing the categorical features. + required: + - df + fit_transform: + description: 'Fit and transform the input DataFrame. ' + parameters: + properties: + df: + type: pd.DataFrame + description: The input DataFrame. + required: + - df + returns: + - type: pd.DataFrame + description: The transformed DataFrame. + transform: + description: 'Convert the categorical features to ordinal integers. ' + parameters: + properties: + df: + type: pd.DataFrame + description: Dataframe containing the categorical features to be encoded. + required: + - df + returns: + - type: pd.DataFrame + description: A new dataframe with the encoded features. diff --git a/metagpt/tools/schemas/data_preprocess/RobustScale.yml b/metagpt/tools/schemas/data_preprocess/RobustScale.yml new file mode 100644 index 000000000..6d5dfaf3a --- /dev/null +++ b/metagpt/tools/schemas/data_preprocess/RobustScale.yml @@ -0,0 +1,47 @@ +RobustScale: + type: class + description: Apply the RobustScaler to scale features using statistics that are + robust to outliers. + methods: + __init__: + description: 'Initialize the RobustScale instance with feature names. ' + parameters: + properties: + features: + type: list + description: List of feature names to be scaled. + required: + - features + fit: + description: 'Compute the median and IQR for scaling. ' + parameters: + properties: + df: + type: pd.DataFrame + description: Dataframe containing the features. + required: + - df + fit_transform: + description: 'Fit and transform the input DataFrame. ' + parameters: + properties: + df: + type: pd.DataFrame + description: The input DataFrame. + required: + - df + returns: + - type: pd.DataFrame + description: The transformed DataFrame. + transform: + description: 'Scale features using the previously computed median and IQR. 
'
+      parameters:
+        properties:
+          df:
+            type: pd.DataFrame
+            description: Dataframe containing the features to be scaled.
+        required:
+        - df
+      returns:
+      - type: pd.DataFrame
+        description: A new dataframe with scaled features.
diff --git a/metagpt/tools/schemas/web_scrapping/scrape_web_playwright.yml b/metagpt/tools/schemas/web_scraping/scrape_web_playwright.yml
similarity index 100%
rename from metagpt/tools/schemas/web_scrapping/scrape_web_playwright.yml
rename to metagpt/tools/schemas/web_scraping/scrape_web_playwright.yml
diff --git a/metagpt/tools/tool_convert.py b/metagpt/tools/tool_convert.py
new file mode 100644
index 000000000..b8377e67a
--- /dev/null
+++ b/metagpt/tools/tool_convert.py
@@ -0,0 +1,72 @@
+import inspect
+
+from metagpt.utils.parse_docstring import GoogleDocstringParser, remove_spaces
+
+
+def convert_code_to_tool_schema(obj, include: list[str] = []):
+    docstring = inspect.getdoc(obj)
+    assert docstring, "no docstring found for the object, skip registering"
+
+    if inspect.isclass(obj):
+        schema = {"type": "class", "description": remove_spaces(docstring), "methods": {}}
+        for name, method in inspect.getmembers(obj, inspect.isfunction):
+            if include and name not in include:
+                continue
+            method_doc = inspect.getdoc(method)
+            if method_doc:
+                schema["methods"][name] = docstring_to_schema(method_doc)
+
+    elif inspect.isfunction(obj):
+        schema = {
+            "type": "function",
+            **docstring_to_schema(docstring),
+        }
+
+    schema = {obj.__name__: schema}
+
+    return schema
+
+
+def docstring_to_schema(docstring: str):
+    if docstring is None:
+        return {}
+
+    parser = GoogleDocstringParser(docstring=docstring)
+
+    # parse the description section
+    description = parser.parse_desc()
+
+    # parse the Args section
+    params = parser.parse_params()
+    parameter_schema = {"properties": {}, "required": []}
+    for param in params:
+        param_name, param_type, param_desc = param
+        # check required or optional
+        is_optional, param_type = parser.check_and_parse_optional(param_type)
+        if not is_optional:
+            parameter_schema["required"].append(param_name)
+        # type and desc
+        param_dict = {"type": param_type, "description": remove_spaces(param_desc)}
+        # match Default for optional args
+        has_default_val, default_val = parser.check_and_parse_default_value(param_desc)
+        if has_default_val:
+            param_dict["default"] = default_val
+        # match Enum
+        has_enum, enum_vals = parser.check_and_parse_enum(param_desc)
+        if has_enum:
+            param_dict["enum"] = enum_vals
+        # add to parameter schema
+        parameter_schema["properties"].update({param_name: param_dict})
+
+    # parse the Returns section
+    returns = parser.parse_returns()
+
+    # build the schema dict to be dumped as YAML
+    schema = {
+        "description": description,
+        "parameters": parameter_schema,
+    }
+    if returns:
+        schema["returns"] = [{"type": ret[0], "description": remove_spaces(ret[1])} for ret in returns]
+
+    return schema
diff --git a/metagpt/tools/tool_registry.py b/metagpt/tools/tool_registry.py
index 52ad25ce4..d16defa0a 100644
--- a/metagpt/tools/tool_registry.py
+++ b/metagpt/tools/tool_registry.py
@@ -11,17 +11,18 @@ import re
 from collections import defaultdict
 
 import yaml
+from pydantic import BaseModel
 
 from metagpt.const import TOOL_SCHEMA_PATH
 from metagpt.logs import logger
+from metagpt.tools.tool_convert import convert_code_to_tool_schema
 from metagpt.tools.tool_data_type import Tool, ToolSchema, ToolType
 
-class ToolRegistry:
-    def __init__(self):
-        self.tools = {}
-        self.tool_types = {}
-        self.tools_by_types = defaultdict(dict)  # two-layer k-v, {tool_type: {tool_name: {...}, ...}, ...}
+class ToolRegistry(BaseModel):
+    tools: dict = {}
+    tool_types: dict = {}
+    tools_by_types: dict = defaultdict(dict)  # two-layer k-v, {tool_type: {tool_name: {...}, ...}, ...}
 
     def register_tool_type(self, tool_type: ToolType):
         self.tool_types[tool_type.name] = tool_type
@@ -34,7 +35,9 @@ class ToolRegistry:
         schema_path=None,
         tool_code="",
         tool_type="other",
-        make_schema_if_not_exists=False,
+        tool_source_object=None,
+        include_functions=[],
+        make_schema_if_not_exists=True,
     ):
         if self.has_tool(tool_name):
             return
@@ -44,14 +47,16 @@ class ToolRegistry:
         if not os.path.exists(schema_path):
             if make_schema_if_not_exists:
                 logger.warning(f"no schema found, will make schema at {schema_path}")
-                make_schema(tool_code, schema_path)
+                schema_dict = make_schema(tool_source_object, include_functions, schema_path)
             else:
                 logger.warning(f"no schema found at assumed schema_path {schema_path}, skip registering {tool_name}")
                 return
-
-        with open(schema_path, "r", encoding="utf-8") as f:
-            schema_dict = yaml.safe_load(f)
-        schemas = schema_dict.get(tool_name) or list(schema_dict.values())[0]
+        else:
+            with open(schema_path, "r", encoding="utf-8") as f:
+                schema_dict = yaml.safe_load(f)
+        if not schema_dict:
+            return
+        schemas = schema_dict.get(tool_name) or list(schema_dict.values())[0]
         schemas["tool_path"] = tool_path  # corresponding code file path of the tool
         try:
             ToolSchema(**schemas)  # validation
@@ -65,22 +70,22 @@ class ToolRegistry:
         self.tools_by_types[tool_type][tool_name] = tool
         logger.info(f"{tool_name} registered")
 
-    def has_tool(self, key):
+    def has_tool(self, key: str) -> bool:
         return key in self.tools
 
-    def get_tool(self, key):
+    def get_tool(self, key) -> Tool:
         return self.tools.get(key)
 
-    def get_tools_by_type(self, key):
-        return self.tools_by_types.get(key)
+    def get_tools_by_type(self, key) -> dict[str, Tool]:
+        return self.tools_by_types.get(key, {})
 
-    def has_tool_type(self, key):
+    def has_tool_type(self, key) -> bool:
         return key in self.tool_types
 
-    def get_tool_type(self, key):
+    def get_tool_type(self, key) -> ToolType:
         return self.tool_types.get(key)
 
-    def get_tool_types(self):
+    def get_tool_types(self) -> dict[str, ToolType]:
         return self.tool_types
 
@@ -94,7 +99,7 @@ def register_tool_type(cls):
     return cls
 
-def register_tool(tool_name="", tool_type="other", schema_path=None):
+def register_tool(tool_name="", tool_type="other", schema_path=None, **kwargs):
     """register a tool to registry"""
 
     def decorator(cls, tool_name=tool_name):
@@ -112,15 +117,39 @@ def register_tool(tool_name="", tool_type="other", schema_path=None):
             schema_path=schema_path,
             tool_code=source_code,
             tool_type=tool_type,
+            tool_source_object=cls,
+            **kwargs,
         )
         return cls
 
     return decorator
 
-def make_schema(tool_code, path):
+def make_schema(tool_source_object, include, path):
     os.makedirs(os.path.dirname(path), exist_ok=True)  # Create the necessary directories
-    schema = {}  # an empty schema for now
-    with open(path, "w", encoding="utf-8") as f:
-        yaml.dump(schema, f)
-    return path
+    try:
+        schema = convert_code_to_tool_schema(tool_source_object, include=include)
+        with open(path, "w", encoding="utf-8") as f:
+            yaml.dump(schema, f, sort_keys=False)
+        # import json
+        # with open(str(path).replace("yml", "json"), "w", encoding="utf-8") as f:
+        #     json.dump(schema, f, ensure_ascii=False, indent=4)
+        logger.info(f"schema made at {path}")
+    except Exception as e:
+        schema = {}
+        logger.error(f"Failed to make schema: {e}")
+
+    return schema
+
+
+def validate_tool_names(tools: list[str], return_tool_object=False) -> list[str]:
+    valid_tools = []
+    for tool_name in tools:
+        if not TOOL_REGISTRY.has_tool(tool_name):
+            logger.warning(
+                f"Specified tool {tool_name} not found and was skipped. Check if you have registered it properly"
+            )
+        else:
+            valid_tool = TOOL_REGISTRY.get_tool(tool_name) if return_tool_object else tool_name
+            valid_tools.append(valid_tool)
+    return valid_tools
diff --git a/metagpt/utils/parse_docstring.py b/metagpt/utils/parse_docstring.py
new file mode 100644
index 000000000..8a017e1f7
--- /dev/null
+++ b/metagpt/utils/parse_docstring.py
@@ -0,0 +1,87 @@
+import re
+from typing import Tuple
+
+from pydantic import BaseModel
+
+
+def remove_spaces(text):
+    return re.sub(r"\s+", " ", text)
+
+
+class DocstringParser(BaseModel):
+    docstring: str
+
+    def parse_desc(self) -> str:
+        """Parse and return the description from the docstring."""
+
+    def parse_params(self) -> list[Tuple[str, str, str]]:
+        """Parse and return the parameters from the docstring.
+
+        Returns:
+            list[Tuple[str, str, str]]: A list of input parameter info. Each info is a triple of (param name, param type, param description)
+        """
+
+    def parse_returns(self) -> list[Tuple[str, str]]:
+        """Parse and return the output information from the docstring.
+
+        Returns:
+            list[Tuple[str, str]]: A list of output info. Each info is a tuple of (return type, return description)
+        """
+
+    @staticmethod
+    def check_and_parse_optional(param_type: str) -> Tuple[bool, str]:
+        """Check if a parameter is optional and return a processed param_type with the optionality info stripped if so"""
+
+    @staticmethod
+    def check_and_parse_default_value(param_desc: str) -> Tuple[bool, str]:
+        """Check if a parameter has a default value and return the default value if so"""
+
+    @staticmethod
+    def check_and_parse_enum(param_desc: str) -> Tuple[bool, str]:
+        """Check if a parameter description includes an enum and return enum values if so"""
+
+
+class reSTDocstringParser(DocstringParser):
+    """A parser for reStructuredText (reST) docstrings"""
+
+
+class GoogleDocstringParser(DocstringParser):
+    """A parser for Google-style docstrings"""
+
+    docstring: str
+
+    def parse_desc(self) -> str:
+        description_match = re.search(r"^(.*?)(?:Args:|Returns:|Raises:|$)", self.docstring, re.DOTALL)
+        description = remove_spaces(description_match.group(1)) if description_match else ""
+        return description
+
+    def parse_params(self) -> list[Tuple[str, str, str]]:
+        args_match = re.search(r"Args:\s*(.*?)(?:Returns:|Raises:|$)", self.docstring, re.DOTALL)
+        _args = args_match.group(1).strip() if args_match else ""
+        # variable_pattern = re.compile(r"(\w+)\s*\((.*?)\):\s*(.*)")
+        variable_pattern = re.compile(
+            r"(\w+)\s*\((.*?)\):\s*(.*?)(?=\n\s*\w+\s*\(|\Z)", re.DOTALL
+        )  # (?=\n\w+\s*\(|\Z) is to assert that what follows is either the start of the next parameter (indicated by a newline, some word characters, and an opening parenthesis) or the end of the string (\Z).
+ params = variable_pattern.findall(_args) + return params + + def parse_returns(self) -> list[Tuple[str, str]]: + returns_match = re.search(r"Returns:\s*(.*?)(?:Raises:|$)", self.docstring, re.DOTALL) + returns = returns_match.group(1).strip() if returns_match else "" + return_pattern = re.compile(r"^(.*)\s*:\s*(.*)$") + returns = return_pattern.findall(returns) + return returns + + @staticmethod + def check_and_parse_optional(param_type: str) -> Tuple[bool, str]: + return "optional" in param_type, param_type.replace(", optional", "") + + @staticmethod + def check_and_parse_default_value(param_desc: str) -> Tuple[bool, str]: + default_val = re.search(r"Defaults to (.+?)\.", param_desc) + return (True, default_val.group(1)) if default_val else (False, "") + + @staticmethod + def check_and_parse_enum(param_desc: str) -> Tuple[bool, str]: + enum_val = re.search(r"Enum: \[(.+?)\]", param_desc) + return (True, [e.strip() for e in enum_val.group(1).split(",")]) if enum_val else (False, []) diff --git a/tests/metagpt/roles/run_code_interpreter.py b/tests/metagpt/roles/run_code_interpreter.py index 539b20286..766a25998 100644 --- a/tests/metagpt/roles/run_code_interpreter.py +++ b/tests/metagpt/roles/run_code_interpreter.py @@ -10,7 +10,7 @@ from metagpt.utils.recovery_util import load_history, save_history async def run_code_interpreter( - role_class, requirement, auto_run, use_tools, use_code_steps, make_udfs, use_udfs, save_dir + role_class, requirement, auto_run, use_tools, use_code_steps, make_udfs, use_udfs, save_dir, tools ): """ The main function to run the MLEngineer with optional history loading. @@ -25,7 +25,9 @@ async def run_code_interpreter( """ if role_class == "ci": - role = CodeInterpreter(goal=requirement, auto_run=auto_run, use_tools=use_tools, make_udfs=make_udfs) + role = CodeInterpreter( + goal=requirement, auto_run=auto_run, use_tools=use_tools, make_udfs=make_udfs, tools=tools + ) else: role = MLEngineer( goal=requirement, @@ -33,7 +35,7 @@ async def run_code_interpreter( use_tools=use_tools, use_code_steps=use_code_steps, make_udfs=make_udfs, - use_udfs=use_udfs, + tools=tools, ) if save_dir: @@ -73,6 +75,8 @@ if __name__ == "__main__": use_tools = True make_udfs = False use_udfs = False + tools = [] + # tools = ["FillMissingValue", "CatCross", "non_existing_test"] async def main( role_class: str = role_class, @@ -83,9 +87,10 @@ if __name__ == "__main__": make_udfs: bool = make_udfs, use_udfs: bool = use_udfs, save_dir: str = save_dir, + tools=tools, ): await run_code_interpreter( - role_class, requirement, auto_run, use_tools, use_code_steps, make_udfs, use_udfs, save_dir + role_class, requirement, auto_run, use_tools, use_code_steps, make_udfs, use_udfs, save_dir, tools ) fire.Fire(main) diff --git a/tests/metagpt/tools/test_tool_convert.py b/tests/metagpt/tools/test_tool_convert.py new file mode 100644 index 000000000..1dad997bd --- /dev/null +++ b/tests/metagpt/tools/test_tool_convert.py @@ -0,0 +1,158 @@ +import pandas as pd + +from metagpt.tools.tool_convert import convert_code_to_tool_schema, docstring_to_schema + + +def test_docstring_to_schema(): + docstring = """ + Some test desc. + + Args: + features (list): Columns to be processed. + strategy (str, optional): The imputation strategy, notice 'mean' and 'median' can only be + used for numeric features. Enum: ['mean', 'median', 'most_frequent', 'constant']. Defaults to 'mean'. + fill_value (int, optional): Fill_value is used to replace all occurrences of missing_values. + Defaults to None. 
+ Returns: + pd.DataFrame: The transformed DataFrame. + """ + expected = { + "description": " Some test desc. ", + "parameters": { + "properties": { + "features": {"type": "list", "description": "Columns to be processed."}, + "strategy": { + "type": "str", + "description": "The imputation strategy, notice 'mean' and 'median' can only be used for numeric features. Enum: ['mean', 'median', 'most_frequent', 'constant']. Defaults to 'mean'.", + "default": "'mean'", + "enum": ["'mean'", "'median'", "'most_frequent'", "'constant'"], + }, + "fill_value": { + "type": "int", + "description": "Fill_value is used to replace all occurrences of missing_values. Defaults to None.", + "default": "None", + }, + }, + "required": ["features"], + }, + "returns": [{"type": "pd.DataFrame", "description": "The transformed DataFrame."}], + } + schema = docstring_to_schema(docstring) + assert schema == expected + + +class DummyClass: + """ + Completing missing values with simple strategies. + """ + + def __init__(self, features: list, strategy: str = "mean", fill_value=None): + """ + Initialize self. + + Args: + features (list): Columns to be processed. + strategy (str, optional): The imputation strategy, notice 'mean' and 'median' can only + be used for numeric features. Enum: ['mean', 'median', 'most_frequent', 'constant']. Defaults to 'mean'. + fill_value (int, optional): Fill_value is used to replace all occurrences of missing_values. + Defaults to None. + """ + pass + + def fit(self, df: pd.DataFrame): + """ + Fit the FillMissingValue model. + + Args: + df (pd.DataFrame): The input DataFrame. + """ + pass + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input DataFrame with the fitted model. + + Args: + df (pd.DataFrame): The input DataFrame. + + Returns: + pd.DataFrame: The transformed DataFrame. + """ + pass + + +def dummy_fn(df: pd.DataFrame) -> dict: + """ + Analyzes a DataFrame and categorizes its columns based on data types. + + Args: + df (pd.DataFrame): The DataFrame to be analyzed. + + Returns: + dict: A dictionary with four keys ('Category', 'Numeric', 'Datetime', 'Others'). + Each key corresponds to a list of column names belonging to that category. + """ + pass + + +def test_convert_code_to_tool_schema_class(): + expected = { + "DummyClass": { + "type": "class", + "description": "Completing missing values with simple strategies.", + "methods": { + "__init__": { + "description": "Initialize self. ", + "parameters": { + "properties": { + "features": {"type": "list", "description": "Columns to be processed."}, + "strategy": { + "type": "str", + "description": "The imputation strategy, notice 'mean' and 'median' can only be used for numeric features. Enum: ['mean', 'median', 'most_frequent', 'constant']. Defaults to 'mean'.", + "default": "'mean'", + "enum": ["'mean'", "'median'", "'most_frequent'", "'constant'"], + }, + "fill_value": { + "type": "int", + "description": "Fill_value is used to replace all occurrences of missing_values. Defaults to None.", + "default": "None", + }, + }, + "required": ["features"], + }, + }, + "fit": { + "description": "Fit the FillMissingValue model. ", + "parameters": { + "properties": {"df": {"type": "pd.DataFrame", "description": "The input DataFrame."}}, + "required": ["df"], + }, + }, + "transform": { + "description": "Transform the input DataFrame with the fitted model. 
", + "parameters": { + "properties": {"df": {"type": "pd.DataFrame", "description": "The input DataFrame."}}, + "required": ["df"], + }, + "returns": [{"type": "pd.DataFrame", "description": "The transformed DataFrame."}], + }, + }, + } + } + schema = convert_code_to_tool_schema(DummyClass) + assert schema == expected + + +def test_convert_code_to_tool_schema_function(): + expected = { + "dummy_fn": { + "type": "function", + "description": "Analyzes a DataFrame and categorizes its columns based on data types. ", + "parameters": { + "properties": {"df": {"type": "pd.DataFrame", "description": "The DataFrame to be analyzed."}}, + "required": ["df"], + }, + } + } + schema = convert_code_to_tool_schema(dummy_fn) + assert schema == expected diff --git a/tests/metagpt/tools/test_tool_registry.py b/tests/metagpt/tools/test_tool_registry.py index 582c368a8..c24122e39 100644 --- a/tests/metagpt/tools/test_tool_registry.py +++ b/tests/metagpt/tools/test_tool_registry.py @@ -98,4 +98,4 @@ def test_get_tools_by_type(tool_registry, schema_yaml): # Test case for when the tool type does not exist def test_get_tools_by_nonexistent_type(tool_registry): tools_by_type = tool_registry.get_tools_by_type("NonexistentType") - assert tools_by_type is None + assert not tools_by_type