diff --git a/README.md b/README.md
index a151a1f0f..618f21a4b 100644
--- a/README.md
+++ b/README.md
@@ -184,4 +184,10 @@ ## Citation
 archivePrefix={arXiv},
 primaryClass={cs.AI}
 }
+@article{zhang2024aflow,
+  title={AFlow: Automating Agentic Workflow Generation},
+  author={Zhang, Jiayi and Xiang, Jinyu and Yu, Zhaoyang and Teng, Fengwei and Chen, Xionghui and Chen, Jiaqi and Zhuge, Mingchen and Cheng, Xin and Hong, Sirui and Wang, Jinlin and others},
+  journal={arXiv preprint arXiv:2410.10762},
+  year={2024}
+}
 ```
diff --git a/examples/aflow/README.md b/examples/aflow/README.md
index 55547c618..fb1aea612 100644
--- a/examples/aflow/README.md
+++ b/examples/aflow/README.md
@@ -33,14 +33,22 @@ ## Quick Start
    optimized_path = "path/to/optimized/workflows"  # Path to save optimized workflows, defaults to metagpt/ext/aflow/scripts/optimized
    initial_round = 1  # Starting round number
    max_rounds = 20  # Maximum number of optimization rounds
+   validation_rounds = 5  # Number of validation runs per optimization round
    ```
    - Adjust these parameters according to your specific requirements and dataset
 2. Set up parameters in `config/config2.yaml` (see `examples/aflow/config2.example.yaml` for reference)
 3. Set the operators you want to use in `optimize.py` and in `optimized_path/template/operator.py`, `optimized_path/template/operator.json`. You can reference our implementation to add operators for specific datasets
 4. When you first run, you can download the datasets and initial rounds by setting `download(["datasets", "initial_rounds"])` in `examples/aflow/optimize.py`
 5. (Optional) Add your custom dataset and corresponding evaluation function following the [Custom Datasets](#custom-datasets) section
-6. Run `python examples/aflow/optimize.py` to start the optimization process!
+6. (Optional) If you want to use only a portion of the validation data, set `va_list` in `examples/aflow/evaluator.py`
+7. Run `python examples/aflow/optimize.py` to start the optimization process!
+
+## Reproduce the Results in the Paper
+1. We provide the raw data obtained from our experiments (link), including the workflows and prompts generated in each iteration, as well as their trajectories on the validation set. We also provide the optimal workflow for each dataset and its results on the test set. You can download this data using `metagpt/ext/aflow/data/download_data.py`.
+2. You can reproduce our experimental results directly by running the scripts in `examples/aflow/experiments`.
+
+
 ## Citation
 
 If you use AFlow in your research, please cite our paper:
diff --git a/metagpt/ext/aflow/benchmark/README.md b/metagpt/ext/aflow/benchmark/README.md
new file mode 100644
index 000000000..4a2464fd1
--- /dev/null
+++ b/metagpt/ext/aflow/benchmark/README.md
@@ -0,0 +1,29 @@
+# Custom Evaluation Function via Benchmark Class
+
+## How to Use
+
+To create a benchmark for a new dataset, follow these steps:
+
+1. Create a new Python file, e.g., `my_dataset_benchmark.py`
+2. Import the base class:
+   ```python
+   from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
+   ```
+3. Create a new class that inherits from `BaseBenchmark`:
+   ```python
+   class MyDatasetBenchmark(BaseBenchmark):
+       def __init__(self, name: str, file_path: str, log_path: str):
+           super().__init__(name, file_path, log_path)
+   ```
+4. Implement the required abstract methods:
+   - `evaluate_problem`: Evaluate a single problem
+   - `calculate_score`: Calculate the score for a prediction
+   - `get_result_columns`: Define column names for the results CSV file
+
+5. Override other methods as needed, such as `load_data` or `save_results_to_csv` (a sketch combining these steps follows below)
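+
+As a rough sketch of steps 3-5 (the exact abstract signatures live in `BaseBenchmark` in `benchmark.py`; the ones below are illustrative, and the `question`/`answer` keys are placeholders for your dataset's schema), a custom benchmark might look like:
+
+```python
+from typing import Any, Callable, List, Tuple
+
+from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
+
+
+class MyDatasetBenchmark(BaseBenchmark):
+    async def evaluate_problem(self, problem: dict, graph: Callable) -> Tuple[Any, ...]:
+        # Run the workflow on a single problem, then score its prediction.
+        prediction, cost = await graph(problem["question"])
+        score, _ = self.calculate_score(problem["answer"], prediction)
+        return problem["question"], prediction, problem["answer"], score, cost
+
+    def calculate_score(self, expected_output: Any, prediction: Any) -> Tuple[float, Any]:
+        # Exact match as a stand-in; swap in your dataset's metric (F1, pass rate, ...).
+        return float(str(prediction).strip() == str(expected_output).strip()), prediction
+
+    def get_result_columns(self) -> List[str]:
+        # Column names must match the tuple order returned by evaluate_problem.
+        return ["question", "prediction", "expected_output", "score", "cost"]
+```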
+
+## Example
+
+Refer to the `DROPBenchmark` class in the `drop.py` file for an example of how to implement a benchmark for a specific dataset.
+
+By following these guidelines, you can easily create custom benchmark evaluations for new datasets.
diff --git a/metagpt/ext/aflow/data/download_data.py b/metagpt/ext/aflow/data/download_data.py
index 198ef20c1..5d0642be4 100644
--- a/metagpt/ext/aflow/data/download_data.py
+++ b/metagpt/ext/aflow/data/download_data.py
@@ -64,8 +64,18 @@ datasets_to_download: Dict[str, Dict[str, str]] = {
 }
 
+def is_directory_empty(path: str) -> bool:
+    """Check if the directory is empty"""
+    return len(os.listdir(path)) == 0
+
 
 def download(datasets):
-    """Main function to process all selected datasets."""
+    """Main function to process all selected datasets"""
     for dataset_name in datasets:
         dataset = datasets_to_download[dataset_name]
-        process_dataset(dataset["url"], dataset["filename"], dataset["extract_path"])
+        extract_path = dataset["extract_path"]
+
+        if os.path.exists(extract_path) and not is_directory_empty(extract_path):
+            logger.info(f"Target folder {extract_path} for {dataset_name} is not empty, skipping download and extraction.")
+            continue
+
+        process_dataset(dataset["url"], dataset["filename"], extract_path)
diff --git a/metagpt/ext/aflow/scripts/evaluator.py b/metagpt/ext/aflow/scripts/evaluator.py
index ecc009a5c..34bdcd9fc 100644
--- a/metagpt/ext/aflow/scripts/evaluator.py
+++ b/metagpt/ext/aflow/scripts/evaluator.py
@@ -45,8 +45,10 @@ class Evaluator:
 
         # Use params to configure the graph and benchmark
         configured_graph = await self._configure_graph(dataset, graph, params)
-
-        va_list = [1, 2, 3]  # Use va_list from params, or use default value if not provided
+        if is_test:
+            va_list = None  # For test data, None evaluates the full test set
+        else:
+            va_list = None  # None evaluates all validation data; set va_list (e.g., [1, 2, 3]) to use a subset
         return await benchmark.run_evaluation(configured_graph, va_list)
 
     async def _configure_graph(self, dataset, graph, params: dict):
diff --git a/metagpt/ext/aflow/scripts/operator.py b/metagpt/ext/aflow/scripts/operator.py
index 9d27c7cd1..903a962e0 100644
--- a/metagpt/ext/aflow/scripts/operator.py
+++ b/metagpt/ext/aflow/scripts/operator.py
@@ -100,14 +100,14 @@ class ScEnsemble(Operator):
     def __init__(self, llm: LLM, name: str = "ScEnsemble"):
         super().__init__(llm, name)
 
-    async def __call__(self, solutions: List[str]):
+    async def __call__(self, solutions: List[str], problem: str):
         answer_mapping = {}
         solution_text = ""
         for index, solution in enumerate(solutions):
             answer_mapping[chr(65 + index)] = index
             solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
 
-        prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text)
+        prompt = SC_ENSEMBLE_PROMPT.format(question=problem, solutions=solution_text)
         response = await self._fill_node(ScEnsembleOp, prompt, mode="xml_fill")
 
         answer = response.get("solution_letter", "")
diff --git a/metagpt/ext/aflow/scripts/optimized/MATH/workflows/round_2/__init__.py b/metagpt/ext/aflow/scripts/optimized/MATH/workflows/round_2/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/metagpt/ext/aflow/scripts/optimized/MATH/workflows/round_2/graph.py b/metagpt/ext/aflow/scripts/optimized/MATH/workflows/round_2/graph.py
new file mode 100644
index 000000000..7369cdb78
--- /dev/null
+++ b/metagpt/ext/aflow/scripts/optimized/MATH/workflows/round_2/graph.py
@@ -0,0 +1,30 @@
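+# An AFlow-generated workflow (round 2 on MATH): draft an initial solution,
+# ask the model to revise it, then let ScEnsemble choose between the two
+# candidate answers.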
+from typing import Literal
+import metagpt.ext.aflow.scripts.optimized.MATH.workflows.template.operator as operator
+import metagpt.ext.aflow.scripts.optimized.MATH.workflows.round_2.prompt as prompt_custom
+from metagpt.provider.llm_provider_registry import create_llm_instance
+from metagpt.utils.cost_manager import CostManager
+
+DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"]
+
+class Workflow:
+    def __init__(
+        self,
+        name: str,
+        llm_config,
+        dataset: DatasetType,
+    ) -> None:
+        self.name = name
+        self.dataset = dataset
+        self.llm = create_llm_instance(llm_config)
+        self.llm.cost_manager = CostManager()
+        self.custom = operator.Custom(self.llm)
+        self.sc_ensemble = operator.ScEnsemble(self.llm)
+
+    async def __call__(self, problem: str):
+        """
+        Implementation of the workflow
+        """
+        initial_solution = await self.custom(input=problem, instruction=prompt_custom.INITIAL_SOLUTION_PROMPT)
+        revised_solution = await self.custom(input=problem + f"\nInitial solution: {initial_solution['response']}", instruction=prompt_custom.REVISE_SOLUTION_PROMPT)
+        final_solution = await self.sc_ensemble(solutions=[initial_solution['response'], revised_solution['response']], problem=problem)
+        return final_solution['response'], self.llm.cost_manager.total_cost
diff --git a/metagpt/ext/aflow/scripts/optimized/MATH/workflows/round_2/prompt.py b/metagpt/ext/aflow/scripts/optimized/MATH/workflows/round_2/prompt.py
new file mode 100644
index 000000000..80554beb8
--- /dev/null
+++ b/metagpt/ext/aflow/scripts/optimized/MATH/workflows/round_2/prompt.py
@@ -0,0 +1,17 @@
+INITIAL_SOLUTION_PROMPT = """
+You are a math expert tasked with solving a complex problem. Please provide a step-by-step solution to the given problem, showing all your work and explaining your reasoning clearly. If the problem involves calculations, make sure to include them in your response.
+
+Problem:
+"""
+
+REVISE_SOLUTION_PROMPT = """
+You are a math expert tasked with reviewing and improving a solution to a complex problem. An initial solution has been provided, but it may contain errors or be incomplete. Your task is to carefully review the initial solution, identify any mistakes or areas for improvement, and provide a revised, more accurate solution.
+
+Please follow these steps:
+1. Review the initial solution thoroughly.
+2. Identify any errors or areas that need improvement.
+3. Provide a revised solution, explaining your changes and reasoning.
+4. Ensure your revised solution is complete, accurate, and clearly explained.
+
+Problem:
+"""
\ No newline at end of file
diff --git a/metagpt/ext/aflow/scripts/optimizer.py b/metagpt/ext/aflow/scripts/optimizer.py
index 63da6a56c..8dadc1d1a 100644
--- a/metagpt/ext/aflow/scripts/optimizer.py
+++ b/metagpt/ext/aflow/scripts/optimizer.py
@@ -42,6 +42,7 @@ class Optimizer:
         optimized_path: str = None,
         initial_round: int = 1,
         max_rounds: int = 20,
+        validation_rounds: int = 5,
     ) -> None:
         self.optimize_llm_config = opt_llm_config
         self.optimize_llm = create_llm_instance(self.optimize_llm_config)
@@ -59,6 +60,7 @@ class Optimizer:
         self.top_scores = []
         self.round = initial_round
         self.max_rounds = max_rounds
+        self.validation_rounds = validation_rounds
 
         self.graph_utils = GraphUtils(self.root_path)
         self.data_utils = DataUtils(self.root_path)
@@ -116,7 +118,7 @@ class Optimizer:
             time.sleep(5)
 
     async def _optimize_graph(self):
-        validation_n = 2  # number of runs on the validation set per round
+        validation_n = self.validation_rounds  # number of runs on the validation set per round
         graph_path = f"{self.root_path}/workflows"
         data = self.data_utils.load_results(graph_path)
diff --git a/optimize.py b/optimize.py
new file mode 100644
index 000000000..0eab0a53e
--- /dev/null
+++ b/optimize.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+# @Date    : 8/23/2024 20:00
+# @Author  : didi
+# @Desc    : Entrance of AFlow.
+
+
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.ext.aflow.data.download_data import download
+from metagpt.ext.aflow.scripts.optimizer import DatasetType, Optimizer, QuestionType
+
+# DatasetType, QuestionType, and OptimizerType definitions
+# DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"]
+# QuestionType = Literal["math", "code", "qa"]
+# OptimizerType = Literal["Graph", "Test"]
+
+# On first use, download the datasets and initial rounds; if you want to take a look at the results, download the results as well.
+download(["datasets", "initial_rounds"])
+
+# Crucial parameters
+dataset: DatasetType = "MATH"  # Ensure the type is consistent with DatasetType
+sample: int = 4  # Sample count: how many workflows will be resampled from the generated workflows
+question_type: QuestionType = "math"  # Ensure the type is consistent with QuestionType
+optimized_path: str = "metagpt/ext/aflow/scripts/optimized"  # Path where optimized results are saved
+initial_round: int = 1  # Round to start optimizing from
+max_rounds: int = 20  # Maximum number of optimization rounds
+check_convergence: bool = True  # Whether to stop early once the scores converge
+validation_rounds: int = 5  # Number of validation runs per round
+
+# Configure the LLM models; you can modify `config/config2.yaml` to use more LLMs.
+mini_llm_config = ModelsConfig.default().get("gpt-4o-mini")
+claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620")
+
+# Configure operators.
+operators = [
+    "Custom",  # The basic unit: a fixed node whose prompt the optimizer can modify to create various nodes.
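+    # Each operator listed here must also be defined in optimized_path/template/operator.py
+    # and optimized_path/template/operator.json (see examples/aflow/README.md, step 3).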
+ # "AnswerGenerate" # It's for qa + # "CustomCodeGenerate", # It's for code + "ScEnsemble", # It's for code, math and qa + # "Test", # It's for code + "Programmer", # It's for math +] + +# Create an optimizer instance +optimizer = Optimizer( + dataset=dataset, # Config dataset + question_type=question_type, # Config Question Type + opt_llm_config=claude_llm_config, # Config Optimizer LLM + exec_llm_config=mini_llm_config, # Config Execution LLM + check_convergence=check_convergence, # Whether Early Stop + operators=operators, # Config Operators you want to use + optimized_path=optimized_path, # Config Optimized workflow's file path + sample=sample, # Only Top(sample) rounds will be selected. + initial_round=initial_round, # Optimize from initial round + max_rounds=max_rounds, # The max iteration of AFLOW. + validation_rounds=validation_rounds, # The validation rounds of AFLOW. +) + +if __name__ == "__main__": + # Optimize workflow via setting the optimizer's mode to 'Graph' + optimizer.optimize("Graph") + # Test workflow via setting the optimizer's mode to 'Test' + # optimizer.optimize("Test")