diff --git a/docs/resources/AFLOW-experiment.jpg b/docs/resources/aflow/AFLOW-experiment.jpg similarity index 100% rename from docs/resources/AFLOW-experiment.jpg rename to docs/resources/aflow/AFLOW-experiment.jpg diff --git a/docs/resources/AFLOW-method.jpg b/docs/resources/aflow/AFLOW-method.jpg similarity index 100% rename from docs/resources/AFLOW-method.jpg rename to docs/resources/aflow/AFLOW-method.jpg diff --git a/docs/resources/AFLOW-performance.jpg b/docs/resources/aflow/AFLOW-performance.jpg similarity index 100% rename from docs/resources/AFLOW-performance.jpg rename to docs/resources/aflow/AFLOW-performance.jpg diff --git a/examples/aflow/README.md b/examples/aflow/README.md index 9567e8cf4..616daeaf0 100644 --- a/examples/aflow/README.md +++ b/examples/aflow/README.md @@ -5,7 +5,7 @@ # AFlow: Automating Agentic Workflow Generation [Read our paper on arXiv](https://arxiv.org/abs/2410.10762)

-Performance Of AFLOW +Performance Of AFlow

## Framework Components @@ -17,7 +17,7 @@ ## Framework Components - **Evaluator**: Assesses workflow performance on given tasks. Provides feedback to guide the optimization process towards more effective workflows. See `metagpt/ext/aflow/scripts/evaluator.py` for details.

-Performance Of AFLOW +Framework of AFlow

## Datasets @@ -26,7 +26,7 @@ ### Experimental Datasets We conducted experiments on six datasets (HumanEval, MBPP, GSM8K, MATH, HotpotQA, DROP) and provide their evaluation code. The data can be found in this [datasets](https://drive.google.com/uc?export=download&id=1DNoegtZiUhWtvkd2xoIuElmIi4ah7k8e) link, or you can download them using `metagpt/ext/aflow/data/download_data.py`

-Performance Of AFLOW +Performance Of AFlow

### Custom Datasets @@ -68,7 +68,7 @@ # Or with custom parameters ``` ## Reproduce the Results in the Paper -1. We provide the raw data obtained from our experiments ([download link](https://drive.google.com/uc?export=download&id=1Sr5wjgKf3bN8OC7G6cO3ynzJqD4w6_Dv)), including the workflows and prompts generated in each iteration, as well as their trajectories on the validation dataset. We also provide the optimal workflow for each dataset and the corresponding data on the test dataset. You can download these data using `metagpt/ext/aflow/data/download_data.py`. +1. We provide the raw data obtained from our experiments in this [link](https://drive.google.com/uc?export=download&id=1Sr5wjgKf3bN8OC7G6cO3ynzJqD4w6_Dv), including the workflows and prompts generated in each iteration, as well as their trajectories on the validation dataset. We also provide the optimal workflow for each dataset and the corresponding data on the test dataset. You can download these data using `metagpt/ext/aflow/data/download_data.py`. 2. You can directly reproduce our experimental results by running the scripts in `examples/aflow/experiments`. diff --git a/examples/aflow/experiments/optimize_drop.py b/examples/aflow/experiments/optimize_drop.py index 801c5222b..73abfe961 100644 --- a/examples/aflow/experiments/optimize_drop.py +++ b/examples/aflow/experiments/optimize_drop.py @@ -3,50 +3,51 @@ # @Author : didi # @Desc : Entrance of AFlow. 
+import argparse + from metagpt.configs.models_config import ModelsConfig -from metagpt.ext.aflow.scripts.optimizer import DatasetType, Optimizer, QuestionType +from metagpt.ext.aflow.scripts.optimizer import Optimizer -# Crucial Parameters -dataset: DatasetType = "DROP" # Ensure the type is consistent with DatasetType -sample: int = 4 # Sample Count, which means how many workflows will be resampled from generated workflows -question_type: QuestionType = "qa" # Ensure the type is consistent with QuestionType -optimized_path: str = "metagpt/ext/aflow/scripts/optimized" # Optimized Result Save Path -initial_round: int = 1 # Corrected the case from Initial_round to initial_round -max_rounds: int = 20 # The max iteration of AFLOW. -check_convergence: bool = True # Whether Early Stop -validation_rounds: int = 5 # The validation rounds of AFLOW. -# Config llm model, you can modify `config/config2.yaml` to use more llms. -mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") -claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") +def parse_args(): + parser = argparse.ArgumentParser(description="AFlow Optimizer for DROP") + parser.add_argument("--dataset", type=str, default="DROP", help="Dataset type") + parser.add_argument("--sample", type=int, default=4, help="Sample count") + parser.add_argument("--question_type", type=str, default="qa", help="Question type") + parser.add_argument( + "--optimized_path", type=str, default="metagpt/ext/aflow/scripts/optimized", help="Optimized result save path" + ) + parser.add_argument("--initial_round", type=int, default=1, help="Initial round") + parser.add_argument("--max_rounds", type=int, default=20, help="Max iteration rounds") + parser.add_argument("--check_convergence", type=lambda v: v.lower() in ("1", "true", "yes"), default=True, help="Whether to enable early stop") + parser.add_argument("--validation_rounds", type=int, default=5, help="Validation rounds") + return parser.parse_args() -# Config operators. 
-operators = [ - "Custom", # It's basic unit of a fixed node. optimizer can modify its prompt to get vairous nodes. - "AnswerGenerate", # It's for qa - # "CustomCodeGenerate", # It's for code - "ScEnsemble", # It's for code, math and qa - # "Test", # It's for code - # "Programmer", # It's for math -] - -# Create an optimizer instance -optimizer = Optimizer( - dataset=dataset, # Config dataset - question_type=question_type, # Config Question Type - opt_llm_config=claude_llm_config, # Config Optimizer LLM - exec_llm_config=mini_llm_config, # Config Execution LLM - check_convergence=check_convergence, # Whether Early Stop - operators=operators, # Config Operators you want to use - optimized_path=optimized_path, # Config Optimized workflow's file path - sample=sample, # Only Top(sample) rounds will be selected. - initial_round=initial_round, # Optimize from initial round - max_rounds=max_rounds, # The max iteration of AFLOW. - validation_rounds=validation_rounds, # The validation rounds of AFLOW. 
-) if __name__ == "__main__": - # Optimize workflow via setting the optimizer's mode to 'Graph' + args = parse_args() + + mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") + claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") + + operators = [ + "Custom", + "AnswerGenerate", + "ScEnsemble", + ] + + optimizer = Optimizer( + dataset=args.dataset, + question_type=args.question_type, + opt_llm_config=claude_llm_config, + exec_llm_config=mini_llm_config, + check_convergence=args.check_convergence, + operators=operators, + optimized_path=args.optimized_path, + sample=args.sample, + initial_round=args.initial_round, + max_rounds=args.max_rounds, + validation_rounds=args.validation_rounds, + ) + optimizer.optimize("Graph") - # Test workflow via setting the optimizer's mode to 'Test' - # optimizer.optimize("Test") diff --git a/examples/aflow/experiments/optimize_gsm8k.py b/examples/aflow/experiments/optimize_gsm8k.py index e34fdb66d..17eafb664 100644 --- a/examples/aflow/experiments/optimize_gsm8k.py +++ b/examples/aflow/experiments/optimize_gsm8k.py @@ -3,50 +3,51 @@ # @Author : didi # @Desc : Entrance of AFlow. +import argparse + from metagpt.configs.models_config import ModelsConfig -from metagpt.ext.aflow.scripts.optimizer import DatasetType, Optimizer, QuestionType +from metagpt.ext.aflow.scripts.optimizer import Optimizer -# Crucial Parameters -dataset: DatasetType = "GSM8K" # Ensure the type is consistent with DatasetType -sample: int = 4 # Sample Count, which means how many workflows will be resampled from generated workflows -question_type: QuestionType = "math" # Ensure the type is consistent with QuestionType -optimized_path: str = "metagpt/ext/aflow/scripts/optimized" # Optimized Result Save Path -initial_round: int = 1 # Corrected the case from Initial_round to initial_round -max_rounds: int = 20 # The max iteration of AFLOW. 
-check_convergence: bool = True # Whether Early Stop -validation_rounds: int = 5 # The validation rounds of AFLOW. -# Config llm model, you can modify `config/config2.yaml` to use more llms. -mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") -claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") +def parse_args(): + parser = argparse.ArgumentParser(description="AFlow Optimizer for GSM8K") + parser.add_argument("--dataset", type=str, default="GSM8K", help="Dataset type") + parser.add_argument("--sample", type=int, default=4, help="Sample count") + parser.add_argument("--question_type", type=str, default="math", help="Question type") + parser.add_argument( + "--optimized_path", type=str, default="metagpt/ext/aflow/scripts/optimized", help="Optimized result save path" + ) + parser.add_argument("--initial_round", type=int, default=1, help="Initial round") + parser.add_argument("--max_rounds", type=int, default=20, help="Max iteration rounds") + parser.add_argument("--check_convergence", type=lambda v: v.lower() in ("1", "true", "yes"), default=True, help="Whether to enable early stop") + parser.add_argument("--validation_rounds", type=int, default=5, help="Validation rounds") + return parser.parse_args() -# Config operators. -operators = [ - "Custom", # It's basic unit of a fixed node. optimizer can modify its prompt to get vairous nodes. 
- # "AnswerGenerate", # It's for qa - # "CustomCodeGenerate", # It's for code - "ScEnsemble", # It's for code, math and qa - # "Test", # It's for code - "Programmer", # It's for math -] - -# Create an optimizer instance -optimizer = Optimizer( - dataset=dataset, # Config dataset - question_type=question_type, # Config Question Type - opt_llm_config=claude_llm_config, # Config Optimizer LLM - exec_llm_config=mini_llm_config, # Config Execution LLM - check_convergence=check_convergence, # Whether Early Stop - operators=operators, # Config Operators you want to use - optimized_path=optimized_path, # Config Optimized workflow's file path - sample=sample, # Only Top(sample) rounds will be selected. - initial_round=initial_round, # Optimize from initial round - max_rounds=max_rounds, # The max iteration of AFLOW. - validation_rounds=validation_rounds, # The validation rounds of AFLOW. -) if __name__ == "__main__": - # Optimize workflow via setting the optimizer's mode to 'Graph' + args = parse_args() + + mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") + claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") + + operators = [ + "Custom", + "ScEnsemble", + "Programmer", + ] + + optimizer = Optimizer( + dataset=args.dataset, + question_type=args.question_type, + opt_llm_config=claude_llm_config, + exec_llm_config=mini_llm_config, + check_convergence=args.check_convergence, + operators=operators, + optimized_path=args.optimized_path, + sample=args.sample, + initial_round=args.initial_round, + max_rounds=args.max_rounds, + validation_rounds=args.validation_rounds, + ) + optimizer.optimize("Graph") - # Test workflow via setting the optimizer's mode to 'Test' - # optimizer.optimize("Test") diff --git a/examples/aflow/experiments/optimize_hotpotqa.py b/examples/aflow/experiments/optimize_hotpotqa.py index 92d26ddd5..4ea2076a2 100644 --- a/examples/aflow/experiments/optimize_hotpotqa.py +++ b/examples/aflow/experiments/optimize_hotpotqa.py @@ 
-3,50 +3,51 @@ # @Author : didi # @Desc : Entrance of AFlow. +import argparse + from metagpt.configs.models_config import ModelsConfig -from metagpt.ext.aflow.scripts.optimizer import DatasetType, Optimizer, QuestionType +from metagpt.ext.aflow.scripts.optimizer import Optimizer -# Crucial Parameters -dataset: DatasetType = "HotpotQA" # Ensure the type is consistent with DatasetType -sample: int = 4 # Sample Count, which means how many workflows will be resampled from generated workflows -question_type: QuestionType = "qa" # Ensure the type is consistent with QuestionType -optimized_path: str = "metagpt/ext/aflow/scripts/optimized" # Optimized Result Save Path -initial_round: int = 1 # Corrected the case from Initial_round to initial_round -max_rounds: int = 20 # The max iteration of AFLOW. -check_convergence: bool = True # Whether Early Stop -validation_rounds: int = 5 # The validation rounds of AFLOW. -# Config llm model, you can modify `config/config2.yaml` to use more llms. -mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") -claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") +def parse_args(): + parser = argparse.ArgumentParser(description="AFlow Optimizer for HotpotQA") + parser.add_argument("--dataset", type=str, default="HotpotQA", help="Dataset type") + parser.add_argument("--sample", type=int, default=4, help="Sample count") + parser.add_argument("--question_type", type=str, default="qa", help="Question type") + parser.add_argument( + "--optimized_path", type=str, default="metagpt/ext/aflow/scripts/optimized", help="Optimized result save path" + ) + parser.add_argument("--initial_round", type=int, default=1, help="Initial round") + parser.add_argument("--max_rounds", type=int, default=20, help="Max iteration rounds") + parser.add_argument("--check_convergence", type=lambda v: v.lower() in ("1", "true", "yes"), default=True, help="Whether to enable early stop") + parser.add_argument("--validation_rounds", type=int, default=5, help="Validation rounds") + return 
parser.parse_args() -# Config operators. -operators = [ - "Custom", # It's basic unit of a fixed node. optimizer can modify its prompt to get vairous nodes. - "AnswerGenerate", # It's for qa - # "CustomCodeGenerate", # It's for code - "ScEnsemble", # It's for code, math and qa - # "Test", # It's for code - # "Programmer", # It's for math -] - -# Create an optimizer instance -optimizer = Optimizer( - dataset=dataset, # Config dataset - question_type=question_type, # Config Question Type - opt_llm_config=claude_llm_config, # Config Optimizer LLM - exec_llm_config=mini_llm_config, # Config Execution LLM - check_convergence=check_convergence, # Whether Early Stop - operators=operators, # Config Operators you want to use - optimized_path=optimized_path, # Config Optimized workflow's file path - sample=sample, # Only Top(sample) rounds will be selected. - initial_round=initial_round, # Optimize from initial round - max_rounds=max_rounds, # The max iteration of AFLOW. - validation_rounds=validation_rounds, # The validation rounds of AFLOW. 
-) if __name__ == "__main__": - # Optimize workflow via setting the optimizer's mode to 'Graph' + args = parse_args() + + mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") + claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") + + operators = [ + "Custom", + "AnswerGenerate", + "ScEnsemble", + ] + + optimizer = Optimizer( + dataset=args.dataset, + question_type=args.question_type, + opt_llm_config=claude_llm_config, + exec_llm_config=mini_llm_config, + check_convergence=args.check_convergence, + operators=operators, + optimized_path=args.optimized_path, + sample=args.sample, + initial_round=args.initial_round, + max_rounds=args.max_rounds, + validation_rounds=args.validation_rounds, + ) + optimizer.optimize("Graph") - # Test workflow via setting the optimizer's mode to 'Test' - # optimizer.optimize("Test") diff --git a/examples/aflow/experiments/optimize_humaneval.py b/examples/aflow/experiments/optimize_humaneval.py index 6027e9ec8..20597d395 100644 --- a/examples/aflow/experiments/optimize_humaneval.py +++ b/examples/aflow/experiments/optimize_humaneval.py @@ -3,50 +3,52 @@ # @Author : didi # @Desc : Entrance of AFlow. +import argparse + from metagpt.configs.models_config import ModelsConfig -from metagpt.ext.aflow.scripts.optimizer import DatasetType, Optimizer, QuestionType +from metagpt.ext.aflow.scripts.optimizer import Optimizer -# Crucial Parameters -dataset: DatasetType = "HumanEval" # Ensure the type is consistent with DatasetType -sample: int = 4 # Sample Count, which means how many workflows will be resampled from generated workflows -question_type: QuestionType = "code" # Ensure the type is consistent with QuestionType -optimized_path: str = "metagpt/ext/aflow/scripts/optimized" # Optimized Result Save Path -initial_round: int = 1 # Corrected the case from Initial_round to initial_round -max_rounds: int = 20 # The max iteration of AFLOW. 
-check_convergence: bool = True # Whether Early Stop -validation_rounds: int = 5 # The validation rounds of AFLOW. -# Config llm model, you can modify `config/config2.yaml` to use more llms. -mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") -claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") +def parse_args(): + parser = argparse.ArgumentParser(description="AFlow Optimizer for HumanEval") + parser.add_argument("--dataset", type=str, default="HumanEval", help="Dataset type") + parser.add_argument("--sample", type=int, default=4, help="Sample count") + parser.add_argument("--question_type", type=str, default="code", help="Question type") + parser.add_argument( + "--optimized_path", type=str, default="metagpt/ext/aflow/scripts/optimized", help="Optimized result save path" + ) + parser.add_argument("--initial_round", type=int, default=1, help="Initial round") + parser.add_argument("--max_rounds", type=int, default=20, help="Max iteration rounds") + parser.add_argument("--check_convergence", type=lambda v: v.lower() in ("1", "true", "yes"), default=True, help="Whether to enable early stop") + parser.add_argument("--validation_rounds", type=int, default=5, help="Validation rounds") + return parser.parse_args() -# Config operators. -operators = [ - "Custom", # It's basic unit of a fixed node. optimizer can modify its prompt to get vairous nodes. 
- # "AnswerGenerate", # It's for qa - "CustomCodeGenerate", # It's for code - "ScEnsemble", # It's for code, math and qa - "Test", # It's for code - # "Programmer", # It's for math -] - -# Create an optimizer instance -optimizer = Optimizer( - dataset=dataset, # Config dataset - question_type=question_type, # Config Question Type - opt_llm_config=claude_llm_config, # Config Optimizer LLM - exec_llm_config=mini_llm_config, # Config Execution LLM - check_convergence=check_convergence, # Whether Early Stop - operators=operators, # Config Operators you want to use - optimized_path=optimized_path, # Config Optimized workflow's file path - sample=sample, # Only Top(sample) rounds will be selected. - initial_round=initial_round, # Optimize from initial round - max_rounds=max_rounds, # The max iteration of AFLOW. - validation_rounds=validation_rounds, # The validation rounds of AFLOW. -) if __name__ == "__main__": - # Optimize workflow via setting the optimizer's mode to 'Graph' + args = parse_args() + + mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") + claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") + + operators = [ + "Custom", + "CustomCodeGenerate", + "ScEnsemble", + "Test", + ] + + optimizer = Optimizer( + dataset=args.dataset, + question_type=args.question_type, + opt_llm_config=claude_llm_config, + exec_llm_config=mini_llm_config, + check_convergence=args.check_convergence, + operators=operators, + optimized_path=args.optimized_path, + sample=args.sample, + initial_round=args.initial_round, + max_rounds=args.max_rounds, + validation_rounds=args.validation_rounds, + ) + optimizer.optimize("Graph") - # Test workflow via setting the optimizer's mode to 'Test' - # optimizer.optimize("Test") diff --git a/examples/aflow/experiments/optimize_math.py b/examples/aflow/experiments/optimize_math.py index 5d951c168..40cc2b0d1 100644 --- a/examples/aflow/experiments/optimize_math.py +++ b/examples/aflow/experiments/optimize_math.py @@ 
-3,50 +3,51 @@ # @Author : didi # @Desc : Entrance of AFlow. +import argparse + from metagpt.configs.models_config import ModelsConfig -from metagpt.ext.aflow.scripts.optimizer import DatasetType, Optimizer, QuestionType +from metagpt.ext.aflow.scripts.optimizer import Optimizer -# Crucial Parameters -dataset: DatasetType = "MATH" # Ensure the type is consistent with DatasetType -sample: int = 4 # Sample Count, which means how many workflows will be resampled from generated workflows -question_type: QuestionType = "math" # Ensure the type is consistent with QuestionType -optimized_path: str = "metagpt/ext/aflow/scripts/optimized" # Optimized Result Save Path -initial_round: int = 1 # Corrected the case from Initial_round to initial_round -max_rounds: int = 20 # The max iteration of AFLOW. -check_convergence: bool = True # Whether Early Stop -validation_rounds: int = 5 # The validation rounds of AFLOW. -# Config llm model, you can modify `config/config2.yaml` to use more llms. -mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") -claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") +def parse_args(): + parser = argparse.ArgumentParser(description="AFlow Optimizer for MATH") + parser.add_argument("--dataset", type=str, default="MATH", help="Dataset type") + parser.add_argument("--sample", type=int, default=4, help="Sample count") + parser.add_argument("--question_type", type=str, default="math", help="Question type") + parser.add_argument( + "--optimized_path", type=str, default="metagpt/ext/aflow/scripts/optimized", help="Optimized result save path" + ) + parser.add_argument("--initial_round", type=int, default=1, help="Initial round") + parser.add_argument("--max_rounds", type=int, default=20, help="Max iteration rounds") + parser.add_argument("--check_convergence", type=lambda v: v.lower() in ("1", "true", "yes"), default=True, help="Whether to enable early stop") + parser.add_argument("--validation_rounds", type=int, default=5, help="Validation rounds") + return 
parser.parse_args() -# Config operators. -operators = [ - "Custom", # It's basic unit of a fixed node. optimizer can modify its prompt to get vairous nodes. - # "AnswerGenerate", # It's for qa - # "CustomCodeGenerate", # It's for code - "ScEnsemble", # It's for code, math and qa - # "Test", # It's for code - "Programmer", # It's for math -] - -# Create an optimizer instance -optimizer = Optimizer( - dataset=dataset, # Config dataset - question_type=question_type, # Config Question Type - opt_llm_config=claude_llm_config, # Config Optimizer LLM - exec_llm_config=mini_llm_config, # Config Execution LLM - check_convergence=check_convergence, # Whether Early Stop - operators=operators, # Config Operators you want to use - optimized_path=optimized_path, # Config Optimized workflow's file path - sample=sample, # Only Top(sample) rounds will be selected. - initial_round=initial_round, # Optimize from initial round - max_rounds=max_rounds, # The max iteration of AFLOW. - validation_rounds=validation_rounds, # The validation rounds of AFLOW. 
-) if __name__ == "__main__": - # Optimize workflow via setting the optimizer's mode to 'Graph' + args = parse_args() + + mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") + claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") + + operators = [ + "Custom", + "ScEnsemble", + "Programmer", + ] + + optimizer = Optimizer( + dataset=args.dataset, + question_type=args.question_type, + opt_llm_config=claude_llm_config, + exec_llm_config=mini_llm_config, + check_convergence=args.check_convergence, + operators=operators, + optimized_path=args.optimized_path, + sample=args.sample, + initial_round=args.initial_round, + max_rounds=args.max_rounds, + validation_rounds=args.validation_rounds, + ) + optimizer.optimize("Graph") - # Test workflow via setting the optimizer's mode to 'Test' - # optimizer.optimize("Test") diff --git a/examples/aflow/experiments/optimize_mbpp.py b/examples/aflow/experiments/optimize_mbpp.py index 00c008bbf..a27be8147 100644 --- a/examples/aflow/experiments/optimize_mbpp.py +++ b/examples/aflow/experiments/optimize_mbpp.py @@ -3,50 +3,52 @@ # @Author : didi # @Desc : Entrance of AFlow. +import argparse + from metagpt.configs.models_config import ModelsConfig -from metagpt.ext.aflow.scripts.optimizer import DatasetType, Optimizer, QuestionType +from metagpt.ext.aflow.scripts.optimizer import Optimizer -# Crucial Parameters -dataset: DatasetType = "MBPP" # Ensure the type is consistent with DatasetType -sample: int = 4 # Sample Count, which means how many workflows will be resampled from generated workflows -question_type: QuestionType = "code" # Ensure the type is consistent with QuestionType -optimized_path: str = "metagpt/ext/aflow/scripts/optimized" # Optimized Result Save Path -initial_round: int = 1 # Corrected the case from Initial_round to initial_round -max_rounds: int = 20 # The max iteration of AFLOW. 
-check_convergence: bool = True # Whether Early Stop -validation_rounds: int = 5 # The validation rounds of AFLOW. -# Config llm model, you can modify `config/config2.yaml` to use more llms. -mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") -claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") +def parse_args(): + parser = argparse.ArgumentParser(description="AFlow Optimizer for MBPP") + parser.add_argument("--dataset", type=str, default="MBPP", help="Dataset type") + parser.add_argument("--sample", type=int, default=4, help="Sample count") + parser.add_argument("--question_type", type=str, default="code", help="Question type") + parser.add_argument( + "--optimized_path", type=str, default="metagpt/ext/aflow/scripts/optimized", help="Optimized result save path" + ) + parser.add_argument("--initial_round", type=int, default=1, help="Initial round") + parser.add_argument("--max_rounds", type=int, default=20, help="Max iteration rounds") + parser.add_argument("--check_convergence", type=lambda v: v.lower() in ("1", "true", "yes"), default=True, help="Whether to enable early stop") + parser.add_argument("--validation_rounds", type=int, default=5, help="Validation rounds") + return parser.parse_args() -# Config operators. -operators = [ - "Custom", # It's basic unit of a fixed node. optimizer can modify its prompt to get vairous nodes. 
- # "AnswerGenerate", # It's for qa - "CustomCodeGenerate", # It's for code - "ScEnsemble", # It's for code, math and qa - "Test", # It's for code - # "Programmer", # It's for math -] - -# Create an optimizer instance -optimizer = Optimizer( - dataset=dataset, # Config dataset - question_type=question_type, # Config Question Type - opt_llm_config=claude_llm_config, # Config Optimizer LLM - exec_llm_config=mini_llm_config, # Config Execution LLM - check_convergence=check_convergence, # Whether Early Stop - operators=operators, # Config Operators you want to use - optimized_path=optimized_path, # Config Optimized workflow's file path - sample=sample, # Only Top(sample) rounds will be selected. - initial_round=initial_round, # Optimize from initial round - max_rounds=max_rounds, # The max iteration of AFLOW. - validation_rounds=validation_rounds, # The validation rounds of AFLOW. -) if __name__ == "__main__": - # Optimize workflow via setting the optimizer's mode to 'Graph' + args = parse_args() + + mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") + claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") + + operators = [ + "Custom", + "CustomCodeGenerate", + "ScEnsemble", + "Test", + ] + + optimizer = Optimizer( + dataset=args.dataset, + question_type=args.question_type, + opt_llm_config=claude_llm_config, + exec_llm_config=mini_llm_config, + check_convergence=args.check_convergence, + operators=operators, + optimized_path=args.optimized_path, + sample=args.sample, + initial_round=args.initial_round, + max_rounds=args.max_rounds, + validation_rounds=args.validation_rounds, + ) + optimizer.optimize("Graph") - # Test workflow via setting the optimizer's mode to 'Test' - # optimizer.optimize("Test") diff --git a/examples/aflow/optimize.py b/examples/aflow/optimize.py index 65b194344..e24facb3a 100644 --- a/examples/aflow/optimize.py +++ b/examples/aflow/optimize.py @@ -9,17 +9,17 @@ from metagpt.configs.models_config import ModelsConfig 
from metagpt.ext.aflow.data.download_data import download from metagpt.ext.aflow.scripts.optimizer import Optimizer -# DatasetType, QuestionType, and OptimizerType definitions -# DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"] -# QuestionType = Literal["math", "code", "qa"] -# OptimizerType = Literal["Graph", "Test"] - def parse_args(): parser = argparse.ArgumentParser(description="AFlow Optimizer") - parser.add_argument("--dataset", type=str, default="MATH", help="Dataset type") + parser.add_argument( + "--dataset", + type=str, + default="MATH", + help="Dataset type, including HumanEval, MBPP, GSM8K, MATH, HotpotQA, DROP", + ) parser.add_argument("--sample", type=int, default=4, help="Sample count") - parser.add_argument("--question_type", type=str, default="math", help="Question type") + parser.add_argument("--question_type", type=str, default="math", help="Question type, including math, code, qa") parser.add_argument( "--optimized_path", type=str, default="metagpt/ext/aflow/scripts/optimized", help="Optimized result save path" ) diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py index ab190b736..a974b35d1 100644 --- a/metagpt/actions/action_node.py +++ b/metagpt/actions/action_node.py @@ -510,8 +510,9 @@ class ActionNode: return {field_name: field.annotation for field_name, field in model_class.model_fields.items()} def xml_compile(self, context): - # TODO 再来一版 - + """ + Compile the prompt to make it easier for the model to understand the format. 
+ """ field_names = self.get_field_names() # Construct the example using the field names examples = [] diff --git a/metagpt/ext/aflow/benchmark/utils.py b/metagpt/ext/aflow/benchmark/utils.py index 60cbe5580..846101bc0 100644 --- a/metagpt/ext/aflow/benchmark/utils.py +++ b/metagpt/ext/aflow/benchmark/utils.py @@ -11,12 +11,12 @@ import os import numpy as np -from metagpt.utils.common import write_json_file +from metagpt.utils.common import read_json_file, write_json_file def generate_random_indices(n, n_samples, test=False): """ - 生成随机索引 + Generate random indices """ def _set_seed(seed=42): @@ -52,20 +52,16 @@ def log_mismatch(problem, expected_output, prediction, predicted_number, path): log_file = os.path.join(path, "log.json") - # 检查log文件是否已经存在 + # Check if the log file already exists if os.path.exists(log_file): - # 如果存在,加载现有的日志数据 - with open(log_file, "r", encoding="utf-8") as f: - try: - data = json.load(f) - except json.JSONDecodeError: - data = [] + # If it exists, load the existing log data + data = read_json_file(log_file) else: - # 如果不存在,创建一个新的日志列表 + # If it does not exist, create a new log list data = [] - # 添加新的日志记录 + # Add the new log entry data.append(log_data) - # 将数据写回到log.json文件 + # Write the data back to log.json file write_json_file(log_file, data, encoding="utf-8", indent=4) diff --git a/metagpt/ext/aflow/scripts/optimizer.py b/metagpt/ext/aflow/scripts/optimizer.py index 8dadc1d1a..0ac4827e7 100644 --- a/metagpt/ext/aflow/scripts/optimizer.py +++ b/metagpt/ext/aflow/scripts/optimizer.py @@ -10,6 +10,7 @@ from typing import List, Literal from pydantic import BaseModel, Field from metagpt.actions.action_node import ActionNode +from metagpt.ext.aflow.scripts.evaluator import DatasetType from metagpt.ext.aflow.scripts.optimizer_utils.convergence_utils import ConvergenceUtils from metagpt.ext.aflow.scripts.optimizer_utils.data_utils import DataUtils from metagpt.ext.aflow.scripts.optimizer_utils.evaluation_utils import EvaluationUtils @@ -18,7 
+19,6 @@ from metagpt.ext.aflow.scripts.optimizer_utils.graph_utils import GraphUtils from metagpt.logs import logger from metagpt.provider.llm_provider_registry import create_llm_instance -DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"] QuestionType = Literal["math", "code", "qa"] OptimizerType = Literal["Graph", "Test"] diff --git a/metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py b/metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py index 246a94798..0e275f496 100644 --- a/metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py +++ b/metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py @@ -76,8 +76,8 @@ class ConvergenceUtils: if len(self.avg_scores) < top_k + 1: return False, None, None convergence_count = 0 # Convergence counter - previous_Y = None # Y value of the previous round (average of top_k scores) - sigma_Y_previous = None # Standard error of Y value from previous round + previous_y = None # Y value of the previous round (average of top_k scores) + sigma_y_previous = None # Standard error of Y value from previous round for i in range(len(self.avg_scores)): # Dynamically select top_k from current round and all previous rounds top_k_indices = np.argsort(self.avg_scores[: i + 1])[::-1][ @@ -87,18 +87,18 @@ class ConvergenceUtils: top_k_stds = [ self.stds[j] for j in top_k_indices ] # Get list of standard deviations corresponding to top k scores - # Calculate mean of top k scores for current round, i.e., Y_current - Y_current = np.mean(top_k_scores) - # Calculate standard error of Y_current (sigma_Y_current), representing score dispersion - sigma_Y_current = np.sqrt(np.sum([s**2 for s in top_k_stds]) / (top_k**2)) + # Calculate mean of top k scores for current round, i.e., y_current + y_current = np.mean(top_k_scores) + # Calculate standard error of y_current (sigma_y_current), representing score dispersion + sigma_y_current = np.sqrt(np.sum([s**2 for s in top_k_stds]) / (top_k**2)) # 
If not the first round, calculate change in Y (Delta_Y) and corresponding standard error - if previous_Y is not None: + if previous_y is not None: # Calculate Y difference between current round and previous round - Delta_Y = Y_current - previous_Y + delta_y = y_current - previous_y # Calculate standard error of Y difference (sigma_Delta_Y) - sigma_Delta_Y = np.sqrt(sigma_Y_current**2 + sigma_Y_previous**2) + sigma_delta_y = np.sqrt(sigma_y_current**2 + sigma_y_previous**2) # Check if Y change is within acceptable confidence interval, i.e., convergence condition - if abs(Delta_Y) <= z * sigma_Delta_Y: + if abs(delta_y) <= z * sigma_delta_y: convergence_count += 1 # If consecutive converged rounds reach set value, return convergence information if convergence_count >= consecutive_rounds: @@ -107,8 +107,8 @@ class ConvergenceUtils: # If change is large, reset convergence counter convergence_count = 0 # Update Y value and standard error for previous round - previous_Y = Y_current - sigma_Y_previous = sigma_Y_current + previous_y = y_current + sigma_y_previous = sigma_y_current # If convergence condition not met, return not converged return False, None, None diff --git a/metagpt/ext/aflow/scripts/optimizer_utils/data_utils.py b/metagpt/ext/aflow/scripts/optimizer_utils/data_utils.py index f55ff8740..2df161ed8 100644 --- a/metagpt/ext/aflow/scripts/optimizer_utils/data_utils.py +++ b/metagpt/ext/aflow/scripts/optimizer_utils/data_utils.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd from metagpt.logs import logger +from metagpt.utils.common import read_json_file, write_json_file class DataUtils: @@ -17,11 +18,7 @@ class DataUtils: def load_results(self, path: str) -> list: result_path = os.path.join(path, "results.json") if os.path.exists(result_path): - with open(result_path, "r") as json_file: - try: - return json.load(json_file) - except json.JSONDecodeError: - return [] + return read_json_file(result_path, encoding="utf-8") return [] def 
get_top_rounds(self, sample: int, path=None, mode="Graph"): @@ -97,8 +94,7 @@ class DataUtils: if not os.path.exists(log_dir): return "" # 如果文件不存在,返回空字符串 logger.info(log_dir) - with open(log_dir, "r", encoding="utf-8") as f: - data = json.load(f) + data = read_json_file(log_dir, encoding="utf-8") if isinstance(data, dict): data = [data] @@ -125,8 +121,7 @@ class DataUtils: return {"round": round, "score": score, "avg_cost": avg_cost, "total_cost": total_cost, "time": now} def save_results(self, json_file_path: str, data: list): - with open(json_file_path, "w") as json_file: - json.dump(data, json_file, default=str, indent=4) + write_json_file(json_file_path, data, encoding="utf-8", indent=4) def _load_scores(self, path=None, mode="Graph"): if mode == "Graph": @@ -137,8 +132,7 @@ class DataUtils: result_file = os.path.join(rounds_dir, "results.json") self.top_scores = [] - with open(result_file, "r", encoding="utf-8") as file: - data = json.load(file) + data = read_json_file(result_file, encoding="utf-8") df = pd.DataFrame(data) scores_per_round = df.groupby("round")["score"].mean().to_dict() diff --git a/metagpt/ext/aflow/scripts/optimizer_utils/experience_utils.py b/metagpt/ext/aflow/scripts/optimizer_utils/experience_utils.py index cffd8b522..43f9eb1d5 100644 --- a/metagpt/ext/aflow/scripts/optimizer_utils/experience_utils.py +++ b/metagpt/ext/aflow/scripts/optimizer_utils/experience_utils.py @@ -3,6 +3,7 @@ import os from collections import defaultdict from metagpt.logs import logger +from metagpt.utils.common import read_json_file, write_json_file class ExperienceUtils: @@ -24,23 +25,22 @@ class ExperienceUtils: round_number = int(round_dir.split("_")[1]) json_file_path = os.path.join(round_path, "experience.json") if os.path.exists(json_file_path): - with open(json_file_path, "r", encoding="utf-8") as json_file: - data = json.load(json_file) - father_node = data["father node"] + data = read_json_file(json_file_path, encoding="utf-8") + father_node = 
data["father node"] - if experience_data[father_node]["score"] is None: - experience_data[father_node]["score"] = data["before"] + if experience_data[father_node]["score"] is None: + experience_data[father_node]["score"] = data["before"] - if data["succeed"]: - experience_data[father_node]["success"][round_number] = { - "modification": data["modification"], - "score": data["after"], - } - else: - experience_data[father_node]["failure"][round_number] = { - "modification": data["modification"], - "score": data["after"], - } + if data["succeed"]: + experience_data[father_node]["success"][round_number] = { + "modification": data["modification"], + "score": data["after"], + } + else: + experience_data[father_node]["failure"][round_number] = { + "modification": data["modification"], + "score": data["after"], + } except Exception as e: logger.info(f"Error processing {round_dir}: {str(e)}") @@ -93,5 +93,4 @@ class ExperienceUtils: experience["after"] = avg_score experience["succeed"] = bool(avg_score > experience["before"]) - with open(os.path.join(directory, "experience.json"), "w", encoding="utf-8") as file: - json.dump(experience, file, ensure_ascii=False, indent=4) + write_json_file(os.path.join(directory, "experience.json"), experience, encoding="utf-8", indent=4) diff --git a/metagpt/ext/aflow/scripts/utils.py b/metagpt/ext/aflow/scripts/utils.py index bc97f0818..c2fdd0cb7 100644 --- a/metagpt/ext/aflow/scripts/utils.py +++ b/metagpt/ext/aflow/scripts/utils.py @@ -6,11 +6,17 @@ import json import re +from enum import Enum from typing import Any, List, Tuple -def extract_test_cases_from_jsonl(entry_point: str, dataset: str = "HumanEval"): - if dataset == "HumanEval": +class CodeDataset(Enum): + HUMAN_EVAL = "HumanEval" + MBPP = "MBPP" + + +def extract_test_cases_from_jsonl(entry_point: str, dataset: CodeDataset = CodeDataset.HUMAN_EVAL): + if dataset == CodeDataset.HUMAN_EVAL: file_path = "metagpt/ext/aflow/data/humaneval_public_test.jsonl" # Retain the original 
hardcoded test cases hardcoded_cases = { @@ -25,7 +31,7 @@ def extract_test_cases_from_jsonl(entry_point: str, dataset: str = "HumanEval"): "sum_squares": "", "starts_one_ends": "", } - elif dataset == "MBPP": + elif dataset == CodeDataset.MBPP: file_path = "metagpt/ext/aflow/data/mbpp_public_test.jsonl" hardcoded_cases = { "remove_odd": "", diff --git a/metagpt/ext/aflow/scripts/workflow.py b/metagpt/ext/aflow/scripts/workflow.py index d0f883071..47b54021b 100644 --- a/metagpt/ext/aflow/scripts/workflow.py +++ b/metagpt/ext/aflow/scripts/workflow.py @@ -3,13 +3,11 @@ # @Author : didi # @Desc : Basic Graph Class -from typing import Literal +from metagpt.ext.aflow.scripts.evaluator import DatasetType from metagpt.provider.llm_provider_registry import create_llm_instance from metagpt.utils.cost_manager import CostManager -DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"] - class Workflow: def __init__(