From e575b629a12f8fd5c269237028b4d15790513a1a Mon Sep 17 00:00:00 2001 From: didi <84363704+didiforgithub@users.noreply.github.com> Date: Tue, 22 Oct 2024 11:41:45 +0800 Subject: [PATCH] Resolve comment and modify readme --- examples/aflow/README-.md | 57 +++++++++++++++ examples/aflow/readme.md | 70 ------------------- metagpt/ext/aflow/README.md | 51 +++++--------- .../optimizer_utils/convergence_utils.py | 65 +++++++++-------- 4 files changed, 111 insertions(+), 132 deletions(-) create mode 100644 examples/aflow/README-.md delete mode 100644 examples/aflow/readme.md diff --git a/examples/aflow/README-.md b/examples/aflow/README-.md new file mode 100644 index 000000000..4adb92a7e --- /dev/null +++ b/examples/aflow/README-.md @@ -0,0 +1,57 @@ +# AFlow: Automating Agentic Workflow Generation + +AFlow is a framework for automatically generating and optimizing Agentic Workflows. It uses Monte Carlo tree search in a code-represented workflow space to find effective workflows, replacing manual development with machine effort. Our approach shows potential to outperform handcrafted workflows on various tasks. + +[Read our paper on arXiv](https://arxiv.org/abs/2410.10762) + +![Performance of AFlow](examples/aflow/img/performance_of_AFlow.png) + +## Framework Components + +- **Node**: Basic unit of LLM invocation. See `metagpt/actions/action_node.py` for a flexible interface to control LLM, temperature, format, and prompt. +- **Operator**: Predefined combinations of Nodes to enhance search efficiency. Encapsulates common operations like Generate, Format, Review, Revise, Ensemble, Test, and Programmer. See `metagpt/ext/aflow/operator.py` for details. You can customize your own Operator by referencing the implementations in this code. +- **Workflow**: A sequence of LLM-invoking nodes connected by edges. Can be represented as graphs, neural networks, or code to express various execution structures. See `metagpt/ext/aflow/workflow.py` for our implementation. 
+- **Optimizer**: Uses LLMs within a Monte Carlo Tree Search variant to explore and refine workflows. Iteratively selects, expands, evaluates, and updates workflows based on performance. See `metagpt/ext/aflow/scripts/optimizer.py` for details. +- **Evaluator**: Assesses workflow performance on given tasks. Provides feedback to guide the optimization process towards more effective workflows. See `metagpt/ext/aflow/scripts/evaluator.py` for details. + +## Datasets + +### Experimental Datasets +We conducted experiments on six datasets (HumanEval, MBPP, GSM8K, MATH, HotpotQA, DROP) and provide their evaluation code. The data can be found at this [datasets](https://drive.google.com/uc?export=download&id=1DNoegtZiUhWtvkd2xoIuElmIi4ah7k8e) link, or you can download it using `metagpt/ext/aflow/data/download_data.py`. + +### Custom Datasets +For custom tasks, you can refer to the code in the `metagpt/ext/aflow/benchmark` folder. Inherit the `BaseBenchmark` class and implement `evaluate_problem`, `calculate_score`, and `get_result_columns` to add your custom dataset benchmark. Then, add your benchmark name in `metagpt/ext/aflow/scripts/evaluator.py` and `metagpt/ext/aflow/scripts/optimizer.py` to find effective workflows for your custom dataset. + +## Quick Start + +1. 
Configure your search in `optimize.py`: + - Open `examples/aflow/optimize.py` + - Set the following parameters: + ```python + dataset = "HumanEval" # Choose from: "HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP" or your custom dataset name + question_type = "code" # Choose from: "math", "code", "qa" + sample = 4 # Number of samples to use for optimization + check_convergence = True # Whether to check for convergence + optimized_path = "path/to/optimized/workflows" # Path to save optimized workflows, defaults to metagpt/ext/aflow/scripts/optimized + initial_round = 1 # Starting round number + max_rounds = 20 # Maximum number of optimization rounds + ``` + - Adjust these parameters according to your specific requirements and dataset +2. Set up parameters in `config/config2.yaml` (see `examples/aflow/config2.example.yaml` for reference) +3. Set the operator you want to use in `optimize.py` and in `optimized_path/template/operator.py`, `optimized_path/template/operator.json`. You can reference our implementation to add operators for specific datasets +4. When you first run, you can download the datasets and initial rounds by setting `download(["datasets", "initial_rounds"])` in `examples/aflow/optimize.py` +5. (Optional) Add your custom dataset and corresponding evaluation function following the [Custom Datasets](#custom-datasets) section +6. Run `python examples/aflow/optimize.py` to start the optimization process! 
+ +## Citation + +If you use AFlow in your research, please cite our paper: + +``` +@article{zhang2024aflow, + title={AFlow: Automating Agentic Workflow Generation}, + author={Zhang, Jiayi and Xiang, Jinyu and Yu, Zhaoyang and Teng, Fengwei and Chen, Xionghui and Chen, Jiaqi and Zhuge, Mingchen and Cheng, Xin and Hong, Sirui and Wang, Jinlin and others}, + journal={arXiv preprint arXiv:2410.10762}, + year={2024} +} +``` \ No newline at end of file diff --git a/examples/aflow/readme.md b/examples/aflow/readme.md deleted file mode 100644 index 4fa9bb150..000000000 --- a/examples/aflow/readme.md +++ /dev/null @@ -1,70 +0,0 @@ -# AFlow: Automating Agentic Workflow Generation - -AFlow is a framework for automatically generating and optimizing Agentic Workflows. It uses Monte Carlo tree search in a code-represented workflow space to find effective workflows, replacing manual development with machine effort. Our approach shows potential to outperform handcrafted workflows on various tasks. - -[Read our paper on arXiv](https://arxiv.org/abs/2410.10762) - -[Insert performance graph/image here] - -## Framework Components - -- **Node**: Basic unit of LLM invocation. See `action_node.py` for a flexible interface to control LLM, temperature, format, and prompt. -- **Operator**: Predefined combinations of Nodes to enhance search efficiency. Encapsulates common operations like Generate, Format, Review, Revise, Ensemble, Test, and Programmer. -- **Workflow**: A sequence of LLM-invoking nodes connected by edges. Can be represented as graphs, neural networks, or code to express various execution structures. -- **Optimizer**: Uses LLMs within a Monte Carlo Tree Search variant to explore and refine workflows. Iteratively selects, expands, evaluates, and updates workflows based on performance. -- **Evaluator**: Assesses workflow performance on given tasks. Provides feedback to guide the optimization process towards more effective workflows. 
- -## Datasets - -We provide implementations for [list datasets here]. - -Data is available at [link to data]. - -For custom tasks, [brief instructions or link to documentation]. - -## Quick Start - -1. Configure your search in `optimize.py`: - - Open `metagpt/ext/aflow/scripts/optimize.py` - - Set the following parameters: - ```python - dataset = "HumanEval" # Choose from: "HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP" or your custom dataset name - question_type = "code" # Choose from: "math", "code", "qa" - sample = 5 # Number of samples to use for optimization - check_convergence = True # Whether to check for convergence - optimized_path = "path/to/optimized/workflows" # Path to save optimized workflows - initial_round = 1 # Starting round number - max_rounds = 20 # Maximum number of optimization rounds - ``` - - Adjust these parameters according to your specific requirements and dataset -2. Set up parameters in `config/config2.yaml` (see `metagpt/ext/aflow/config2.example.yaml` for reference) -3. Set the operator you want to use in `optimize.py` and in `xxxx` -4. Download the init round of six datasets and put them in `xxxxxx` -5. 
Add your custom dataset and corresponding evaluation function: - -- Create a new Python file in the `metagpt/ext/aflow/benchmark/` directory, named `{custom_dataset_name}.py` -- Implement the following key functions in this new file: - - `load_data`: for loading the dataset - - `evaluate_problem`: for evaluating a single problem solution - - `evaluate_all_problems`: for evaluating all problems - - `save_results_to_csv`: for saving evaluation results - - `optimize_{custom_dataset_name}_evaluation`: main evaluation function that integrates the above functionalities -- Add your custom dataset name and config val_list in `metagpt/ext/aflow/scripts/evaluator.py` - - -## License - -[License information] - -## Citation - -If you use AFlow in your research, please cite our paper: - -``` -@article{zhang2024aflow, - title={AFlow: Automating Agentic Workflow Generation}, - author={Zhang, Jiayi and Xiang, Jinyu and Yu, Zhaoyang and Teng, Fengwei and Chen, Xionghui and Chen, Jiaqi and Zhuge, Mingchen and Cheng, Xin and Hong, Sirui and Wang, Jinlin and others}, - journal={arXiv preprint arXiv:2410.10762}, - year={2024} -} -``` \ No newline at end of file diff --git a/metagpt/ext/aflow/README.md b/metagpt/ext/aflow/README.md index 4fa9bb150..4adb92a7e 100644 --- a/metagpt/ext/aflow/README.md +++ b/metagpt/ext/aflow/README.md @@ -1,60 +1,47 @@ # AFlow: Automating Agentic Workflow Generation -AFlow is a framework for automatically generating and optimizing Agentic Workflows. It uses Monte Carlo tree search in a code-represented workflow space to find effective workflows, replacing manual development with machine effort. Our approach shows potential to outperform handcrafted workflows on various tasks. +AFlow is a framework for automatically generating and optimizing Agentic Workflows. It uses Monte Carlo tree search in a code-represented workflow space to find effective workflows, replacing manual development with machine effort. 
Our approach shows potential to outperform handcrafted workflows on various tasks. [Read our paper on arXiv](https://arxiv.org/abs/2410.10762) -[Insert performance graph/image here] +![Performance of AFlow](examples/aflow/img/performance_of_AFlow.png) ## Framework Components -- **Node**: Basic unit of LLM invocation. See `action_node.py` for a flexible interface to control LLM, temperature, format, and prompt. -- **Operator**: Predefined combinations of Nodes to enhance search efficiency. Encapsulates common operations like Generate, Format, Review, Revise, Ensemble, Test, and Programmer. -- **Workflow**: A sequence of LLM-invoking nodes connected by edges. Can be represented as graphs, neural networks, or code to express various execution structures. -- **Optimizer**: Uses LLMs within a Monte Carlo Tree Search variant to explore and refine workflows. Iteratively selects, expands, evaluates, and updates workflows based on performance. -- **Evaluator**: Assesses workflow performance on given tasks. Provides feedback to guide the optimization process towards more effective workflows. +- **Node**: Basic unit of LLM invocation. See `metagpt/actions/action_node.py` for a flexible interface to control LLM, temperature, format, and prompt. +- **Operator**: Predefined combinations of Nodes to enhance search efficiency. Encapsulates common operations like Generate, Format, Review, Revise, Ensemble, Test, and Programmer. See `metagpt/ext/aflow/operator.py` for details. You can customize your own Operator by referencing the implementations in this code. +- **Workflow**: A sequence of LLM-invoking nodes connected by edges. Can be represented as graphs, neural networks, or code to express various execution structures. See `metagpt/ext/aflow/workflow.py` for our implementation. +- **Optimizer**: Uses LLMs within a Monte Carlo Tree Search variant to explore and refine workflows. Iteratively selects, expands, evaluates, and updates workflows based on performance. 
See `metagpt/ext/aflow/scripts/optimizer.py` for details. +- **Evaluator**: Assesses workflow performance on given tasks. Provides feedback to guide the optimization process towards more effective workflows. See `metagpt/ext/aflow/scripts/evaluator.py` for details. ## Datasets -We provide implementations for [list datasets here]. +### Experimental Datasets +We conducted experiments on six datasets (HumanEval, MBPP, GSM8K, MATH, HotpotQA, DROP) and provide their evaluation code. The data can be found at this [datasets](https://drive.google.com/uc?export=download&id=1DNoegtZiUhWtvkd2xoIuElmIi4ah7k8e) link, or you can download it using `metagpt/ext/aflow/data/download_data.py`. -Data is available at [link to data]. - -For custom tasks, [brief instructions or link to documentation]. +### Custom Datasets +For custom tasks, you can refer to the code in the `metagpt/ext/aflow/benchmark` folder. Inherit the `BaseBenchmark` class and implement `evaluate_problem`, `calculate_score`, and `get_result_columns` to add your custom dataset benchmark. Then, add your benchmark name in `metagpt/ext/aflow/scripts/evaluator.py` and `metagpt/ext/aflow/scripts/optimizer.py` to find effective workflows for your custom dataset. ## Quick Start 1. 
Configure your search in `optimize.py`: - - Open `metagpt/ext/aflow/scripts/optimize.py` + - Open `examples/aflow/optimize.py` - Set the following parameters: ```python dataset = "HumanEval" # Choose from: "HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP" or your custom dataset name question_type = "code" # Choose from: "math", "code", "qa" - sample = 5 # Number of samples to use for optimization + sample = 4 # Number of samples to use for optimization check_convergence = True # Whether to check for convergence - optimized_path = "path/to/optimized/workflows" # Path to save optimized workflows + optimized_path = "path/to/optimized/workflows" # Path to save optimized workflows, defaults to metagpt/ext/aflow/scripts/optimized initial_round = 1 # Starting round number max_rounds = 20 # Maximum number of optimization rounds ``` - Adjust these parameters according to your specific requirements and dataset -2. Set up parameters in `config/config2.yaml` (see `metagpt/ext/aflow/config2.example.yaml` for reference) -3. Set the operator you want to use in `optimize.py` and in `xxxx` -4. Download the init round of six datasets and put them in `xxxxxx` -5. Add your custom dataset and corresponding evaluation function: - -- Create a new Python file in the `metagpt/ext/aflow/benchmark/` directory, named `{custom_dataset_name}.py` -- Implement the following key functions in this new file: - - `load_data`: for loading the dataset - - `evaluate_problem`: for evaluating a single problem solution - - `evaluate_all_problems`: for evaluating all problems - - `save_results_to_csv`: for saving evaluation results - - `optimize_{custom_dataset_name}_evaluation`: main evaluation function that integrates the above functionalities -- Add your custom dataset name and config val_list in `metagpt/ext/aflow/scripts/evaluator.py` - - -## License - -[License information] +2. Set up parameters in `config/config2.yaml` (see `examples/aflow/config2.example.yaml` for reference) +3. 
Set the operator you want to use in `optimize.py` and in `optimized_path/template/operator.py`, `optimized_path/template/operator.json`. You can reference our implementation to add operators for specific datasets +4. When you first run, you can download the datasets and initial rounds by setting `download(["datasets", "initial_rounds"])` in `examples/aflow/optimize.py` +5. (Optional) Add your custom dataset and corresponding evaluation function following the [Custom Datasets](#custom-datasets) section +6. Run `python examples/aflow/optimize.py` to start the optimization process! ## Citation diff --git a/metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py b/metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py index 536a703dc..0f990356c 100644 --- a/metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py +++ b/metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py @@ -17,26 +17,26 @@ class ConvergenceUtils: def load_data(self, root_path): """ - 读取 JSON 文件,如果不存在则创建一个新文件,然后返回数据。 + Read JSON file, create a new file if it doesn't exist, then return the data. """ rounds_dir = os.path.join(root_path, "workflows") result_file = os.path.join(rounds_dir, "results.json") - # 确保目录存在 + # Ensure directory exists os.makedirs(rounds_dir, exist_ok=True) - # 如果文件不存在,创建一个包含空列表的新文件 + # If file doesn't exist, create a new one with an empty list if not os.path.exists(result_file): with open(result_file, 'w') as file: json.dump([], file) - # 读取文件并返回数据 + # Read file and return data with open(result_file, 'r') as file: return json.load(file) def process_rounds(self): """ - 以 round 为单位组织数据,返回按轮次的分数字典。 + Organize data by round, return a dictionary of scores by round. 
""" self.data = self.load_data(root_path=self.root_path) rounds = {} @@ -50,7 +50,7 @@ class ConvergenceUtils: def calculate_avg_and_std(self): """ - 计算每轮的平均分和标准差,返回两个列表:平均分和标准差。 + Calculate average score and standard deviation for each round, return two lists: average scores and standard deviations. """ self.rounds = self.process_rounds() @@ -64,61 +64,66 @@ class ConvergenceUtils: def check_convergence(self, top_k=3, z=0, consecutive_rounds=5): """ - 检查收敛的函数。z 为置信水平对应的 z 分数 。 - consecutive_rounds 为连续轮次内满足停止条件的次数。 + Check for convergence. z is the z-score corresponding to the confidence level. + consecutive_rounds is the number of consecutive rounds that must meet the stop condition. """ + # Calculate average score and standard deviation for each round self.avg_scores, self.stds = self.calculate_avg_and_std() - + # If total rounds are not enough to calculate top_k+1 rounds, return not converged if len(self.avg_scores) < top_k + 1: return False, None, None - - convergence_count = 0 - previous_Y = None - sigma_Y_previous = None - + convergence_count = 0 # Convergence counter + previous_Y = None # Y value of the previous round (average of top_k scores) + sigma_Y_previous = None # Standard error of Y value from previous round for i in range(len(self.avg_scores)): - # 动态选择当前轮次及之前所有轮次的 top_k - top_k_indices = np.argsort(self.avg_scores[:i + 1])[::-1][:top_k] - top_k_scores = [self.avg_scores[j] for j in top_k_indices] - top_k_stds = [self.stds[j] for j in top_k_indices] - + # Dynamically select top_k from current round and all previous rounds + top_k_indices = np.argsort(self.avg_scores[:i + 1])[::-1][:top_k] # Select top k indices by descending average score + top_k_scores = [self.avg_scores[j] for j in top_k_indices] # Get list of top k scores + top_k_stds = [self.stds[j] for j in top_k_indices] # Get list of standard deviations corresponding to top k scores + # Calculate mean of top k scores for current round, i.e., Y_current Y_current = np.mean(top_k_scores) + # 
Calculate standard error of Y_current (sigma_Y_current), representing score dispersion sigma_Y_current = np.sqrt(np.sum([s ** 2 for s in top_k_stds]) / (top_k ** 2)) - + # If not the first round, calculate change in Y (Delta_Y) and corresponding standard error if previous_Y is not None: + # Calculate Y difference between current round and previous round Delta_Y = Y_current - previous_Y + # Calculate standard error of Y difference (sigma_Delta_Y) sigma_Delta_Y = np.sqrt(sigma_Y_current ** 2 + sigma_Y_previous ** 2) - + # Check if Y change is within acceptable confidence interval, i.e., convergence condition if abs(Delta_Y) <= z * sigma_Delta_Y: convergence_count += 1 + # If consecutive converged rounds reach set value, return convergence information if convergence_count >= consecutive_rounds: return True, i - consecutive_rounds + 1, i else: + # If change is large, reset convergence counter convergence_count = 0 - + # Update Y value and standard error for previous round previous_Y = Y_current sigma_Y_previous = sigma_Y_current - + # If convergence condition not met, return not converged return False, None, None + def print_results(self): """ - 打印所有轮次的平均分和标准差。 + Print average score and standard deviation for all rounds. 
""" self.avg_scores, self.stds = self.calculate_avg_and_std() for i, (avg_score, std) in enumerate(zip(self.avg_scores, self.stds), 1): - logger.info(f"轮次 {i}: 平均分 = {avg_score:.4f}, 标准差 = {std:.4f}") + logger.info(f"Round {i}: Average Score = {avg_score:.4f}, Standard Deviation = {std:.4f}") if __name__ == "__main__": - # 使用该类,并指定 top_k - checker = ConvergenceUtils("path") # 例如设置 top_k=5 + # Use this class and specify top_k + checker = ConvergenceUtils("path") # For example, set top_k=5 converged, convergence_round, final_round = checker.check_convergence() if converged: - logger.info(f"检测到收敛,发生在第 {convergence_round} 轮,最终轮次为 {final_round} 轮") + logger.info(f"Convergence detected, occurred at round {convergence_round}, final round is {final_round}") else: - logger.info("在所有轮次内未检测到收敛") + logger.info("No convergence detected within all rounds") - # 打印每轮的平均分和标准差 + # Print average score and standard deviation for each round checker.print_results()