diff --git a/examples/aflow/README.md b/examples/aflow/README.md index dd94fdd31..4842b2044 100644 --- a/examples/aflow/README.md +++ b/examples/aflow/README.md @@ -38,14 +38,15 @@ ## Quick Start - Open `examples/aflow/optimize.py` - Set the following parameters: ```python - dataset = "HumanEval" # Choose from: "HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP" or your custom dataset name - question_type = "code" # Choose from: "math", "code", "qa" - sample = 4 # Number of samples to use for optimization - check_convergence = True # Whether to check for convergence - optimized_path = "path/to/optimized/workflows" # Path to save optimized workflows, defaults to metagpt/ext/aflow/scripts/optimized - initial_round = 1 # Starting round number - max_rounds = 20 # Maximum number of optimization rounds - validation_rounds = 5 # The validation rounds of AFLOW. + dataset: DatasetType = "MATH" # Ensure the type is consistent with DatasetType + sample: int = 4 # Sample Count, which means how many workflows will be resampled from generated workflows + question_type: QuestionType = "math" # Ensure the type is consistent with QuestionType + optimized_path: str = "metagpt/ext/aflow/scripts/optimized" # Optimized Result Save Path + initial_round: int = 1 # Corrected the case from Initial_round to initial_round + max_rounds: int = 20 # The max iteration of AFLOW. + check_convergence: bool = True # Whether Early Stop + validation_rounds: int = 5 # The validation rounds of AFLOW. + if_fisrt_optimize = True # You should change it to False after the first optimize. ``` - Adjust these parameters according to your specific requirements and dataset 2. 
Set up parameters in `config/config2.yaml` (see `examples/aflow/config2.example.yaml` for reference) @@ -66,10 +67,13 @@ ## Citation If you use AFlow in your research, please cite our paper: ``` -@article{zhang2024aflow, - title={AFlow: Automating Agentic Workflow Generation}, - author={Zhang, Jiayi and Xiang, Jinyu and Yu, Zhaoyang and Teng, Fengwei and Chen, Xionghui and Chen, Jiaqi and Zhuge, Mingchen and Cheng, Xin and Hong, Sirui and Wang, Jinlin and others}, - journal={arXiv preprint arXiv:2410.10762}, - year={2024} +@misc{zhang2024aflow, + title={AFlow: Automating Agentic Workflow Generation}, + author={Jiayi Zhang and Jinyu Xiang and Zhaoyang Yu and Fengwei Teng and Xionghui Chen and Jiaqi Chen and Mingchen Zhuge and Xin Cheng and Sirui Hong and Jinlin Wang and Bingnan Zheng and Bang Liu and Yuyu Luo and Chenglin Wu}, + year={2024}, + eprint={2410.10762}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + url={https://arxiv.org/abs/2410.10762}, } ``` \ No newline at end of file diff --git a/examples/aflow/optimize.py b/examples/aflow/optimize.py index fcc892ee4..dafa5fffd 100644 --- a/examples/aflow/optimize.py +++ b/examples/aflow/optimize.py @@ -3,6 +3,18 @@ # @Author : didi # @Desc : Entrance of AFlow. +import os +import sys + + +def setup_environment(): + current_path = os.path.abspath(__file__) + root_path = os.path.dirname(os.path.dirname(os.path.dirname(current_path))) + sys.path.insert(0, root_path) + os.chdir(root_path) + + +setup_environment() from metagpt.configs.models_config import ModelsConfig from metagpt.ext.aflow.data.download_data import download @@ -13,9 +25,6 @@ from metagpt.ext.aflow.scripts.optimizer import DatasetType, Optimizer, Question # QuestionType = Literal["math", "code", "qa"] # OptimizerType = Literal["Graph", "Test"] -# When you fisrt use, please download the datasets and initial rounds; If you want to get a look of the results, please download the results. 
-download(["datasets", "initial_rounds"]) - # Crucial Parameters dataset: DatasetType = "MATH" # Ensure the type is consistent with DatasetType sample: int = 4 # Sample Count, which means how many workflows will be resampled from generated workflows @@ -25,6 +34,7 @@ initial_round: int = 1 # Corrected the case from Initial_round to initial_round max_rounds: int = 20 # The max iteration of AFLOW. check_convergence: bool = True # Whether Early Stop validation_rounds: int = 5 # The validation rounds of AFLOW. +if_fisrt_optimize = True # You should change it to False after the first optimization run. # Config llm model, you can modify `config/config2.yaml` to use more llms. mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") @@ -56,6 +66,8 @@ optimizer = Optimizer( ) if __name__ == "__main__": + # When you first use this script, please download the datasets and initial rounds; if you want to take a look at the results, please download the results. + download(["datasets", "initial_rounds"], if_first_download=if_fisrt_optimize) # Optimize workflow via setting the optimizer's mode to 'Graph' optimizer.optimize("Graph") # Test workflow via setting the optimizer's mode to 'Test' diff --git a/metagpt/ext/aflow/benchmark/benchmark.py b/metagpt/ext/aflow/benchmark/benchmark.py index 7412334aa..abdf546f5 100644 --- a/metagpt/ext/aflow/benchmark/benchmark.py +++ b/metagpt/ext/aflow/benchmark/benchmark.py @@ -23,11 +23,9 @@ class BaseBenchmark(ABC): async with aiofiles.open(self.file_path, mode="r", encoding="utf-8") as file: async for line in file: data.append(json.loads(line)) - if specific_indices is not None: filtered_data = [data[i] for i in specific_indices if i < len(data)] return filtered_data - return data def save_results_to_csv(self, results: List[Tuple[Any, ...]], columns: List[str]): @@ -35,26 +33,29 @@ class BaseBenchmark(ABC): avg_score = df["score"].mean() t_cost = df["cost"].max() a_cost = t_cost / len(df) if len(df) > 0 else 0 - current_time = 
datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"{avg_score:.5f}_{current_time}.csv" output_file = os.path.join(self.log_path, filename) - df.to_csv(output_file, index=False) logger.info(f"Results saved to {output_file}") - return avg_score, a_cost, t_cost - def log_mismatch(self, problem: str, expected_output: Any, prediction: str, extracted_output: Any): + def log_mismatch( + self, + problem: str, + expected_output: Any, + prediction: str, + extracted_output: Any, + extract_answer_code: str = "None", + ): log_data = { "question": problem, "right_answer": expected_output, "model_output": prediction, "extracted_output": extracted_output, + "extract_answer_code": extract_answer_code, } - log_file = os.path.join(self.log_path, "log.json") - if os.path.exists(log_file): with open(log_file, "r", encoding="utf-8") as f: try: @@ -63,9 +64,7 @@ class BaseBenchmark(ABC): data = [] else: data = [] - data.append(log_data) - with open(log_file, "w", encoding="utf-8") as f: json.dump(data, f, indent=4, ensure_ascii=False) @@ -89,7 +88,6 @@ class BaseBenchmark(ABC): return await self.evaluate_problem(problem, graph) tasks = [sem_evaluate(problem) for problem in data] - return await tqdm_asyncio.gather(*tasks, desc=f"Evaluating {self.name} problems", total=len(data)) async def run_evaluation(self, graph: Callable, va_list: List[int], max_concurrent_tasks: int = 50): diff --git a/metagpt/ext/aflow/benchmark/math.py b/metagpt/ext/aflow/benchmark/math.py index 61d994b69..07b0612d0 100644 --- a/metagpt/ext/aflow/benchmark/math.py +++ b/metagpt/ext/aflow/benchmark/math.py @@ -1,3 +1,4 @@ +import inspect import re from math import isclose from typing import Any, Callable, List, Tuple @@ -98,6 +99,13 @@ class MATHBenchmark(BaseBenchmark): pass return False + def get_function_code(self, func): + try: + source_code = inspect.getsource(func) + return source_code + except OSError: + return "no code" + @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), 
retry=retry_if_exception_type(Exception), reraise=True) async def _generate_output(self, graph, input_text): return await graph(input_text) @@ -111,7 +119,13 @@ class MATHBenchmark(BaseBenchmark): uni_score, extracted_output = self.calculate_score(expected_output, output) if uni_score == 0: - self.log_mismatch(input_text, expected_output, output, extracted_output) + self.log_mismatch( + input_text, + expected_output, + output, + extracted_output, + extract_answer_code=self.get_function_code(self.extract_model_answer), + ) return input_text, output, expected_output, uni_score, cost diff --git a/metagpt/ext/aflow/data/download_data.py b/metagpt/ext/aflow/data/download_data.py index 1ecd1179c..a3aa2774c 100644 --- a/metagpt/ext/aflow/data/download_data.py +++ b/metagpt/ext/aflow/data/download_data.py @@ -68,21 +68,12 @@ datasets_to_download: Dict[str, Dict[str, str]] = { } -def is_directory_empty(path: str) -> bool: - """Check if the directory is empty""" - return len(os.listdir(path)) == 0 - - -def download(datasets): +def download(required_datasets, if_first_download: bool = True): """Main function to process all selected datasets""" - for dataset_name in datasets: - dataset = datasets_to_download[dataset_name] - extract_path = dataset["extract_path"] - - if os.path.exists(extract_path) and not is_directory_empty(extract_path): - logger.info( - f"Target folder {extract_path} for {dataset_name} is not empty, skipping download and extraction." 
- ) - continue - - process_dataset(dataset["url"], dataset["filename"], extract_path) + if if_first_download: + for dataset_name in required_datasets: + dataset = datasets_to_download[dataset_name] + extract_path = dataset["extract_path"] + process_dataset(dataset["url"], dataset["filename"], extract_path) + else: + logger.info("Skip downloading datasets") diff --git a/metagpt/ext/aflow/scripts/optimized/optimized.zip b/metagpt/ext/aflow/scripts/optimized/optimized.zip deleted file mode 100644 index 5de192467..000000000 Binary files a/metagpt/ext/aflow/scripts/optimized/optimized.zip and /dev/null differ