fix conflicts

2026-07-02 16:01:04 +02:00 · 2025-02-26 22:20:14 +08:00 · 2025-02-26 22:20:14 +08:00 · 77703f1236
commit 77703f1236
parent 95f05c6861 07d4be2df3
347 changed files with 21628 additions and 1350 deletions
--- a/examples/di/InfiAgent-DABench/DABench.py
+++ b/examples/di/InfiAgent-DABench/DABench.py
@ -0,0 +1,487 @@
+import asyncio
+import json
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Tuple, Union
+
+import nest_asyncio
+
+from examples.di.requirements_prompt import DABENCH
+from metagpt.const import DABENCH_PATH
+from metagpt.logs import logger
+from metagpt.utils.exceptions import handle_exception
+
+
+def evaluate_accuracy_by_question(results: List[Dict[str, Any]]) -> float:
+    """
+    Calculate the fraction of questions whose sub-questions are all answered correctly.
+    This function is referenced from https://github.com/InfiAgent/InfiAgent/blob/main/examples/DA-Agent/eval_closed_form.py
+
+    A result counts as correct only when it has a non-empty 'correctness' mapping whose
+    values are all truthy. A missing or empty mapping counts as incorrect: `all()` over
+    an empty mapping is True, which would otherwise score a question with no evaluated
+    sub-questions (e.g. a failed evaluation) as fully correct.
+
+    Args:
+        results (List[Dict[str, Any]]): A collection of results where each result may
+            contain a 'correctness' mapping of sub-question name to bool.
+
+    Returns:
+        float: The proportion of correct results, rounded to four decimal places.
+               Returns 0 if there are no results.
+    """
+    total = len(results)
+    if total == 0:
+        return 0
+
+    correct = 0
+    for result in results:
+        correctness = result.get("correctness")
+        # Require at least one sub-question so that an empty mapping is not vacuously correct
+        if correctness and all(correctness.values()):
+            correct += 1
+    return round(correct / total, 4)
+
+
+def evaluate_accuracy_by_sub_question(results: dict) -> float:
+    """
+    Compute the share of sub-questions answered correctly over all results.
+    This function is referenced from https://github.com/InfiAgent/InfiAgent/blob/main/examples/DA-Agent/eval_closed_form.py
+
+    Results without a 'correctness' field are ignored. For the remaining results the
+    truthy values of 'correctness' are counted as correct sub-questions and the number
+    of entries as the total number of sub-questions.
+
+    Args:
+        results (dict): A collection of results where each result may contain a 'correctness' field.
+
+    Returns:
+        float: Correct sub-questions divided by all sub-questions, rounded to four
+               decimal places. Returns 0 if there are no sub-questions.
+    """
+    # First pass: number of correctly answered sub-questions
+    answered = 0
+    for entry in results:
+        if "correctness" in entry:
+            answered += sum(entry["correctness"].values())
+
+    # Second pass: number of sub-questions that were asked
+    asked = 0
+    for entry in results:
+        if "correctness" in entry:
+            asked += len(entry["correctness"])
+
+    if asked > 0:
+        return round(answered / asked, 4)
+    return 0
+
+
+def evaluate_accuracy_proportional_by_sub_question_adjusted(results: dict) -> float:
+    """
+    Average per-question score where every sub-question carries an equal share of its question.
+    This function is referenced from https://github.com/InfiAgent/InfiAgent/blob/main/examples/DA-Agent/eval_closed_form.py
+
+    Within one result each sub-question is worth 1 / (number of sub-questions); the
+    result's score is the number of correct sub-questions times that weight. Results
+    without a 'correctness' field contribute nothing to the sum but still count in the
+    divisor, which is the number of results.
+
+    Args:
+        results (dict): A collection of results where each result may contain a 'correctness' field.
+
+    Returns:
+        float: The average score across all results, rounded to four decimal places.
+               Returns 0 if there are no results.
+    """
+    if not results:
+        return 0
+
+    accumulated = 0
+    for entry in results:
+        if "correctness" not in entry:
+            continue
+        flags = entry["correctness"]
+        sub_total = len(flags)
+        weight = 1 / sub_total if sub_total > 0 else 0
+        accumulated += sum(flags.values()) * weight
+    return round(accumulated / len(results), 4)
+
+
+async def reformat(question: str, format: str, response: str) -> str:
+    """
+    Ask the LLM to rewrite a response so that it follows the required answer format.
+    This function is referenced from https://github.com/InfiAgent/InfiAgent/blob/main/examples/DA-Agent/reformat.py
+
+    The conversation sent to the LLM consists of the original question, the original
+    response, and a final user message containing two worked examples plus the format
+    requirements of this question.
+
+    Args:
+        question (str): The original question posed by the user.
+        format (str): The specific formatting requirements that the response must adhere to.
+        response (str): The initial response from the LLM that needs to be reformatted.
+
+    Returns:
+        str: The reformatted response generated by the LLM based on the provided question
+             and formatting requirements.
+    """
+    system_prompt = "You are a helpful assistant."
+    # The backslashes are escaped explicitly: "\F" and "\A" are not valid escape sequences,
+    # so writing them unescaped triggers an invalid-escape warning on recent Python versions.
+    # The examples are passed to str.format() as an argument (not as the template), so the
+    # doubled braces reach the LLM unchanged.
+    demons = """\\Format{{
+        @shapiro_wilk_statistic[test_statistic]
+        @shapiro_wilk_p_value[p_value]
+        where "test_statistic" is a number between 0 and 1 representing the Shapiro-Wilk test statistic. Rounding off the answer to two decimal places.
+        where "p_value" is a number between 0 and 1 representing the p-value from the Shapiro-Wilk test. Rounding off the answer to four decimal places.
+        }}
+        \\Answer{{
+        @shapiro_wilk_statistic[0.56]
+        @shapiro_wilk_p_value[0.0002]   
+        }}
+
+        \\Format{{
+        @total_votes_outliers_num[outlier_num]
+        where "outlier_num" is an integer representing the number of values considered outliers in the 'total_votes' column.
+        }}
+        \\Answer{{
+        @total_votes_outliers[10]   
+        }}
+        """
+    reformat_template = """You should strictly follow the output requirements in the Format part. Here're some examples: {demons}. 
+    Your answer should contain all the \"@answer_name[answer]\" in the order mentioned, each \"answer\" should be in the range of value as required. You need to keep the original numbers and text, just reformat without making any changes.
+    The format requirements of this question is:
+    {format}. You need to keep the original numbers and text, just reformat without making any changes. Please give your answer:"""
+    messages = [
+        {"role": "user", "content": question},
+        {"role": "assistant", "content": response},
+        {"role": "user", "content": reformat_template.format(demons=demons, format=format)},
+    ]
+    return await ask(messages, system_prompt)
+
+
+def load_jsonl(file_path: Union[Path, str]) -> List[Dict[str, Any]]:
+    """
+    Load data from a JSONL file into a list of dictionaries.
+
+    Lines that are empty or contain only whitespace (for example a trailing newline at
+    the end of the file) are skipped instead of being handed to the JSON parser.
+
+    Args:
+        file_path (Union[Path, str]): The path to the JSONL file to be loaded.
+
+    Returns:
+        List[Dict[str, Any]]: One parsed JSON value per non-blank line, in file order.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    records = []
+    # Path() accepts both str and Path, so no type check is needed
+    with Path(file_path).open("r", encoding="utf-8") as file:
+        for line in file:
+            line = line.strip()
+            if not line:
+                continue  # Blank lines carry no record
+            records.append(json.loads(line))
+    return records
+
+
+def compare_predictions(pred_dict: dict, true_label: list) -> bool:
+    """
+    Check that every true metric is present in the predictions with a matching value.
+
+    A true value that converts to float is compared numerically with an absolute
+    tolerance of 1e-6; the prediction must then be a number as well. Any other true
+    value has its commas removed and is compared with the string form of the prediction,
+    ignoring case.
+
+    Args:
+        pred_dict (dict): A dictionary of predicted metrics and their values.
+        true_label (list): A list of (metric, value) pairs containing the true values.
+
+    Returns:
+        bool: True if all predictions match the true labels, False otherwise.
+    """
+    for metric, true_value in true_label:
+        if metric not in pred_dict:
+            return False  # A missing metric can never match
+        predicted = pred_dict[metric]
+
+        try:
+            expected = float(true_value)  # Attempt to treat the true value as a number
+        except ValueError:
+            # Non-numeric label: compare as case-insensitive text without commas
+            if str(predicted).lower() != str(true_value.replace(",", "")).lower():
+                return False
+            continue
+
+        # Numeric label: a textual prediction cannot match (and must not be subtracted)
+        if not isinstance(predicted, (int, float)):
+            return False
+        # Written as "not within tolerance" so that a NaN prediction is rejected;
+        # the equality test keeps inf == inf a match even though inf - inf is NaN.
+        if not (predicted == expected or abs(predicted - expected) <= 1e-6):
+            return False
+
+    return True  # Every true metric was matched
+
+
+async def ask(question: str, system_prompt: str) -> str:
+    """
+    Send a question to the LLM (Large Language Model) and return its reply.
+
+    A new LLM instance is created on every call; the question is passed to its `aask`
+    method with the system prompt as the only system message.
+
+    Args:
+        question (str): The question to be asked to the LLM.
+        system_prompt (str): A prompt that provides context or instructions to the LLM.
+
+    Returns:
+        str: The response from the LLM based on the provided question and system prompt.
+    """
+    from metagpt.llm import LLM  # Imported at call time rather than at module import
+
+    return await LLM().aask(question, system_msgs=[system_prompt])
+
+
+def parse_prediction(prediction: str) -> dict:
+    """
+    Parses a prediction string into a dictionary of metric-value pairs.
+
+    The input is split on "@"; each segment is expected to look like "metric[value]".
+    Commas are removed from both metric and value, colons are removed from the value,
+    and surrounding whitespace is stripped. Segments that contain nothing but whitespace
+    or brackets are skipped. When a segment has no bracketed part, its text is used as
+    both metric and value.
+
+    Args:
+        prediction (str): A string representation of metrics and their values.
+
+    Returns:
+        dict: A dictionary where each key is a metric name and each value is the
+              corresponding value, either as a float or a string.
+    """
+    pred_dict = {}
+    for segment in prediction.split("@"):
+        # Split on brackets, drop commas, and discard pieces with no visible content
+        pieces = [piece.replace(",", "") for piece in re.split(r"[\[\]]", segment.strip())]
+        parts = [piece for piece in pieces if piece.strip()]
+        if not parts:
+            continue  # Nothing usable, e.g. the text before the first "@" or stray whitespace
+
+        metric = parts[0].strip()
+        value = parts[-1].replace(":", "").strip()
+
+        try:
+            value = float(value)  # Attempt to convert the value to a float
+        except ValueError:
+            pass  # If conversion fails, retain the value as a string
+
+        pred_dict[metric] = value
+    return pred_dict
+
+
+class DABench:
+    """
+    Loads the DABench questions and labels and scores predictions against them.
+
+    Questions and answers are read from JSONL files and indexed by the integer value of
+    each record's "id" field.
+    """
+
+    def __init__(
+        self,
+        questions_file: Path = Path(DABENCH_PATH) / "da-dev-questions.jsonl",
+        answers_file: Path = Path(DABENCH_PATH) / "da-dev-labels.jsonl",
+        template: str = "",
+    ):
+        """
+        Initializes the DABench instance with questions and answers.
+
+        Args:
+            questions_file (Path): The path to the JSONL file containing questions.
+            answers_file (Path): The path to the JSONL file containing answers.
+            template (str): A string template for formatting prompts. An empty string
+                selects the default DABENCH template.
+        """
+        self.questions = {int(line["id"]): line for line in load_jsonl(questions_file)}
+        self.answers = {int(line["id"]): line for line in load_jsonl(answers_file)}
+        self.template = template if template else DABENCH
+
+    @staticmethod
+    def _lookup(table: dict, key: Union[int, str], missing: str) -> Union[dict, str]:
+        """Return table[key], accepting the ID as an int or a numeric string; `missing` if absent."""
+        if isinstance(key, str):
+            # The tables are keyed by int, so a textual ID has to be converted first
+            try:
+                key = int(key)
+            except ValueError:
+                return missing
+        return table.get(key, missing)
+
+    def _common_answers(self, id: Union[int, str]) -> list:
+        """Return the "common_answers" of answer `id`; raise KeyError when the ID is unknown."""
+        answer = self.get_answer(id)
+        if not isinstance(answer, dict):
+            raise KeyError(f"Answer {id!r} not found.")
+        return answer["common_answers"]
+
+    @staticmethod
+    def _extract_result(result: str) -> str:
+        """
+        Pull the final task result out of an agent transcript.
+
+        The transcript is expected to hold a JSON list between the "Current Plan" and
+        "## Current Task" markers; the "result" of its last item is returned. When the
+        markers or the JSON are missing or malformed, the stripped input text is
+        returned so that a bare prediction string can be evaluated as well.
+        """
+        text = str(result)
+        if "Current Plan" in text:
+            try:
+                plan = json.loads(text.split("Current Plan")[1].split("## Current Task")[0])
+                return str(plan[-1]["result"]).strip()
+            except (ValueError, LookupError, TypeError):
+                pass  # Fall back to the raw text below
+        return text.strip()
+
+    def get_question(self, question_id: Union[int, str]) -> Union[dict, str]:
+        """
+        Retrieve the question associated with the given ID.
+
+        Args:
+            question_id (Union[int, str]): The unique identifier for the question.
+
+        Returns:
+            Union[dict, str]: The question data if found, otherwise the message
+                "Question not found.".
+        """
+        return self._lookup(self.questions, question_id, "Question not found.")
+
+    def generate_formatted_prompt(self, question_id: Union[int, str]) -> str:
+        """
+        Generate a formatted prompt for the specified question ID.
+
+        The template is filled with the question, constraints, format, level and the
+        path of the question's table below DABENCH_PATH/da-dev-tables.
+
+        Args:
+            question_id (Union[int, str]): The unique identifier for the question.
+
+        Returns:
+            str: A formatted prompt string based on the question data.
+
+        Raises:
+            KeyError: If no question with this ID exists.
+        """
+        temp = self.get_question(question_id)
+        if not isinstance(temp, dict):
+            # get_question returns a message string for unknown IDs; fail with a clear error
+            raise KeyError(f"Question {question_id!r} not found.")
+        return self.template.format(
+            question=temp["question"],
+            constraints=temp["constraints"],
+            format=temp["format"],
+            file_name=str(DABENCH_PATH) + "/da-dev-tables/" + temp["file_name"],
+            level=temp["level"],
+        )
+
+    def get_answer(self, answer_id: Union[int, str]) -> Union[dict, str]:
+        """
+        Retrieve the answer record associated with the given ID.
+
+        Args:
+            answer_id (Union[int, str]): The unique identifier for the answer.
+
+        Returns:
+            Union[dict, str]: The answer data if found, otherwise the message
+                "Answer not found.".
+        """
+        return self._lookup(self.answers, answer_id, "Answer not found.")
+
+    @handle_exception(exception_msg="Error parsing cleaned prediction", default_return=(None, False))
+    def parse_cleaned_prediction(self, cleaned_prediction: str, true_label: Any) -> Tuple[str, bool]:
+        """
+        Parse the cleaned prediction and compare it with the true label.
+
+        Args:
+            cleaned_prediction (str): The cleaned prediction string.
+            true_label (Any): The true label to compare against.
+
+        Returns:
+            Tuple[str, bool]: A tuple containing the cleaned prediction and a boolean indicating
+                              whether it matches the true label. The decorator is configured
+                              to return (None, False) if parsing or comparing fails.
+        """
+        if cleaned_prediction:  # An empty prediction can never match
+            pred_dict = parse_prediction(cleaned_prediction)
+            if compare_predictions(pred_dict, true_label):
+                return cleaned_prediction, True
+        return cleaned_prediction, False
+
+    # The fallback is an empty string (not a tuple) because callers parse the return value as text
+    @handle_exception(exception_msg="Error during async reformat", default_return="")
+    def async_reformat_prediction(self, id: Union[int, str], result: str) -> str:
+        """
+        Ask the LLM to reformat the prediction and extract the answer part.
+
+        Args:
+            id (Union[int, str]): The identifier for the question.
+            result (str): The original prediction result.
+
+        Returns:
+            str: The text between "Answer{{" and "}}" of the reformatted prediction, or the
+                 whole reformatted prediction if that marker is absent. The decorator is
+                 configured to return an empty string on error.
+        """
+        question_data = self.get_question(id)
+        prediction = asyncio.run(reformat(question_data["question"], question_data["format"], result))
+
+        # Attempt to extract the answer from the reformatted prediction
+        if "Answer{{" in prediction:
+            return prediction.split("Answer{{")[1].split("}}")[0].strip()
+        return prediction
+
+    def eval(self, id: Union[int, str], result: str) -> Tuple[str, bool]:
+        """
+        Evaluate the prediction against the true label.
+
+        The result is first compared as-is (after removing braces and single quotes).
+        If that does not match, the LLM is asked to reformat it and the reformatted
+        prediction is compared instead.
+
+        Args:
+            id (Union[int, str]): The identifier for the question.
+            result (str): The agent transcript, or a bare prediction string.
+
+        Returns:
+            Tuple[str, bool]: A tuple containing the final prediction and a boolean indicating
+                              whether it matches the true label.
+
+        Raises:
+            KeyError: If no answer with this ID exists.
+        """
+        true_label = self._common_answers(id)
+        nest_asyncio.apply()  # Allow asyncio.run() below even when an event loop is already running
+        result = self._extract_result(result)
+        if not result:
+            return result, False  # Nothing to evaluate; skip the LLM round trip
+
+        cleaned_prediction = result.replace("{", "").replace("}", "").replace("'", "")
+        parsed_result = self.parse_cleaned_prediction(cleaned_prediction, true_label)
+        if parsed_result[1]:
+            return parsed_result
+
+        # The direct comparison failed: let the LLM reformat the answer and try again.
+        # The guarded parser is reused so that an unparsable reformat counts as a mismatch.
+        prediction = self.async_reformat_prediction(id, result)
+        _, matched = self.parse_cleaned_prediction(prediction, true_label)
+        return prediction, matched
+
+    @handle_exception(exception_msg="Error evaluating single prediction", default_return={})
+    def single_eval(self, id: Union[int, str], prediction: str) -> dict:
+        """
+        Evaluate the prediction against the true label for a single question.
+        just using in eval_all
+
+        Args:
+            id (Union[int, str]): The identifier for the question.
+            prediction (str): The prediction string to evaluate. Anything that is not a
+                string (e.g. None for a failed run) is treated as an empty prediction.
+
+        Returns:
+            dict: A dictionary indicating the correctness of each metric. The decorator is
+                  configured to return an empty dictionary on error.
+        """
+        true_label = self._common_answers(id)
+        if not isinstance(prediction, str):
+            prediction = ""
+        prediction = prediction.replace("{", "").replace("}", "").replace("'", "")
+        pred_dict = parse_prediction(prediction)
+
+        # Every metric starts out as incorrect
+        correctness = {metric: False for metric, _ in true_label}
+
+        for metric, true_value in true_label:
+            if metric not in pred_dict:
+                continue
+            predicted = pred_dict[metric]
+
+            try:
+                true_value = float(true_value)  # Attempt to convert the true value to float
+            except ValueError:
+                true_value = true_value.replace(",", "")  # Handle non-numeric values
+
+            if isinstance(true_value, str):
+                # Text labels match case-insensitively
+                correctness[metric] = str(predicted).lower() == true_value.lower()
+            elif isinstance(predicted, (int, float)):
+                # Numeric labels match within a small tolerance
+                correctness[metric] = abs(predicted - true_value) < 1e-6
+
+        return correctness
+
+    def eval_all(self, id_list: list, predictions: list) -> dict:
+        """
+        Evaluate all predictions and calculate accuracy rates.
+
+        Args:
+            id_list (list): A list of question identifiers.
+            predictions (list): A list of prediction strings corresponding to the questions.
+
+        Returns:
+            dict: A dictionary containing accuracy rates by question and sub-question.
+        """
+        if len(id_list) != len(predictions):
+            # zip() below stops at the shorter list, so some entries would be dropped silently
+            logger.warning(
+                f"Got {len(id_list)} ids but {len(predictions)} predictions; extra entries are ignored."
+            )
+
+        results = []
+        for id, prediction in zip(id_list, predictions):
+            correctness = self.single_eval(id, prediction)
+            result = {"id": id}
+            # An empty dict means the evaluation failed; leaving the field out makes the
+            # accuracy functions treat the question as not answered correctly.
+            if correctness:
+                result["correctness"] = correctness
+            results.append(result)
+
+        return {
+            "accuracy_by_question": evaluate_accuracy_by_question(results),
+            "accuracy_by_sub_question": evaluate_accuracy_by_sub_question(results),
+            "proportional_accuracy_by_sub_question": evaluate_accuracy_proportional_by_sub_question_adjusted(
+                results
+            ),
+        }
+
+
+if __name__ == "__main__":
+    bench = DABench()
+
+    # DABench.eval reads the answer from an agent transcript: a JSON task list between the
+    # "Current Plan" and "## Current Task" markers whose last task holds the result.
+    # A bare "@metric[value]" string has no such markers, so the demo wraps it first.
+    question_id = 0
+    transcript = "Current Plan\n" + json.dumps([{"result": "@mean_fare[34.65]"}]) + "\n## Current Task"
+    logger.info(bench.eval(question_id, transcript))
+
+    # DABench.eval_all takes the bare prediction strings directly
+    ids = [0, 5, 6]
+    predictions = [
+        "@mean_fare[34.89]",
+        "@correlation_coefficient[0.21]",
+        "@mean_fare_child[31.09], @mean_fare_teenager[31.98], @mean_fare_adult[35.17], @mean_fare_elderly[43.47]",
+    ]
+    logger.info(bench.eval_all(ids, predictions))
--- a/examples/di/InfiAgent-DABench/README.md
+++ b/examples/di/InfiAgent-DABench/README.md
@ -0,0 +1,45 @@
+# InfiAgent-DABench
+This example solves the InfiAgent-DABench benchmark with the Data Interpreter (DI); it reaches 94.93% accuracy with gpt-4o.
+
+## Dataset download
+```
+cd examples/di/InfiAgent-DABench
+git clone https://github.com/InfiAgent/InfiAgent.git
+mv InfiAgent/examples/DA-Agent/data ./
+```
+## Special note:
+When running the DABench tests, change the `__init__` of `ExecuteNbCode` to the following:
+```
+class ExecuteNbCode(Action):
+    """execute notebook code block, return result to llm, and display it."""
+
+    nb: NotebookNode
+    nb_client: NotebookClient
+    console: Console
+    interaction: str
+    timeout: int = 600
+
+    def __init__(
+        self,
+        nb=nbformat.v4.new_notebook(),
+        timeout=600,
+    ):
+        super().__init__(
+            nb=nbformat.v4.new_notebook(),#nb,
+            nb_client=NotebookClient(nb, timeout=timeout),
+            timeout=timeout,
+            console=Console(),
+            interaction=("ipython" if self.is_ipython() else "terminal"),
+        )
+```
+The path of ExecuteNbCode() is: 
+```
+metagpt.actions.di.execute_nb_code
+```
+This replaces the default `nb` initialization: a fresh notebook is passed to `super().__init__()` instead of the `nb` argument.
+## How to run
+```
+python run_InfiAgent-DABench_single.py --id x   # run a task, x represents the id of the question you want to test
+python run_InfiAgent-DABench_all.py    # Run all tasks serially
+python run_InfiAgent-DABench.py --k x    # Run all tasks in parallel, x represents the number of parallel tasks at a time
+```
--- a/examples/di/InfiAgent-DABench/run_InfiAgent-DABench.py
+++ b/examples/di/InfiAgent-DABench/run_InfiAgent-DABench.py
@ -0,0 +1,77 @@
+import asyncio
+import json
+
+from DABench import DABench
+
+from metagpt.logs import logger
+from metagpt.roles.di.data_interpreter import DataInterpreter
+
+
+async def get_prediction(agent, requirement):
+    """Run the given agent on one requirement and extract its prediction.
+
+    The agent passed in is used as-is; no new instance is created here. The prediction
+    is read from the JSON list found between the "Current Plan" and "## Current Task"
+    markers of the agent's result: it is the "result" of the last item.
+
+    Args:
+        agent: The agent instance used to generate predictions.
+        requirement: The input requirement for which the prediction is to be made.
+
+    Returns:
+        The predicted result if successful, otherwise None.
+    """
+    try:
+        result = await agent.run(requirement)
+
+        # Cut the JSON plan out of the textual result and take the last task's result
+        plan_text = str(result).split("Current Plan")[1].split("## Current Task")[0]
+        return json.loads(plan_text)[-1]["result"]
+    except Exception as e:
+        # Best effort: one failed task must not abort the whole run, but the failure is
+        # reported at error level so that it is not lost among informational output
+        logger.error(f"Error processing requirement: {requirement}. Error: {e}")
+        return None
+
+
+async def evaluate_all(agent, k):
+    """Evaluate all tasks in DABench using the specified baseline agent.
+
+    Tasks are divided into groups of size k; the tasks of one group are awaited together
+    before the next group starts. A task that fails contributes an empty prediction, so
+    that every prediction stays paired with the ID of its own question.
+
+    Args:
+        agent: The baseline agent used for making predictions.
+        k (int): The number of tasks to process in each group concurrently.
+    """
+    bench = DABench()
+    id_list = list(bench.answers)
+
+    # One pending prediction per question, in the same order as id_list
+    # NOTE(review): all tasks call `agent.run` on the same agent instance; confirm that the
+    # agent tolerates concurrent runs, or create one agent per task.
+    tasks = [get_prediction(agent, bench.generate_formatted_prompt(key)) for key in id_list]
+
+    predictions = []
+    for start in range(0, len(tasks), k):
+        group_predictions = await asyncio.gather(*tasks[start : start + k])
+        # Failed tasks return None. Dropping them would shift every later prediction onto
+        # the wrong question ID, so they are kept as empty predictions instead.
+        predictions.extend("" if pred is None else pred for pred in group_predictions)
+
+    logger.info(bench.eval_all(id_list, predictions))
+
+
+def main(k=5):
+    """Run the parallel DABench evaluation.
+
+    Args:
+        k (int): The number of tasks to process concurrently in each group.
+    """
+    agent = DataInterpreter()
+    asyncio.run(evaluate_all(agent, k))
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # Parse the documented "--k x" option; without this the value given on the command
+    # line is never passed to main().
+    parser = argparse.ArgumentParser(description="Run all DABench tasks in parallel groups.")
+    parser.add_argument("--k", type=int, default=5, help="number of tasks to run concurrently per group")
+    main(k=parser.parse_args().k)
--- a/examples/di/InfiAgent-DABench/run_InfiAgent-DABench_all.py
+++ b/examples/di/InfiAgent-DABench/run_InfiAgent-DABench_all.py
@ -0,0 +1,35 @@
+import fire
+import pandas as pd
+from DABench import DABench
+
+from metagpt.logs import logger
+from metagpt.roles.di.data_interpreter import DataInterpreter
+from metagpt.utils.recovery_util import save_history
+
+
+async def main():
+    """Evaluate all DABench questions one after another.
+
+    For every question a new DataInterpreter is run and its result is scored. Labels,
+    predictions and the match flags are written to "DABench_output.xlsx", and the
+    overall accuracy rates are logged. A question whose run or scoring fails is recorded
+    with an empty prediction and a "False" flag.
+    """
+    bench = DABench()
+    id_list, predictions, labels, is_true = [], [], [], []
+    for key in bench.answers:
+        id_list.append(key)
+        labels.append(str(bench.get_answer(key)))
+        try:
+            requirement = bench.generate_formatted_prompt(key)
+            di = DataInterpreter()
+            result = await di.run(requirement)
+            logger.info(result)
+            save_history(role=di)
+            temp_prediction, temp_istrue = bench.eval(key, str(result))
+        except Exception as e:
+            # Keep going with the remaining questions, but record why this one failed.
+            # Exception (not a bare except) so that Ctrl-C still stops the run.
+            logger.error(f"Error evaluating question {key}: {e}")
+            temp_prediction, temp_istrue = "", False
+        is_true.append(str(temp_istrue))
+        predictions.append(str(temp_prediction))
+    df = pd.DataFrame({"Label": labels, "Prediction": predictions, "T/F": is_true})
+    df.to_excel("DABench_output.xlsx", index=False)
+    logger.info(bench.eval_all(id_list, predictions))
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
--- a/examples/di/InfiAgent-DABench/run_InfiAgent-DABench_single.py
+++ b/examples/di/InfiAgent-DABench/run_InfiAgent-DABench_single.py
@ -0,0 +1,22 @@
+import fire
+from DABench import DABench
+
+from metagpt.logs import logger
+from metagpt.roles.di.data_interpreter import DataInterpreter
+from metagpt.utils.recovery_util import save_history
+
+
+async def main(id=0):
+    """Run the Data Interpreter on one DABench question and log whether its answer is correct.
+
+    Args:
+        id: The identifier of the question to run.
+    """
+    benchmark = DABench()
+    prompt = benchmark.generate_formatted_prompt(id)
+
+    interpreter = DataInterpreter()
+    output = await interpreter.run(prompt)
+    logger.info(output)
+    save_history(role=interpreter)
+
+    _, matched = benchmark.eval(id, str(output))
+    verdict = "correct" if matched else "incorrect"
+    logger.info(f"Prediction is {verdict}.")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
--- a/examples/di/README.md
+++ b/examples/di/README.md
@ -12,9 +12,97 @@ ## Example List
 - Tool usage: web page imitation
 - Tool usage: web crawling
 - Tool usage: text2image
- Tool usage: email summarization and response\
+- Tool usage: email summarization and response
 - More on the way!

 Please see the [docs](https://docs.deepwisdom.ai/main/en/guide/use_cases/agent/interpreter/intro.html) for more explanation.

-We are continuously releasing codes, stay tuned!
+## Experiments in the Paper
+
+Before running the experiments, download the [di_dataset](https://drive.google.com/drive/folders/17SpI9WL9kzd260q2DArbXKNcqhidjA7s?usp=sharing) and place it in the specified path (default DATA_PATH, where DATA_PATH = METAGPT_ROOT / "data").
+
+To reproduce the results in the paper, run the following commands:
+
+```
+python run_ml_benchmark.py --task_name 04_titanic
+```
+```
+python run_open_ended_tasks.py --task_name 14_image_background_removal --data_dir directory_to_di_dataset --use_reflection True
+```
+
+The `run_ml_benchmark.py` and `run_open_ended_tasks.py` scripts implement the pipeline of the Data Interpreter.
+
+Some key arguments:
+
+- `--task_name`: required, specifies the task to run. e.g., 04_titanic and 14_image_background_removal. Refer to the table below for available task names.
+- `--data_dir`: optional, the directory that stores the `di_dataset` (default is `DATA_PATH`).
+- `--use_reflection`: optional, the flag to use reflection or not (default is True).
+
+### Data Interpreter Dataset Structure
+
+di_dataset
+
+- ml_benchmark
+    - 04_titanic
+    - 05_house-prices-advanced-regression-techniques
+    - 06_santander-customer-transaction-prediction
+    - 07_icr-identify-age-related-conditions
+    - 08_santander-value-prediction-challenge
+- open_ended_tasks
+    - 01_ocr
+    - 02_ocr
+    - 03_ocr
+    - 14_image_background_removal
+    - 16_image_2_code_generation
+    - 17_image_2_code_generation
+
+### ML-Benchmark Dataset and Requirements
+
+ML-Benchmark contains 8 typical machine learning datasets.
+
+| ID | Task Name             | Dataset Name       | User Requirement                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+|----|-----------------------|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| 01 | 01_iris               | Iris               | Run data analysis on sklearn Iris dataset, include a plot                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| 02 | 02_wines_recognition  | Wine recognition   | Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class with 20% as test set, and show prediction accuracy                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| 03 | 03_breast_cancer      | Breast Cancer      | Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| 04 | 04_titanic            | Titanic            | This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: '{data_dir}/ml_benchmark/04_titanic/split_train.csv', eval data path: '{data_dir}/ml_benchmark/04_titanic/split_eval.csv'.                                                                                                                                                                               |
+| 05 | 05_house_prices       | House Prices       | This is a house price dataset, your goal is to predict the sale price of a property based on its features. The target column is SalePrice. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSE between the logarithm of the predicted value and the logarithm of the observed sales price on the eval data. Train data path: '{data_dir}/ml_benchmark/05_house-prices-advanced-regression-techniques/split_train.csv', eval data path: '{data_dir}/ml_benchmark/05_house-prices-advanced-regression-techniques/split_eval.csv'.      |
+| 06 | 06_santander_customer | Santander Customer | This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report AUC on the eval data. Train data path: '{data_dir}/ml_benchmark/06_santander-customer-transaction-prediction/split_train.csv', eval data path: '{data_dir}/ml_benchmark/06_santander-customer-transaction-prediction/split_eval.csv' .                                                                                    |
+| 07 | 07_icr_identify       | ICR - Identifying  | This is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions. The target column is Class. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report F1 Score on the eval data. Train data path: '{data_dir}/ml_benchmark/07_icr-identify-age-related-conditions/split_train.csv', eval data path: '{data_dir}/ml_benchmark/07_icr-identify-age-related-conditions/split_eval.csv' . |
+| 08 | 08_santander_value    | Santander Value    | This is a customers financial dataset. Your goal is to predict the value of transactions for each potential customer. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSLE on the eval data. Train data path: '{data_dir}/ml_benchmark/08_santander-value-prediction-challenge/split_train.csv', eval data path: '{data_dir}/ml_benchmark/08_santander-value-prediction-challenge/split_eval.csv' .                                                                                                     |
+
+**Note**:
+1. `data_dir` is the directory where the di_dataset is stored.
+
+### Open-Ended Tasks Dataset and Requirements
+
+The Open-Ended Tasks dataset comprises 20 moderately challenging open-ended tasks that require the Data Interpreter to understand user requirements, plan and decompose tasks, and generate and execute code.
+
+| ID | Task Name                   | Scenario                           | Scenario Description                                                                                                                                    | User Requirement                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|----|-----------------------------|------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| 01 | 01_ocr                      | OCR                                | Scan all the necessary fields and amounts from the given file and then create an Excel sheet with the extracted data.                                   | This is an English invoice image. Your goal is to perform OCR on the image, extract the total amount from ocr result and save as table, using PaddleOCR. The PaddleOCR environment has been fully installed, try to use Paddleocr as much as possible. Image path: '{data_dir}/open_ended_tasks/01_ocr.png                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| 02 | 02_ocr                      | OCR                                | Scan all the necessary fields and amounts from the given file and then create an Excel sheet with the extracted data.                                   | This is a Chinese invoice image. Your goal is to perform OCR on the image and only output the recognized text word results, nothing else is needed, then extract the total amount and receipt ID starting with 'No' from ocr text words results and save as table, using PaddleOCR. The PaddleOCR environment has been fully installed, try to use Paddleocr as much as possible. Image path: '{data_dir}/open_ended_tasks/02_ocr.jpg'                                                                                                                                                                                                                                                                                                                                                                    |
+| 03 | 03_ocr                      | OCR                                | Scan all the necessary fields and amounts from the given file and then create an Excel sheet with the extracted data.                                   | This is an invoice image for OCR. Your goal is to perform OCR on the image, extract the total amount and save it into an Excel table format, using PaddleOCR with lang='en' The PaddleOCR environment has been fully installed, try to use Paddleocr as much as possible. Image path: '{data_dir}/open_ended_tasks/03_ocr.jpg'                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+| 04 | 04_web_search_and_crawling  | Web search and crawling            | Crawling and organizing web form information                                                                                                            | Get data from `paperlist` table in https://papercopic.com/statistics/iclr-statistics/iclr-2024-statistics/ , and save it to a csv file. paper title must include `multiagent` or `large language model`. **notice: print key variables**                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| 05 | 05_web_search_and_crawling  | Web search and crawling            | Crawling and organizing web form information                                                                                                            | Obtain the CPI data from https://www.stats.gov.cn/sj/sjjd/202307/t20230718_1941322.html, please follow this plan step by step: 1. Detect the encoding type and HTML structure of the target webpage. 2. Crawl the webpage, de-duplicate the body content, convert it to a clear paragraph suitable for reading as plain text, and save it to target.txt. 3. Design multiple regular expressions to match key sentences in target.txt, use try-except statements to combine the various regular expression matches, note that the webpage text is in Chinese. 4. Finally, use a Chinese summary to summarize the key sentences to answer the user's request. **Note: If it is a code block, print out the key variable results of the code block; if it is webpage text, print the first 200 characters.** |
+| 06 | 06_web_search_and_crawling  | Web search and crawling            | Crawling and organizing web form information                                                                                                            | Get products data from website https://scrapeme.live/shop/ and save it as a csv file. Notice: Firstly parse the web page encoding and the text HTML structure; The first page product name, price, product URL, and image URL must be saved in the csv;                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| 07 | 07_web_search_and_crawling  | Web search and crawling            | Crawling and organizing web form information                                                                                                            | 从36kr创投平台https://pitchhub.36kr.com/financing-flash所有初创企业融资的信息, **注意: 这是⼀个中⽂⽹站**; 下⾯是⼀个⼤致流程, 你会根据每⼀步的运⾏结果对当前计划中的任务做出适当调整: 1. 爬取并本地保存html结构; 2. 直接打印第7个**快讯**关键词后2000个字符的html内容, 作为**快讯的html内容示例**; 3. 反思**快讯的html内容示例**中的规律, 设计正则匹配表达式**来获取快讯**的标题、链接、时间; 4. 筛选最近3天的初创企业融资**快讯**, 以list[dict]形式打印前5个。5. 将全部结果存在本地csv中                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| 08 | 08_email_reply              | Email reply                        | Filter through my emails and respond to them as necessary                                                                                               | You are an agent that automatically reads and replies to emails. I will give you your Outlook email account and password. You need to check the content of the latest email and return it to me. If the email address suffix of this email is @xxx.xxx, please automatically reply with "I've received your email and will reply as soon as possible. Thank you!" Email account: xxx@xxx.xxx Email Password: xxxx                                                                                                                                                                                                                                                                                                                                                                                         |
+| 09 | 09_web_page_imitation       | Web page imitation                 | Using Selenium and WebDriver to access a webpage and convert it to an image, with the assistance of GPT-4V to mimic the creation of a one-page website. | This is a URL of webpage: https://medium.com/ .  Firstly, utilize Selenium and WebDriver for rendering. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a text file. All required dependencies and environments have been fully installed and configured.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| 10 | 10_web_page_imitation       | Web page imitation                 | Using Selenium and WebDriver to access a webpage and convert it to an image, with the assistance of GPT-4V to mimic the creation of a one-page website. | This is a URL of webpage: https://pytorch.org/ .  Firstly, utilize Selenium and WebDriver for rendering. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a file. NOTE: All required dependencies and environments have been fully installed and configured.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+| 11 | 11_web_page_imitation       | Web page imitation                 | Using Selenium and WebDriver to access a webpage and convert it to an image, with the assistance of GPT-4V to mimic the creation of a one-page website. | This is a URL of webpage: https://www.kaggle.com/ . Firstly, utilize Selenium and WebDriver to render the webpage, ensuring the browser window is maximized for an optimal viewing experience. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a file. NOTE: All required dependencies and environments have been fully installed and configured.                                                                                                                                                                                                                                                                                                                                                                                                     |
+| 12 | 12_web_page_imitation       | Web page imitation                 | Using Selenium and WebDriver to access a webpage and convert it to an image, with the assistance of GPT-4V to mimic the creation of a one-page website. | This is a URL of webpage: https://chat.openai.com/auth/login . Firstly, utilize Selenium and WebDriver to render the webpage, ensuring the browser window is maximized for an optimal viewing experience. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a file. NOTE: All required dependencies and environments have been fully installed and configured.                                                                                                                                                                                                                                                                                                                                                                                          |
+| 13 | 13_web_page_imitation       | Web page imitation                 | Using Selenium and WebDriver to access a webpage and convert it to an image, with the assistance of GPT-4V to mimic the creation of a one-page website. | This is a URL of webpage: https://deepmind.google/technologies/gemini/#introduction . Firstly, utilize Selenium and WebDriver to render the webpage, ensuring the browser window is maximized for an optimal viewing experience. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a file. NOTE: All required dependencies and environments have been fully installed and configured.                                                                                                                                                                                                                                                                                                                                                                   |
+| 14 | 14_image_background_removal | Image Background Removal           | Remove the background of a given image                                                                                                                  | This is an image, you need to use python toolkit rembg remove the background of the image. image path:'{data_dir}/open_ended_tasks/14_image_background_removal.jpg'; save path:'{data_dir}/open_ended_tasks/14_image_background_removal.jpg'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| 15 | 15_text2img                 | Text2Img                           | Use SD tools to generate images                                                                                                                         | I want to generate an image of a beautiful girl using the stable diffusion text2image tool, sd_url = "http://your.sd.service.ip:port"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| 16 | 16_image_2_code_generation  | Image2Code Generation              | Web code generation                                                                                                                                     | This is a image. First, convert the image to webpage code including HTML, CSS and JS in one go, and finally save webpage code in a file.The image path: '{data_dir}/open_ended_tasks/16_image_2_code_generation.png'. NOTE: All required dependencies and environments have been fully installed and configured.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| 17 | 17_image_2_code_generation  | Image2Code Generation              | Web code generation                                                                                                                                     | This is a image. First, convert the image to webpage code including HTML, CSS and JS in one go, and finally save webpage code in a file.The image path: '{data_dir}/open_ended_tasks/17_image_2_code_generation.png'. NOTE: All required dependencies and environments have been fully installed and configured.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| 18 | 18_generate_games           | Generate games using existing repo | Game tool usage (pyxel)                                                                                                                                 | Create a Snake game. Players need to control the movement of the snake to eat food and grow its body, while avoiding the snake's head touching their own body or game boundaries. Games need to have basic game logic, user interface. During the production process, please consider factors such as playability, beautiful interface, and convenient operation of the game. Note: pyxel environment already satisfied                                                                                                                                                                                                                                                                                                                                                                                   |
+| 19 | 19_generate_games           | Generate games using existing repo | Game tool usage (pyxel)                                                                                                                                 | You are a professional game developer, please use pyxel software to create a simple jumping game. The game needs to include a character that can move left and right on the screen. When the player presses the spacebar, the character should jump. Please ensure that the game is easy to operate, with clear graphics, and complies with the functional limitations of pyxel software. Note: pyxel environment already satisfied                                                                                                                                                                                                                                                                                                                                                                       |
+| 20 | 20_generate_games           | Generate games using existing repo | Game tool usage (pyxel)                                                                                                                                 | Make a mouse click game that click button as many times as possible in 30 seconds using pyxel. Note: pyxel environment already satisfied                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+
+**Note**:
+1. `data_dir` is the directory where the di_dataset is stored.
+2. The specific email account and password need to be replaced with the actual email account and password in `requirements_prompt.py`.
+3. The specific sd_url needs to be replaced with the actual sd_url in `requirements_prompt.py`.
+4. Code related to "Generate games using existing repo" and the Math benchmark is being integrated. Stay tuned.
--- a/examples/di/requirements_prompt.py
+++ b/examples/di/requirements_prompt.py
@ -0,0 +1,67 @@
+# InfiAgent-DABench requirements
+# Prompt template with the placeholders {question}, {file_name}, {constraints}, {format} and {level}.
+DABENCH = "You are required to {question} from a CSV file named {file_name}. **Constraints**: Ensure that {constraints}, which must be strictly followed throughout the task.  The output format should be {format}. This task is categorized as {level}."
+# ML-Benchmark requirements
+# Tasks 01-03 name sklearn datasets; tasks 04-08 carry a {data_dir} placeholder pointing at the
+# train/eval CSV files under '{data_dir}/di_dataset/ml_benchmark/'.
+IRIS_REQ = "Run data analysis on sklearn Iris dataset, include a plot"
+WINES_RECOGNITION_REQ = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class with 20% as test set, and show prediction accuracy"
+BREAST_CANCER_WISCONSIN_REQ = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy"
+TITANIC_REQ = "This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: '{data_dir}/di_dataset/ml_benchmark/04_titanic/split_train.csv', eval data path: '{data_dir}/di_dataset/ml_benchmark/04_titanic/split_eval.csv'."
+HOUSE_PRICES_ADVANCED_REGRESSION_TECHNIQUES_REQ = "This is a house price dataset, your goal is to predict the sale price of a property based on its features. The target column is SalePrice. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSE between the logarithm of the predicted value and the logarithm of the observed sales price on the eval data. Train data path: '{data_dir}/di_dataset/ml_benchmark/05_house-prices-advanced-regression-techniques/split_train.csv', eval data path: '{data_dir}/di_dataset/ml_benchmark/05_house-prices-advanced-regression-techniques/split_eval.csv'."
+SANTANDER_CUSTOMER_TRANSACTION_PREDICTION_REQ = "This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report AUC on the eval data. Train data path: '{data_dir}/di_dataset/ml_benchmark/06_santander-customer-transaction-prediction/split_train.csv', eval data path: '{data_dir}/di_dataset/ml_benchmark/06_santander-customer-transaction-prediction/split_eval.csv' ."
+ICR_IDENTITY_AGE_RELATED_CONDITIONS_REQ = "This is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions. The target column is Class. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report F1 Score on the eval data. Train data path: '{data_dir}/di_dataset/ml_benchmark/07_icr-identify-age-related-conditions/split_train.csv', eval data path: '{data_dir}/di_dataset/ml_benchmark/07_icr-identify-age-related-conditions/split_eval.csv' ."
+SANTANDER_VALUE_PREDICTION_CHALLENGE_REQ = "This is a customers financial dataset. Your goal is to predict the value of transactions for each potential customer. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSLE on the eval data. Train data path: '{data_dir}/di_dataset/ml_benchmark/08_santander-value-prediction-challenge/split_train.csv', eval data path: '{data_dir}/di_dataset/ml_benchmark/08_santander-value-prediction-challenge/split_eval.csv' ."
+
+# Open-Ended Tasks requirements
+# OCR tasks (01-03): each prompt carries a {data_dir} placeholder for the input image path.
+# The image path is wrapped in a matching pair of single quotes so the path boundary is unambiguous.
+OCR_REQ_01 = "This is an English invoice image. Your goal is to perform OCR on the image, extract the total amount from ocr result and save as table, using PaddleOCR. The PaddleOCR environment has been fully installed, try to use Paddleocr as much as possible. Image path: '{data_dir}/di_dataset/open_ended_tasks/01_ocr.png'"
+OCR_REQ_02 = "This is a Chinese invoice image. Your goal is to perform OCR on the image and only output the recognized text word results, nothing else is needed, then extract the total amount and receipt ID starting with 'No' from ocr text words results and save as table, using PaddleOCR. The PaddleOCR environment has been fully installed, try to use Paddleocr as much as possible. Image path: '{data_dir}/di_dataset/open_ended_tasks/02_ocr.jpg'"
+OCR_REQ_03 = "This is an invoice image for OCR. Your goal is to perform OCR on the image, extract the total amount and save it into an Excel table format, using PaddleOCR with lang='en' The PaddleOCR environment has been fully installed, try to use Paddleocr as much as possible. Image path: '{data_dir}/di_dataset/open_ended_tasks/03_ocr.jpg'"
+# Web search and crawling tasks (04-07): each prompt names the target URL and what to save.
+# Task 07 is written in Chinese because its target website is Chinese.
+WEB_SEARCH_AND_CRAWLING_REQ_04 = "Get data from `paperlist` table in https://papercopic.com/statistics/iclr-statistics/iclr-2024-statistics/ , and save it to a csv file. paper title must include `multiagent` or `large language model`. **notice: print key variables**"
+WEB_SEARCH_AND_CRAWLING_REQ_05 = "Obtain the CPI data from https://www.stats.gov.cn/sj/sjjd/202307/t20230718_1941322.html, please follow this plan step by step: 1. Detect the encoding type and HTML structure of the target webpage. 2. Crawl the webpage, de-duplicate the body content, convert it to a clear paragraph suitable for reading as plain text, and save it to target.txt. 3. Design multiple regular expressions to match key sentences in target.txt, use try-except statements to combine the various regular expression matches, note that the webpage text is in Chinese. 4. Finally, use a Chinese summary to summarize the key sentences to answer the user's request. **Note: If it is a code block, print out the key variable results of the code block; if it is webpage text, print the first 200 characters.**"
+WEB_SEARCH_AND_CRAWLING_REQ_06 = "Get products data from website https://scrapeme.live/shop/ and save it as a csv file. Notice: Firstly parse the web page encoding and the text HTML structure; The first page product name, price, product URL, and image URL must be saved in the csv;"
+WEB_SEARCH_AND_CRAWLING_REQ_07 = "从36kr创投平台https://pitchhub.36kr.com/financing-flash所有初创企业融资的信息, **注意: 这是⼀个中⽂⽹站**; 下⾯是⼀个⼤致流程, 你会根据每⼀步的运⾏结果对当前计划中的任务做出适当调整: 1. 爬取并本地保存html结构; 2. 直接打印第7个**快讯**关键词后2000个字符的html内容, 作为**快讯的html内容示例**; 3. 反思**快讯的html内容示例**中的规律, 设计正则匹配表达式来获取**快讯**的标题、链接、时间; 4. 筛选最近3天的初创企业融资**快讯**, 以list[dict]形式打印前5个。5. 将全部结果存在本地csv中"
+# Email reply task (08). The account "xxx@xxx.xxx", the password "xxxx" and the "@xxx.xxx" suffix
+# are placeholders: replace them with real values before running this task.
+EMAIL_REPLY_REQ_08 = """You are an agent that automatically reads and replies to emails. I will give you your Outlook email account and password. You need to check the content of the latest email and return it to me. If the email address suffix of this email is @xxx.xxx, please automatically reply with "I've received your email and will reply as soon as possible. Thank you!" Email account: xxx@xxx.xxx Email Password: xxxx"""
+# Web page imitation tasks (09-13): render the given URL with Selenium/WebDriver, then regenerate
+# the page as HTML, CSS and JS and save the result to a file.
+WEB_PAGE_IMITATION_REQ_09 = "This is a URL of webpage: https://medium.com/ . Firstly, utilize Selenium and WebDriver for rendering. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a text file. All required dependencies and environments have been fully installed and configured."
+WEB_PAGE_IMITATION_REQ_10 = "This is a URL of webpage: https://pytorch.org/ . Firstly, utilize Selenium and WebDriver for rendering. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a file. NOTE: All required dependencies and environments have been fully installed and configured."
+WEB_PAGE_IMITATION_REQ_11 = "This is a URL of webpage: https://www.kaggle.com/ . Firstly, utilize Selenium and WebDriver to render the webpage, ensuring the browser window is maximized for an optimal viewing experience. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a file. NOTE: All required dependencies and environments have been fully installed and configured."
+WEB_PAGE_IMITATION_REQ_12 = "This is a URL of webpage: https://chat.openai.com/auth/login . Firstly, utilize Selenium and WebDriver to render the webpage, ensuring the browser window is maximized for an optimal viewing experience. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a file. NOTE: All required dependencies and environments have been fully installed and configured."
+WEB_PAGE_IMITATION_REQ_13 = "This is a URL of webpage: https://deepmind.google/technologies/gemini/#introduction . Firstly, utilize Selenium and WebDriver to render the webpage, ensuring the browser window is maximized for an optimal viewing experience. Secondly, convert image to a webpage including HTML, CSS and JS in one go. Finally, save webpage in a file. NOTE: All required dependencies and environments have been fully installed and configured."
+# Image tasks (14-17). Tasks 14, 16 and 17 carry a {data_dir} placeholder for the image location.
+# Task 15 embeds a placeholder sd_url ('http://your.sd.service.ip:port') that must be replaced
+# with the address of a real stable diffusion service before running.
+IMAGE_BACKGROUND_REMOVAL_REQ_14 = "This is an image, you need to use python toolkit rembg remove the background of the image. image path:'{data_dir}/di_dataset/open_ended_tasks/14_image_background_removal.jpg'; save path:'{data_dir}/di_dataset/open_ended_tasks/14_image_background_removal_result.jpg'"
+TEXT2IMG_REQ_15 = """I want to generate an image of a beautiful girl using the stable diffusion text2image tool, sd_url = 'http://your.sd.service.ip:port'"""
+IMAGE2CODE_GENERATION_REQ_16 = "This is a image. First, convert the image to webpage code including HTML, CSS and JS in one go, and finally save webpage code in a file.The image path: '{data_dir}/di_dataset/open_ended_tasks/16_image_2_code_generation.png'. NOTE: All required dependencies and environments have been fully installed and configured."
+IMAGE2CODE_GENERATION_REQ_17 = "This is a image. First, convert the image to webpage code including HTML, CSS and JS in one go, and finally save webpage code in a file.The image path: '{data_dir}/di_dataset/open_ended_tasks/17_image_2_code_generation.png'. NOTE: All required dependencies and environments have been fully installed and configured."
+# Game generation tasks (18-20), all targeting the pyxel game engine.
+# Task 20 is the mouse-click game described in the README task table; it must not repeat the
+# Snake prompt of task 18, otherwise "20_generate_games" silently re-runs task 18.
+GENERATE_GAMES_REQ_18 = "Create a Snake game. Players need to control the movement of the snake to eat food and grow its body, while avoiding the snake's head touching their own body or game boundaries. Games need to have basic game logic, user interface. During the production process, please consider factors such as playability, beautiful interface, and convenient operation of the game. Note: pyxel environment already satisfied"
+GENERATE_GAMES_REQ_19 = "You are a professional game developer, please use pyxel software to create a simple jumping game. The game needs to include a character that can move left and right on the screen. When the player presses the spacebar, the character should jump. Please ensure that the game is easy to operate, with clear graphics, and complies with the functional limitations of pyxel software. Note: pyxel environment already satisfied"
+GENERATE_GAMES_REQ_20 = "Make a mouse click game that click button as many times as possible in 30 seconds using pyxel. Note: pyxel environment already satisfied"
+
+# Maps each ML-Benchmark task name to its requirement prompt.
+# Keys are prefixed with the two-digit task number.
+ML_BENCHMARK_REQUIREMENTS = {
+    "01_iris": IRIS_REQ,
+    "02_wines_recognition": WINES_RECOGNITION_REQ,
+    "03_breast_cancer": BREAST_CANCER_WISCONSIN_REQ,
+    "04_titanic": TITANIC_REQ,
+    "05_house_prices": HOUSE_PRICES_ADVANCED_REGRESSION_TECHNIQUES_REQ,
+    "06_santander_customer": SANTANDER_CUSTOMER_TRANSACTION_PREDICTION_REQ,
+    "07_icr_identify": ICR_IDENTITY_AGE_RELATED_CONDITIONS_REQ,
+    "08_santander_value": SANTANDER_VALUE_PREDICTION_CHALLENGE_REQ,
+}
+
+# Maps each open-ended task name to its requirement prompt.
+# Keys are prefixed with the two-digit task number.
+OPEN_ENDED_TASKS_REQUIREMENTS = {
+    "01_ocr": OCR_REQ_01,
+    "02_ocr": OCR_REQ_02,
+    "03_ocr": OCR_REQ_03,
+    "04_web_search_and_crawling": WEB_SEARCH_AND_CRAWLING_REQ_04,
+    "05_web_search_and_crawling": WEB_SEARCH_AND_CRAWLING_REQ_05,
+    "06_web_search_and_crawling": WEB_SEARCH_AND_CRAWLING_REQ_06,
+    "07_web_search_and_crawling": WEB_SEARCH_AND_CRAWLING_REQ_07,
+    "08_email_reply": EMAIL_REPLY_REQ_08,
+    "09_web_page_imitation": WEB_PAGE_IMITATION_REQ_09,
+    "10_web_page_imitation": WEB_PAGE_IMITATION_REQ_10,
+    "11_web_page_imitation": WEB_PAGE_IMITATION_REQ_11,
+    "12_web_page_imitation": WEB_PAGE_IMITATION_REQ_12,
+    "13_web_page_imitation": WEB_PAGE_IMITATION_REQ_13,
+    "14_image_background_removal": IMAGE_BACKGROUND_REMOVAL_REQ_14,
+    "15_text2img": TEXT2IMG_REQ_15,
+    "16_image_2_code_generation": IMAGE2CODE_GENERATION_REQ_16,
+    "17_image_2_code_generation": IMAGE2CODE_GENERATION_REQ_17,
+    "18_generate_games": GENERATE_GAMES_REQ_18,
+    "19_generate_games": GENERATE_GAMES_REQ_19,
+    "20_generate_games": GENERATE_GAMES_REQ_20,
+}
--- a/examples/di/run_ml_benchmark.py
+++ b/examples/di/run_ml_benchmark.py
@ -0,0 +1,22 @@
+import os
+
+import fire
+
+from examples.di.requirements_prompt import ML_BENCHMARK_REQUIREMENTS
+from metagpt.const import DATA_PATH
+from metagpt.roles.di.data_interpreter import DataInterpreter
+from metagpt.tools.tool_recommend import TypeMatchToolRecommender
+
+
+# Ensure the ML-Benchmark dataset has been downloaded before using this example.
+async def main(task_name, data_dir=DATA_PATH, use_reflection=True):
+    """Run the Data Interpreter on a single ML-Benchmark task.
+
+    Args:
+        task_name: Key of ``ML_BENCHMARK_REQUIREMENTS`` selecting the task prompt.
+        data_dir: Value substituted for ``{data_dir}`` in the prompt; the dataset is
+            expected under ``<data_dir>/di_dataset/ml_benchmark``.
+        use_reflection: Forwarded to ``DataInterpreter(use_reflection=...)``.
+
+    Raises:
+        FileNotFoundError: If a non-default ``data_dir`` does not contain the dataset.
+        KeyError: If ``task_name`` is not a known ML-Benchmark task.
+    """
+    # Only a caller-supplied directory is checked; the default DATA_PATH is used as-is.
+    if data_dir != DATA_PATH and not os.path.exists(os.path.join(data_dir, "di_dataset/ml_benchmark")):
+        raise FileNotFoundError(f"ML-Benchmark dataset not found in {data_dir}.")
+
+    # Fail with an actionable message instead of a bare KeyError on a mistyped task name.
+    if task_name not in ML_BENCHMARK_REQUIREMENTS:
+        raise KeyError(
+            f"Unknown ML-Benchmark task {task_name!r}; available tasks: {sorted(ML_BENCHMARK_REQUIREMENTS)}"
+        )
+
+    requirement = ML_BENCHMARK_REQUIREMENTS[task_name].format(data_dir=data_dir)
+    di = DataInterpreter(use_reflection=use_reflection, tool_recommender=TypeMatchToolRecommender(tools=["<all>"]))
+    await di.run(requirement)
+
+
+# Expose `main` as a command-line interface when the file is run as a script.
+if __name__ == "__main__":
+    fire.Fire(main)
--- a/examples/di/run_open_ended_tasks.py
+++ b/examples/di/run_open_ended_tasks.py
@ -0,0 +1,22 @@
+import os
+
+import fire
+
+from examples.di.requirements_prompt import OPEN_ENDED_TASKS_REQUIREMENTS
+from metagpt.const import DATA_PATH
+from metagpt.roles.di.data_interpreter import DataInterpreter
+from metagpt.tools.tool_recommend import TypeMatchToolRecommender
+
+
+# Ensure the Open-Ended Tasks dataset has been downloaded before using this example.
+async def main(task_name, data_dir=DATA_PATH, use_reflection=True):
+    """Run the Data Interpreter on a single open-ended task.
+
+    Args:
+        task_name: Key of ``OPEN_ENDED_TASKS_REQUIREMENTS`` selecting the task prompt.
+        data_dir: Value substituted for ``{data_dir}`` in the prompt; the dataset is
+            expected under ``<data_dir>/di_dataset/open_ended_tasks``.
+        use_reflection: Forwarded to ``DataInterpreter(use_reflection=...)``.
+
+    Raises:
+        FileNotFoundError: If a non-default ``data_dir`` does not contain the dataset.
+        KeyError: If ``task_name`` is not a known open-ended task.
+    """
+    # Only a caller-supplied directory is checked; the default DATA_PATH is used as-is.
+    if data_dir != DATA_PATH and not os.path.exists(os.path.join(data_dir, "di_dataset/open_ended_tasks")):
+        raise FileNotFoundError(f"Open-ended task dataset not found in {data_dir}.")
+
+    # Fail with an actionable message instead of a bare KeyError on a mistyped task name.
+    if task_name not in OPEN_ENDED_TASKS_REQUIREMENTS:
+        raise KeyError(
+            f"Unknown open-ended task {task_name!r}; available tasks: {sorted(OPEN_ENDED_TASKS_REQUIREMENTS)}"
+        )
+
+    requirement = OPEN_ENDED_TASKS_REQUIREMENTS[task_name].format(data_dir=data_dir)
+    di = DataInterpreter(use_reflection=use_reflection, tool_recommender=TypeMatchToolRecommender(tools=["<all>"]))
+    await di.run(requirement)
+
+
+# Expose `main` as a command-line interface when the file is run as a script.
+if __name__ == "__main__":
+    fire.Fire(main)