diff --git a/.gitignore b/.gitignore index 966038946..6443b07bd 100644 --- a/.gitignore +++ b/.gitignore @@ -188,5 +188,4 @@ cov.xml *-structure.json *.dot .python-version -*.jsonl -*.json +*.csv diff --git a/examples/ags/benchmark/drop.py b/examples/ags/benchmark/drop.py index 89c619755..0c3b7812b 100644 --- a/examples/ags/benchmark/drop.py +++ b/examples/ags/benchmark/drop.py @@ -4,6 +4,7 @@ import pandas as pd import string import re from typing import List, Tuple, Callable, Dict, Any, Set, Union +from collections import Counter import numpy as np from scipy.optimize import linear_sum_assignment from tqdm.asyncio import tqdm_asyncio @@ -13,213 +14,69 @@ from examples.ags.benchmark.utils import generate_random_indices global cost cost = 0 -def _remove_articles(text: str) -> str: - regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) - return re.sub(regex, " ", text) - - -def _white_space_fix(text: str) -> str: - return " ".join(text.split()) - - -EXCLUDE = set(string.punctuation) - -def _is_number(text: str) -> bool: +def is_number(text: str) -> bool: try: float(text) return True except ValueError: return False -def _normalize_number(text: str) -> str: - if _is_number(text): - return str(float(text)) - else: - return text - -def _remove_punc(text: str) -> str: - if not _is_number(text): - return "".join(ch for ch in text if ch not in EXCLUDE) - else: - return text - - -def _lower(text: str) -> str: - return text.lower() - - -def _tokenize(text: str) -> List[str]: - return re.split(" |-", text) - - -def _normalize_answer(text: str) -> str: - """Lower text and remove punctuation, articles and extra whitespace.""" - - parts = [ - _white_space_fix(_remove_articles(_normalize_number(_remove_punc(_lower(token))))) - for token in _tokenize(text) - ] - parts = [part for part in parts if part.strip()] - normalized = " ".join(parts).strip() - return normalized - - -def _answer_to_bags( - answer: Union[str, List[str], Tuple[str, ...]] -) -> Tuple[List[str], List[Set[str]]]: - if isinstance(answer, (list, tuple)): - raw_spans = answer - else: - raw_spans = [answer] - normalized_spans: List[str] = [] - token_bags = [] - for raw_span in raw_spans: - normalized_span = _normalize_answer(raw_span) - normalized_spans.append(normalized_span) - token_bags.append(set(normalized_span.split())) - return normalized_spans, token_bags - - -def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]: +def normalize_answer(s): """ - Takes gold and predicted answer sets and first finds the optimal 1-1 alignment - between them and gets maximum metric values over all the answers. + Normalize answers for evaluation. 
""" - scores = np.zeros([len(gold), len(predicted)]) - for gold_index, gold_item in enumerate(gold): - for pred_index, pred_item in enumerate(predicted): - if _match_numbers_if_present(gold_item, pred_item): - scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) - row_ind, col_ind = linear_sum_assignment(-scores) - max_scores = np.zeros([max(len(gold), len(predicted))]) - for row, column in zip(row_ind, col_ind): - max_scores[row] = max(max_scores[row], scores[row, column]) - return max_scores + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + def white_space_fix(text): + return " ".join(text.split()) -def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float: - intersection = len(gold_bag.intersection(predicted_bag)) - if not predicted_bag: - precision = 1.0 - else: - precision = intersection / float(len(predicted_bag)) - if not gold_bag: - recall = 1.0 - else: - recall = intersection / float(len(gold_bag)) - f1 = ( - (2 * precision * recall) / (precision + recall) - if not (precision == 0.0 and recall == 0.0) - else 0.0 - ) + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + +def compute_f1_score(prediction, ground_truth): + """ + Compute the F1 score between prediction and ground truth answers. + """ + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) return f1 -def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool: - gold_numbers = set() - predicted_numbers = set() - for word in gold_bag: - if _is_number(word): - gold_numbers.add(word) - for word in predicted_bag: - if _is_number(word): - predicted_numbers.add(word) - if (not gold_numbers) or gold_numbers.intersection(predicted_numbers): - return True - return False +# def fuzzy_match(s1: str, s2: str) -> bool: +# s1 = normalize(s1) +# s2 = normalize(s2) -def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float: - intersection = len(gold_bag.intersection(predicted_bag)) - if not predicted_bag: - precision = 1.0 - else: - precision = intersection / float(len(predicted_bag)) - if not gold_bag: - recall = 1.0 - else: - recall = intersection / float(len(gold_bag)) - f1 = ( - (2 * precision * recall) / (precision + recall) - if not (precision == 0.0 and recall == 0.0) - else 0.0 - ) - return f1 +# if s1 == "" or s2 == "": +# return s1 == s2 -def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]: - """ - Takes gold and predicted answer sets and first finds the optimal 1-1 alignment - between them and gets maximum metric values over all the answers. 
- """ - scores = np.zeros([len(gold), len(predicted)]) - for gold_index, gold_item in enumerate(gold): - for pred_index, pred_item in enumerate(predicted): - if _match_numbers_if_present(gold_item, pred_item): - scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) - row_ind, col_ind = linear_sum_assignment(-scores) +# return s1 in s2 or s2 in s1 - max_scores = np.zeros([max(len(gold), len(predicted))]) - for row, column in zip(row_ind, col_ind): - max_scores[row] = max(max_scores[row], scores[row, column]) - return max_scores -def get_metrics( - predicted: Union[str, List[str], Tuple[str, ...]], gold: Union[str, List[str], Tuple[str, ...]] -) -> Tuple[float, float]: - """ - Takes a predicted answer and a gold answer (that are both either a string or a list of - strings), and returns exact match and the DROP F1 metric for the prediction. If you are - writing a script for evaluating objects in memory (say, the output of predictions during - validation, or while training), this is the function you want to call, after using - :func:`answer_json_to_strings` when reading the gold answer from the released data file. - """ - predicted_bags = _answer_to_bags(predicted) - gold_bags = _answer_to_bags(gold) +# def drop_metric(sample: str, reference: list[str]) -> Tuple[float, float]: +# em_scores = [] +# f1_scores = [] +# for answer in reference: +# if answer.strip() != "": +# em, f1 = get_drop_metrics(sample, answer) +# em_scores.append(em) +# f1_scores.append(f1) +# return (max(em_scores), max(f1_scores)) - if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]): - exact_match = 1.0 - else: - exact_match = 0.0 - - f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1]) - f1 = np.mean(f1_per_bag) - f1 = round(f1, 2) - return exact_match, f1 - -def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str]: - """ - Takes an answer JSON blob from the DROP data release and converts it into strings used for - evaluation. 
- """ - if "number" in answer and answer["number"]: - return tuple([str(answer["number"])]), "number" - elif "spans" in answer and answer["spans"]: - return tuple(answer["spans"]), "span" if len(answer["spans"]) == 1 else "spans" - elif "date" in answer: - return ( - tuple( - [ - "{0} {1} {2}".format( - answer["date"]["day"], answer["date"]["month"], answer["date"]["year"] - ) - ] - ), - "date", - ) - else: - raise ValueError( - f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}" - ) - -def load_data(file_path: str, samples: int, test=False) -> List[Tuple[str, Dict[str, Any]]]: - with open(file_path, mode="r") as file: - data = json.load(file) - data = list(data.items()) - - random_indices = generate_random_indices(len(data), samples, test) - data = [data[i] for i in random_indices] - return data - -async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float]: +async def evaluate_problem(inputs: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float]: max_retries = 5 retries = 0 @@ -227,21 +84,26 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, while retries < max_retries: try: global cost - prediction, cost = await graph(question, passage) + prediction, cost = await graph(inputs) - - max_score = 0.0 - max_type = None - best_answer = None + f1_scores = [] for answer in answers: - golden_answer, golden_type = answer_json_to_strings(answer) - _, f1_score = get_metrics(prediction, golden_answer) - if golden_answer[0].strip() != "": - max_score = max(max_score, f1_score) - if max_score == f1_score: - max_type = golden_type - best_answer = golden_answer + if answer.strip() != "": + f1_score = compute_f1_score(prediction, answer) + f1_scores.append(f1_score) + + max_score = max(f1_scores) + + # matches = [ + # fuzzy_match(prediction, answer) + # for answer in answers + # ] + + # score = True in matches + + score = max_score + break except Exception as e: @@ -250,35 +112,30 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, if retries == max_retries: print("Maximum retries reached. 
Skipping this sample.") - best_answer = None prediction = None - max_score = 0.0 + score = 0.0 break - return best_answer, prediction, max_score + return prediction, score -async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], graph: Callable, max_concurrent_tasks: int = 50) -> List[List[Any]]: +async def evaluate_all_questions(annotations: List[Tuple[str, Dict[str, Any]]], graph: Callable, max_concurrent_tasks: int = 50) -> List[List[Any]]: semaphore = asyncio.Semaphore(max_concurrent_tasks) results = [] - async def sem_evaluate(id: str, annotation: Dict[str, Any]): + async def sem_evaluate(annotation: Dict[str, Any]): async with semaphore: - passage = annotation["passage"] - for qa_pair in annotation["qa_pairs"]: - question = qa_pair["question"] - answers = [qa_pair["answer"]] - if "validated_answers" in qa_pair and qa_pair["validated_answers"]: - answers += qa_pair["validated_answers"] - best_answer, prediction, score = await evaluate_problem(question, passage, answers, graph) - results.append([id, question, prediction, best_answer, score]) + inputs = annotation["context"] + answers = annotation["targets"] + prediction, score = await evaluate_problem(inputs, answers, graph) + results.append([annotation["id"], prediction, answers, score]) - tasks = [sem_evaluate(id, annotation) for id, annotation in annotations] + tasks = [sem_evaluate(annotation) for annotation in annotations] await tqdm_asyncio.gather(*tasks, desc="Evaluating DROP passages", total=len(annotations)) return results def save_results_to_csv(results: List[List[Any]], path: str) -> float: - df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score"]) + df = pd.DataFrame(results, columns=["id", "prediction", "answers", "score"]) average_score = df["score"].mean() output_file = f"{path}/{average_score:.5f}.csv" @@ -287,11 +144,49 @@ def save_results_to_csv(results: List[List[Any]], path: str) -> float: return average_score +# -- From ADAS -- + +def load_drop(file_path, samples, test=False, total_length=1000): + import gzip + with gzip.open(file_path, "rb") as file: + data = [json.loads(line) for line in file] + + random_indices = generate_random_indices(len(data), total_length, False) + random_indices = random_indices[:samples] if not test else random_indices[samples:] + examples = [data[i] for i in random_indices] + + for example in examples: + example["targets"] = example["ref_text"].split("|") + + return examples + async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float: - data = load_data(file_path, samples, test=test) - results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20) + # data = load_data(file_path, samples, test=test) + data = load_drop(file_path, samples, test=test) + results = await evaluate_all_questions(data, graph, max_concurrent_tasks=30) average_score = save_results_to_csv(results, path=path) print(f"Average score on DROP dataset: {average_score:.5f}") global cost - print(f"Total cost: {cost}") - return average_score + print(f"Total cost: {cost: .5f}") + print(f"Cost per sample: {(cost / len(data)):.9f}") + return average_score, cost + +def load_drop_from_file(file_path): + import gzip + with gzip.open(file_path, "rb") as file: + data = [json.loads(line) for line in file] + + for example in data: + example["targets"] = example["ref_text"].split("|") + + return data + +async def optimize_hotpotqa_evaluation(graph: Callable, file_path: str, path: str) -> Tuple[float, float]: + data = 
await load_drop_from_file(file_path) + results = await evaluate_all_questions(data, graph, max_concurrent_tasks=50) + average_score = save_results_to_csv(results, path=path) + print(f"Average score on DROP dataset: {average_score:.5f}") + global cost + print(f"Total cost: {cost: .5f}") + print(f"Cost per sample: {(cost / len(data)):.9f}") + return average_score, cost \ No newline at end of file diff --git a/examples/ags/benchmark/gsm8k.py b/examples/ags/benchmark/gsm8k.py index be33ce977..c7b2e315b 100644 --- a/examples/ags/benchmark/gsm8k.py +++ b/examples/ags/benchmark/gsm8k.py @@ -72,6 +72,8 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) -> score = loose_match_score(expected_output, output) break + + # TODO 添加LOG入口 except Exception as e: retries += 1 @@ -108,3 +110,20 @@ async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: print(f"Average score: {average_score:.5f}") print(f"Total Cost: {total_cost:.5f}") return average_score, total_cost + + +async def load_file_data(file_path: str) -> List[dict]: + data = [] + async with aiofiles.open(file_path, mode="r") as file: + async for line in file: + data.append(json.loads(line)) + return data + +async def optimize_gsm8k_evaluation(graph: Callable, file_path: str, path: str) -> Tuple[float, float]: + """Optimize GSM8K evaluation main function""" + data = await load_file_data(file_path) + results = await evaluate_all_problems(data, graph, max_concurrent_tasks=50) + average_score, total_cost = save_results_to_csv(results, path=path) + print(f"Average score: {average_score:.5f}") + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost \ No newline at end of file diff --git a/examples/ags/benchmark/hotpotqa.py b/examples/ags/benchmark/hotpotqa.py index 823af0eb1..885abad7e 100644 --- a/examples/ags/benchmark/hotpotqa.py +++ b/examples/ags/benchmark/hotpotqa.py @@ -59,25 +59,29 @@ def f1_score(prediction, ground_truth): return f1 -async def load_data(file_path: str, samples=20, total_length=1000, test=False) -> List[dict]: +async def load_data(file_path: str, samples=20, total_length=1250, test=False) -> List[dict]: data = [] async with aiofiles.open(file_path, mode="r") as file: async for line in file: data.append(json.loads(line)) - data = data[:total_length] - random_indices = generate_random_indices(len(data), samples, test) + random_indices = generate_random_indices(len(data), total_length, False) # get random indices of 1250 + random_indices = random_indices[:samples] if not test else random_indices[samples:] # get n_samples for validation or test data = [data[i] for i in random_indices] return data async def evaluate_problem(input: str, context_str: str, graph: Callable, expected_output: str): max_retries = 5 retries = 0 - + + # global cost + # prediction, cost = await graph(input, context_str) if graph else "None" + # score = f1_score(prediction, expected_output) + while retries < max_retries: try: global cost prediction, cost = await graph(input, context_str) if graph else "None" - score = f1_score(prediction["solution"], expected_output) + score = f1_score(prediction, expected_output) break except Exception as e: @@ -125,5 +129,23 @@ async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, pat average_score = save_results_to_csv(results, path=path) print(f"Average score on HotpotQA dataset: {average_score:.5f}") global cost - print(f"Total cost: {cost}") + print(f"Total cost: {cost: .5f}") + print(f"Cost per sample: {(cost / 
len(data)):.9f}") return average_score + +async def load_file_data(file_path: str) -> List[dict]: + data = [] + async with aiofiles.open(file_path, mode="r") as file: + async for line in file: + data.append(json.loads(line)) + return data + +async def optimize_hotpotqa_evaluation(graph: Callable, file_path: str, path: str) -> Tuple[float, float]: + data = await load_file_data(file_path) + results = await evaluate_all_problems(data, graph, max_concurrent_tasks=50) + average_score = save_results_to_csv(results, path=path) + print(f"Average score on HotpotQA dataset: {average_score:.5f}") + global cost + print(f"Total cost: {cost: .5f}") + print(f"Cost per sample: {(cost / len(data)):.9f}") + return average_score, cost \ No newline at end of file diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py index ee61d5992..70c28cc5b 100644 --- a/examples/ags/benchmark/humaneval.py +++ b/examples/ags/benchmark/humaneval.py @@ -2,7 +2,7 @@ import json import asyncio import aiofiles import pandas as pd -from typing import List, Tuple, Callable +from typing import List, Tuple, Callable, Dict, Any, Optional from tqdm.asyncio import tqdm_asyncio from examples.ags.benchmark.utils import generate_random_indices @@ -19,32 +19,81 @@ async def load_data(file_path: str, samples=1, test=False) -> List[dict]: data = [data[i] for i in random_indices] return data -async def check_solution(solution, test_cases, entry_point): - # Define a local dictionary to execute the solution - local_dict = {} - exec("from typing import List\n\n" + solution, {}, local_dict) +# async def check_solution(solution, test_cases, entry_point): +# # Define a local dictionary to execute the solution +# local_dict = {} +# exec("from typing import List, Tuple, Callable, Dict\n\n" + solution, {}, local_dict) - # Ensure the entry point function is defined - if entry_point not in local_dict: - raise ValueError(f"Function {entry_point} is not defined in the solution.") +# # Ensure the entry point function is defined +# if entry_point not in local_dict: +# raise ValueError(f"Function {entry_point} is not defined in the solution.") - details = [False for _ in range(len(test_cases))] +# details = [False for _ in range(len(test_cases))] - # Check each test case - for i, test in enumerate(test_cases): - # Replace 'candidate' with the actual function call - test_expr = test.replace("candidate", entry_point) - try: - # Evaluate the test case - if eval(test_expr, {}, local_dict): - details[i] = True - except Exception as e: - print(f"Error evaluating test case '{test}': {e}") +# # Check each test case +# for i, test in enumerate(test_cases): +# # Replace 'candidate' with the actual function call +# test_expr = test.replace("candidate", entry_point) +# try: +# # Evaluate the test case +# if eval(test_expr, {}, local_dict): +# details[i] = True +# except Exception as e: +# print(f"Error evaluating test case '{test}': {e}") - if all(details): - return PASS, details +# if all(details): +# return PASS, details - return FAIL, details +# return FAIL, details + +async def check_solution(solution, test, entry_point): + try: + # 定义一个包含所有必要模块的全局字典 + global_dict = { + 'math': __import__('math'), + 'hashlib': __import__('hashlib'), + 're': __import__('re'), + 'List': List, + 'Dict': Dict, + 'Tuple': Tuple, + 'Optional': Optional, + 'Any': Any + } + if entry_point == "decode_cyclic": + solution = "\n\ndef encode_cyclic(s: str):\n \"\"\"\n returns encoded string by cycling groups of three characters.\n \"\"\"\n # split string to 
groups. Each of length 3.\n groups = [s[(3 * i):min((3 * i + 3), len(s))] for i in range((len(s) + 2) // 3)]\n # cycle elements in each group. Unless group has fewer elements than 3.\n groups = [(group[1:] + group[0]) if len(group) == 3 else group for group in groups]\n return \"\".join(groups)" + "\n\n" + solution + elif entry_point == "decode_shift": + solution = "\n\ndef encode_shift(s: str):\n \"\"\"\n returns encoded string by shifting every character by 5 in the alphabet.\n \"\"\"\n return \"\".join([chr(((ord(ch) + 5 - ord(\"a\")) % 26) + ord(\"a\")) for ch in s])\n\n\n" + solution + elif entry_point == "find_zero": + solution = "\n\ndef poly(xs: list, x: float):\n return sum(coeff * (x ** i) for i, coeff in enumerate(xs))\n\n" + solution + # 执行解决方案 + exec(solution, global_dict) + + # 确保入口点函数已定义 + if entry_point not in global_dict: + raise ValueError(f"函数 {entry_point} 在解决方案中未定义。") + + # 执行测试用例 + exec(test, global_dict) + + # 获取检查函数 + check = global_dict["check"] + + # 运行检查函数 + result = check(global_dict[entry_point]) + + if result is None: + result = (PASS, "解决方案通过了所有测试用例。") + + except Exception as e: + # 记录详细的错误信息 + error_message = f"错误: {str(e)}.\n 解决方案: {solution}.\n 测试: {test}" + result = (FAIL, error_message) + + # 将错误信息写入error.log文件 + with open('error.log', 'a', encoding='utf-8') as log_file: + log_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {error_message}\n") + + return result async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]: max_retries = 5 @@ -52,10 +101,10 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, while retries < max_retries: try: - prediction = await graph(data["prompt"]) if graph else "None" + prediction = await graph(data["prompt"], data["entry_point"]) if graph else "None" cost = prediction[1] # 添加这行来获取cost solution = prediction[0] # 修改这行以获取实际的预测结果 - ret = await check_solution(solution, data["test_cases"], data["entry_point"]) + ret = await check_solution(solution, data["test"], data["entry_point"]) score = 1 if ret[0] == PASS else 0 break @@ -114,14 +163,34 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: s total_cost += float(result[4]) # 添加这行来累加cost print(f"save to {full_path}") avg_score /= len(results) - total_cost = results[-1][4] # 使用最后一个结果的cost作为总cost + # 从full_path中读取所有结果,选择得分最高的 + with open(full_path, 'r') as f: + all_results = [json.loads(line) for line in f] + max_result = max(all_results, key=lambda x: x['cost']) + total_cost = max_result['cost'] return round(avg_score, 5), round(total_cost, 5) # 修改返回值以包含total_cost async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]: data = await load_data(file_path, samples, test=test) - results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) + results = await evaluate_all_problems(data, graph, max_concurrent_tasks=50) average_score, total_cost = save_results_to_jsonl(results, path=path) print(f"Average score on HumanEval dataset: {average_score:.5f}") print(f"Total Cost: {total_cost:.5f}") return average_score, total_cost # 修改返回值以包含total_cost + + +async def load_file_data(file_path: str) -> List[dict]: + data = [] + async with aiofiles.open(file_path, mode="r") as file: + async for line in file: + data.append(json.loads(line)) + return data + +async def optimize_humaneval_evaluation(graph: Callable, file_path: str, path: str) -> Tuple[float, float]: + data = await load_file_data(file_path) + results = await 
evaluate_all_problems(data, graph, max_concurrent_tasks=50) + average_score, total_cost = save_results_to_jsonl(results, path=path) + print(f"Average score on HumanEval dataset: {average_score:.5f}") + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost \ No newline at end of file diff --git a/examples/ags/benchmark/math.py b/examples/ags/benchmark/math.py index 6c0ddccf7..4862359dc 100644 --- a/examples/ags/benchmark/math.py +++ b/examples/ags/benchmark/math.py @@ -21,7 +21,7 @@ def extract_answer(text: str) -> str: return boxed_match.group(1) # If no \boxed{...}, return the last sentence - sentences = text.split(".") + sentences = text.split(".") # TODO 使用jinyu修改 return sentences[-1].strip() if sentences else "" def parse_digits(num): @@ -221,10 +221,11 @@ async def load_data(file_path: str, samples: int = 200, test=False) -> List[dict data = [data[i] for i in random_indices] return data + def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]: df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"]) average_score = df["score"].mean() - total_cost = df["cost"].iloc[-1] + total_cost = df["cost"].max() output_file = f"{path}/{average_score:.5f}.csv" df.to_csv(output_file, index=False) @@ -277,3 +278,18 @@ async def math_evaluation(graph: Callable, file_path: str, samples: int, path: s print(f"Average score on MATH dataset: {average_score:.5f}") print(f"Total Cost: {total_cost:.5f}") return average_score, total_cost + +async def load_file_data(file_path: str) -> List[dict]: + data = [] + async with aiofiles.open(file_path, mode="r") as file: + async for line in file: + data.append(json.loads(line)) + return data + +async def optimize_math_evaluation(graph: Callable, file_path: str, path: str) -> Tuple[float, float]: + data = await load_file_data(file_path) + results = await evaluate_all_problems(data, graph, max_concurrent_tasks=50) + average_score, total_cost = save_results_to_csv(results, path=path) + print(f"Average score on MATH dataset: {average_score:.5f}") + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost \ No newline at end of file diff --git a/examples/ags/benchmark/mbpp.py b/examples/ags/benchmark/mbpp.py index a33144e68..f2720a5f7 100644 --- a/examples/ags/benchmark/mbpp.py +++ b/examples/ags/benchmark/mbpp.py @@ -1,8 +1,9 @@ import json +import time import asyncio import aiofiles import pandas as pd -from typing import List, Tuple, Callable +from typing import List, Tuple, Callable, Any, Optional, Dict from tqdm.asyncio import tqdm_asyncio from examples.ags.benchmark.utils import generate_random_indices @@ -19,37 +20,52 @@ async def load_data(file_path: str, samples=1, test=False) -> List[dict]: data = [data[i] for i in random_indices] return data -async def check_solution(solution, test_cases, timeout=1): - # Define a local dictionary to execute the solution - local_dict = {} - exec(solution, {}, local_dict) - details = [False for _ in range(len(test_cases))] - - async def evaluate_test(test): - # Delete 'assert' from test - test_expr = test.replace("assert ", "") - try: - # Evaluate the test case with timeout - await asyncio.wait_for(asyncio.to_thread(eval, test_expr, {}, local_dict), timeout) - return True - except asyncio.TimeoutError: - print(f"Test case '{test}' timed out.") - except Exception as e: - print(f"Error evaluating test case '{test}': {e}") - return False - - # Check each test case - for i, test in enumerate(test_cases): - 
result = await evaluate_test(test) - details[i] = result - if not result: - return FAIL, details - - if all(details): - return PASS, details - - return FAIL, details +async def check_solution(solution, test, entry_point): + try: + # 定义一个包含所有必要模块的全局字典 + global_dict = { + 'math': __import__('math'), + 'hashlib': __import__('hashlib'), + 're': __import__('re'), + 'List': List, + 'Dict': Dict, + 'Tuple': Tuple, + 'Optional': Optional, + 'Any': Any + } + # 执行解决方案 + exec(solution, global_dict) + + # 确保入口点函数已定义 + if entry_point not in global_dict: + raise ValueError(f"函数 {entry_point} 在解决方案中未定义。") + + # 执行测试用例 + exec(test, global_dict) + + # 获取检查函数 + check = global_dict["check"] + + # 运行检查函数 + result = check() + + if result is None: + result = (PASS, "解决方案通过了所有测试用例。") + + # except ValueError as ve: + # if "函数" in str(ve) and "在解决方案中未定义" in str(ve): + # raise + except Exception as e: + # 记录详细的错误信息 + error_message = f"错误: {str(e)}.\n 解决方案: {solution}.\n 测试: {test}" + result = (FAIL, error_message) + + # 将错误信息写入error.log文件 + with open('error_mbpp.log', 'a', encoding='utf-8') as log_file: + log_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {error_message}\n") + + return result async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]: max_retries = 5 @@ -57,10 +73,10 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, while retries < max_retries: try: - prediction = await graph(data["prompt"]) if graph else "None" + prediction = await graph(data["prompt"], data["entry_point"]) if graph else "None" cost = prediction[1] solution = prediction[0] - ret = await check_solution(solution, data["test_list"]) + ret = await check_solution(solution, data["test"], data["entry_point"]) score = 1 if ret[0] == PASS else 0 break @@ -92,7 +108,7 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]: df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score", "cost"]) average_score = df["score"].mean() - total_cost = df["cost"].iloc[-1] + total_cost = df["cost"].max() output_file = f"{path}/{average_score:.5f}.csv" df.to_csv(output_file, index=False) @@ -100,9 +116,25 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str return average_score, total_cost async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]: - data = await load_data(file_path, samples) - results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) - average_score, total_cost = save_results_to_csv(results, path=path, test=test) + data = await load_data(file_path, samples, test) + results = await evaluate_all_problems(data, graph, max_concurrent_tasks=25) + average_score, total_cost = save_results_to_csv(results, path=path) print(f"Average score on MBPP dataset: {average_score:.5f}") print(f"Total Cost: {total_cost:.5f}") return average_score, total_cost + + +async def load_file_data(file_path: str) -> List[dict]: + data = [] + async with aiofiles.open(file_path, mode="r") as file: + async for line in file: + data.append(json.loads(line)) + return data + +async def optimize_mbpp_evaluation(graph: Callable, file_path: str, path: str) -> Tuple[float, float]: + data = await load_file_data(file_path) + results = await evaluate_all_problems(data, graph, max_concurrent_tasks=50) + average_score, total_cost = 
save_results_to_csv(results, path=path) + print(f"Average score on MBPP dataset: {average_score:.5f}") + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost \ No newline at end of file diff --git a/examples/ags/benchmark/utils.py b/examples/ags/benchmark/utils.py index 871d5e50a..dc9239dc0 100644 --- a/examples/ags/benchmark/utils.py +++ b/examples/ags/benchmark/utils.py @@ -14,4 +14,6 @@ def generate_random_indices(n, n_samples, test=False): if test: return indices[n_samples:] else: - return indices[:n_samples] \ No newline at end of file + return indices[:n_samples] + +# TODO yzy 补充分割数据集的函数 \ No newline at end of file diff --git a/examples/ags/experiments/baselines/cot_gsm8k.py b/examples/ags/experiments/baselines/cot_gsm8k.py index 90266d546..968b3cce7 100644 --- a/examples/ags/experiments/baselines/cot_gsm8k.py +++ b/examples/ags/experiments/baselines/cot_gsm8k.py @@ -39,7 +39,7 @@ class CoTSolveGraph(SolveGraph): async def __call__(self, problem): solution = await self.cot_generate(problem, mode="context_fill") - return solution, self.llm.cost_manager.total_cost # {"solution": solution} + return solution, self.llm.cost_manager.total_cost if __name__ == "__main__": async def main(): @@ -49,7 +49,7 @@ if __name__ == "__main__": # llm_config = ModelsConfig.default().get("gpt-4o") graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K") file_path = "examples/ags/data/gsm8k.jsonl" - samples = 264 #264 # 1055 #314 + samples = 10 #264 # 1055 #314 # samples = 100 path = "examples/ags/data/baselines/general/gsm8k/" score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=False) diff --git a/examples/ags/experiments/baselines/cot_hotpotqa.py b/examples/ags/experiments/baselines/cot_hotpotqa.py index e9f5592de..11a919f22 100644 --- a/examples/ags/experiments/baselines/cot_hotpotqa.py +++ b/examples/ags/experiments/baselines/cot_hotpotqa.py @@ -1,3 +1,7 @@ +import sys +sys.path = ['H:\Hack\MetaGPT-MathAI'] + sys.path # 不然找不到根目录的模块 +# print(sys.path) + from examples.ags.scripts.operator import Operator from examples.ags.scripts.graph import SolveGraph from examples.ags.benchmark.hotpotqa import hotpotqa_evaluation @@ -9,29 +13,31 @@ from pydantic import BaseModel, Field from typing import Tuple HOTPOTQA_PROMPT = """ -问题: {question} - -上下文: -{context} - -请一步步思考,并在最后给出你的答案和支持性句子。使用XML标签包裹内容。 +Think step by step and solve the problem. +1. In the "thought" field, explain your thinking process in detail. +2. In the "answer" field, provide the final answer concisely and clearly. The answer should be a direct response to the question, without including explanations or reasoning. 
+Question: {question}
+The relevant context: {context}
 """

 class GenerateOp(BaseModel):
-    answer: str = Field(default="", description="问题的答案")
+    thought: str = Field(default="", description="The step by step thinking process")
+    answer: str = Field(default="", description="The final answer to the question")

 class CoTGenerate(Operator):
     def __init__(self, llm: LLM, name: str = "Generate"):
         super().__init__(name, llm)

     async def __call__(self, question: str, context: str, mode: str = None) -> Tuple[str, str]:
+        thought = ""
         prompt = HOTPOTQA_PROMPT.format(question=question, context=context)
         fill_kwargs = {"context": prompt, "llm": self.llm}
         if mode:
             fill_kwargs["mode"] = mode
         node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
         response = node.instruct_content.model_dump()
-        return response["answer"]
+
+        return response

 class CoTSolveGraph(SolveGraph):
     def __init__(self, name: str, llm_config, dataset: str):
@@ -40,17 +46,21 @@ class CoTSolveGraph(SolveGraph):

     async def __call__(self, question: str, context: str) -> Tuple[str, str]:
         answer = await self.cot_generate(question, context, mode="context_fill")
-        return answer, self.llm.cost_manager.total_cost
+        return answer["answer"], self.llm.cost_manager.total_cost

 if __name__ == "__main__":
     async def main():
+        # llm_config = ModelsConfig.default().get("deepseek-chat")
         llm_config = ModelsConfig.default().get("gpt-4o-mini")
         # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+
         graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="HotpotQA")
-        file_path = "examples/ags/data/hotpotqa.jsonl"
-        samples = 50 # TODO 选择前1000条跑实验
-        path = "examples/ags/data/baselines/general/hotpotqa"
-        score = await hotpotqa_evaluation(graph, file_path, samples, path)
+
+        file_path = "examples/ags/data/hotpotqa.jsonl"  # relative path issue, fix this later
+        samples = 10  # 250 for validation, 1000 for test
+        path = "examples/ags/data/baselines/general/hotpotqa"  # relative path issue, fix this later
+
+        score = await hotpotqa_evaluation(graph, file_path, samples, path, test=False)
         return score

     import asyncio
diff --git a/examples/ags/experiments/baselines/cot_humaneval.py b/examples/ags/experiments/baselines/cot_humaneval.py
index 27d5f8801..e261a57a0 100644
--- a/examples/ags/experiments/baselines/cot_humaneval.py
+++ b/examples/ags/experiments/baselines/cot_humaneval.py
@@ -8,19 +8,21 @@ from metagpt.llm import LLM
 from pydantic import BaseModel, Field

 HUMANEVAL_PROMPT_GPT = """
-{question}\nPlease reason step by step, and put your python function in the end.
+{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases.
 """

+# TODO: code_fill mode is hard to support here; a version adapted from the earlier code is needed before this can run
+
 class GenerateOp(BaseModel):
-    solution: str = Field(default="", description="问题的Python函数实现")
+    solution: str = Field(default="", description="Python Solution For This Question.")

 class CoTGenerate(Operator):
     def __init__(self, llm: LLM, name: str = "Generate"):
         super().__init__(name, llm)

-    async def __call__(self, problem, mode: str = None):
+    async def __call__(self, problem, function_name, mode: str = None):
         prompt = HUMANEVAL_PROMPT_GPT.format(question=problem)
-        fill_kwargs = {"context": prompt, "llm": self.llm}
+        fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name}
         if mode:
             fill_kwargs["mode"] = mode
         node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
@@ -32,19 +34,20 @@ class CoTSolveGraph(SolveGraph):
         super().__init__(name, llm_config, dataset)
         self.cot_generate = CoTGenerate(self.llm)

-    async def __call__(self, problem):
-        solution = await self.cot_generate(problem, mode="code_fill")
-        return solution["solution"]
+    async def __call__(self, problem, function_name):
+        solution = await self.cot_generate(problem, function_name, mode="code_fill")
+        return solution["solution"], self.llm.cost_manager.total_cost

 if __name__ == "__main__":
     async def main():
         # llm_config = ModelsConfig.default().get("gpt-4o-mini")
-        llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        llm_config = ModelsConfig.default().get("deepseek-chat")
         graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="HumanEval")
-        file_path = "examples/ags/data/human-eval-new.jsonl"
-        samples = 131 # 33/131
+        file_path = "examples/ags/data/human-eval.jsonl"
+        samples = 33 # 33/131
         path = "examples/ags/data/baselines/general/humaneval"
-        score = await humaneval_evaluation(graph, file_path, samples, path)
+        score = await humaneval_evaluation(graph, file_path, samples, path, test=True)
         return score

     import asyncio
diff --git a/examples/ags/experiments/baselines/cot_math.py b/examples/ags/experiments/baselines/cot_math.py
index 2c1df5cf7..f58862578 100644
--- a/examples/ags/experiments/baselines/cot_math.py
+++ b/examples/ags/experiments/baselines/cot_math.py
@@ -8,12 +8,8 @@ from metagpt.llm import LLM
 from pydantic import BaseModel, Field
 from typing import Dict, Any

-MATH_PROMPT_GPT = """
-{question}\nPlease reason step by step, and put your final answer in the end. Wrap content using xml tags.
-"""
-
-MATH_PROMPT_DS = """
-{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
+GENERATE_COT_PROMPT = """
+{question}\nPlease reason step by step. At the end, provide the final answer in the format "\\boxed{{<answer>}}", where <answer> is a math answer (an expression or number), without any additional information or explanation.
""" class GenerateOp(BaseModel): @@ -24,7 +20,7 @@ class CoTGenerate(Operator): super().__init__(name, llm) async def __call__(self, problem, mode: str = None): - prompt = MATH_PROMPT_GPT.format(question=problem) + prompt = GENERATE_COT_PROMPT.format(question=problem) fill_kwargs = {"context": prompt, "llm": self.llm} if mode: fill_kwargs["mode"] = mode @@ -47,27 +43,12 @@ if __name__ == "__main__": llm_config = ModelsConfig.default().get("gpt-4o-mini") # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106") graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K") - file_path = "examples/ags/data/math.jsonl" - samples = 100 - # samples = 100 + file_path = "examples/ags/data/math_test.jsonl" + # samples = None + samples = 0 path = "examples/ags/data/baselines/general/math" - score = await math_evaluation(graph, file_path, samples, path) + score = await math_evaluation(graph, file_path, samples, path,test=True) return score import asyncio - asyncio.run(main()) - - -# self consistency operator; universal self consistency; - -# IO指的没有任何Trick,看LLM自身的一个效果。使用 model 发布者在对应的 dataset 使用的 prompt。 - -# deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106 - - - -GENERATE_PROMPT = """ -Generate Solution for the following problem: {problem_description} -""" - -# med ensemble \ No newline at end of file + asyncio.run(main()) \ No newline at end of file diff --git a/examples/ags/experiments/baselines/cot_mbpp.py b/examples/ags/experiments/baselines/cot_mbpp.py index 9db817fbe..37cc3b7f1 100644 --- a/examples/ags/experiments/baselines/cot_mbpp.py +++ b/examples/ags/experiments/baselines/cot_mbpp.py @@ -8,20 +8,20 @@ from metagpt.llm import LLM from pydantic import BaseModel, Field from typing import Tuple -MBPP_PROMPT = """ -{question}\nPlease reason step by step, and put your python function in the end. 
-"""
+MBPP_PROMPT_COT = """
+{question}\nPlease provide a step-by-step explanation in text, followed by your Python function; ensure the output code is self-contained, meaning it should have the correct function name and return statement, without any additional text."""
+
 class GenerateOp(BaseModel):
-    solution: str = Field(default="", description="问题的Python函数实现")
+    solution: str = Field(default="", description="Python Solution For This Question.")

 class CoTGenerate(Operator):
     def __init__(self, llm: LLM, name: str = "Generate"):
         super().__init__(name, llm)

-    async def __call__(self, question: str, mode: str = None) -> Tuple[str, str]:
-        prompt = MBPP_PROMPT.format(question=question)
-        fill_kwargs = {"context": prompt, "llm": self.llm}
+    async def __call__(self, problem, function_name, mode: str = None):
+        prompt = MBPP_PROMPT_COT.format(question=problem)
+        fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name}
         if mode:
             fill_kwargs["mode"] = mode
         node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
@@ -33,19 +33,19 @@ class CoTSolveGraph(SolveGraph):
         super().__init__(name, llm_config, dataset)
         self.cot_generate = CoTGenerate(self.llm)

-    async def __call__(self, question: str) -> Tuple[str, str]:
-        response = await self.cot_generate(question, mode="context_fill")
-        return response["solution"]
+    async def __call__(self, question: str, entry_point) -> Tuple[str, str]:
+        solution = await self.cot_generate(question, entry_point, mode="code_fill")
+        return solution["solution"], self.llm.cost_manager.total_cost

 if __name__ == "__main__":
     async def main():
         llm_config = ModelsConfig.default().get("gpt-4o-mini")
         # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
         graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="MBPP")
-        file_path = "examples/ags/data/mbpp-new.jsonl"
-        samples = 30
+        file_path = "examples/ags/data/mbpp-new-new.jsonl"
+        samples = 86
         path = "examples/ags/data/baselines/general/mbpp"
-        score = await mbpp_evaluation(graph, file_path, samples, path)
+        score = await mbpp_evaluation(graph, file_path, samples, path, test=True)
         return score

     import asyncio
diff --git a/examples/ags/experiments/baselines/io.py b/examples/ags/experiments/baselines/io.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/ags/experiments/baselines/io_gsm8k.py b/examples/ags/experiments/baselines/io_gsm8k.py
new file mode 100644
index 000000000..fa22253ca
--- /dev/null
+++ b/examples/ags/experiments/baselines/io_gsm8k.py
@@ -0,0 +1,62 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.gsm8k import gsm8k_evaluation
+from examples.ags.scripts.operator_an import GenerateOp
+from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Dict, Any
+
+GSM8K_PROMPT_IO = """
+{question}\nGenerate an answer to this question. At the end, provide the final answer in the format "Answer is <answer>", where <answer> is a single number.
+"""
+
+
+class GenerateOp(BaseModel):
+    solution: str = Field(default="", description="solution for the problem")
+
+class Generate(Operator):
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, mode: str = None):
+        prompt = GSM8K_PROMPT_IO.format(question=problem)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+        return response
+
+class GenerateSolveGraph(SolveGraph):
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.generate = Generate(self.llm)
+
+    async def __call__(self, problem):
+        solution = await self.generate(problem, mode="context_fill")
+        return solution, self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    async def main():
+        # llm_config = ModelsConfig.default().get("deepseek-coder")
+        llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        graph = GenerateSolveGraph(name="Generate", llm_config=llm_config, dataset="Gsm8K")
+        file_path = "examples/ags/data/gsm8k.jsonl"
+        samples = 1219
+        path = "examples/ags/data/baselines/general"
+        score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=True)
+        return score, cost
+
+    import asyncio
+    asyncio.run(main())
+
+
+# medprompt operator; universal self consistency;
+
+# IO means no tricks at all: it measures the LLM's own performance, using the prompt the model's publisher used for the corresponding dataset.
+
+# deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106
+
+# med ensemble
\ No newline at end of file
diff --git a/examples/ags/experiments/baselines/io_hotpotqa.py b/examples/ags/experiments/baselines/io_hotpotqa.py
new file mode 100644
index 000000000..40ee4f7c0
--- /dev/null
+++ b/examples/ags/experiments/baselines/io_hotpotqa.py
@@ -0,0 +1,67 @@
+import sys
+sys.path = ['H:\Hack\MetaGPT-MathAI'] + sys.path  # otherwise modules under the repo root cannot be found
+# print(sys.path)
+
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.hotpotqa import hotpotqa_evaluation
+from examples.ags.scripts.operator_an import GenerateOp
+from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Tuple
+
+HOTPOTQA_PROMPT = """
+Given a question and a context, please answer the question.
+1. In the "thought" field, explain your thinking process.
+2. In the "answer" field, provide the final answer concisely and clearly. The answer should be a direct response to the question, without including explanations or reasoning.
+Question: {question}
+The relevant context: {context}
+"""
+
+class GenerateOp(BaseModel):
+    thought: str = Field(default="", description="The step by step thinking process")
+    answer: str = Field(default="", description="The final answer to the question")
+
+class IOGenerate(Operator):
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, question: str, context: str, mode: str = None) -> Tuple[str, str]:
+        thought = ""
+        prompt = HOTPOTQA_PROMPT.format(question=question, context=context)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+
+        return response
+
+class IOSolveGraph(SolveGraph):
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate = IOGenerate(self.llm)
+
+    async def __call__(self, question: str, context: str) -> Tuple[str, str]:
+        answer = await self.cot_generate(question, context, mode="context_fill")
+        return answer["answer"], self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    async def main():
+        # llm_config = ModelsConfig.default().get("deepseek-chat")
+        llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+
+        graph = IOSolveGraph(name="IO", llm_config=llm_config, dataset="HotpotQA")
+
+        file_path = "examples/ags/data/hotpotqa.jsonl"  # relative path issue, fix this later
+        samples = 250 # 250 for validation, 1000 for test
+        path = "examples/ags/data/baselines/general/hotpotqa"  # relative path issue, fix this later
+
+        score = await hotpotqa_evaluation(graph, file_path, samples, path, test=True)
+        return score
+
+    import asyncio
+    asyncio.run(main())
\ No newline at end of file
diff --git a/examples/ags/experiments/baselines/io_humaneval.py b/examples/ags/experiments/baselines/io_humaneval.py
new file mode 100644
index 000000000..a6906a8d6
--- /dev/null
+++ b/examples/ags/experiments/baselines/io_humaneval.py
@@ -0,0 +1,54 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.humaneval import humaneval_evaluation
+from examples.ags.scripts.operator_an import GenerateOp
+from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+
+HUMANEVAL_PROMPT_IO = """
+{question}\nGenerate an answer to this question, without any additional test cases.
+"""
+
+# TODO: code_fill mode is hard to support here; a version adapted from the earlier code is needed before this can run
+
+class GenerateOp(BaseModel):
+    solution: str = Field(default="", description="Python Solution For This Question.")
+
+class Generate(Operator):
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, function_name, mode: str = None):
+        prompt = HUMANEVAL_PROMPT_IO.format(question=problem)
+        fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+        return response
+
+class IOSolveGraph(SolveGraph):
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate = Generate(self.llm)
+
+    async def __call__(self, problem, function_name):
+        solution = await self.cot_generate(problem, function_name, mode="code_fill")
+        return solution["solution"], self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    async def main():
+        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        llm_config = ModelsConfig.default().get("deepseek-chat")
+        graph = IOSolveGraph(name="Io", llm_config=llm_config, dataset="HumanEval")
+        file_path = "examples/ags/data/human-eval.jsonl"
+        samples = 33 # 33/131
+        path = "examples/ags/data/baselines/general/humaneval"
+        score = await humaneval_evaluation(graph, file_path, samples, path, test=True)
+        return score
+
+    import asyncio
+    asyncio.run(main())
diff --git a/examples/ags/experiments/baselines/io_math.py b/examples/ags/experiments/baselines/io_math.py
new file mode 100644
index 000000000..532c210bf
--- /dev/null
+++ b/examples/ags/experiments/baselines/io_math.py
@@ -0,0 +1,54 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.math import math_evaluation
+from examples.ags.scripts.operator_an import GenerateOp
+from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Dict, Any
+
+GENERATE_IO_PROMPT = """
+{question}\nPlease generate a solution for the problem. At the end, provide the final answer in the format "\\boxed{{<answer>}}", where <answer> is a math answer (an expression or number), without any additional information or explanation.
+""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="solution for the problem") + +class IOGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, mode: str = None): + prompt = GENERATE_IO_PROMPT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class IOSolveGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.cot_generate = IOGenerate(self.llm) + + async def __call__(self, problem): + solution = await self.cot_generate(problem, mode="context_fill") + return solution, self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + # llm_config = ModelsConfig.default().get("deepseek-coder") + llm_config = ModelsConfig.default().get("gpt-4o-mini") + # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106") + graph = IOSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K") + file_path = "examples/ags/data/math_test.jsonl" #486 + # samples = None + samples = 0 + path = "examples/ags/data/baselines/general/math" + score = await math_evaluation(graph, file_path, samples, path,test=True) + return score + + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/examples/ags/experiments/baselines/io_mbpp.py b/examples/ags/experiments/baselines/io_mbpp.py new file mode 100644 index 000000000..28a6ab497 --- /dev/null +++ b/examples/ags/experiments/baselines/io_mbpp.py @@ -0,0 +1,55 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.mbpp import mbpp_evaluation +from examples.ags.scripts.operator_an import GenerateOp +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field + +MBPP_PROMPT_IO = """ +{question}\nGenerate an answer to this question, ensure the output code is self-contained, meaning it should have the correct function name and return statement, but without any additional test cases. 
+""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="Python Solution For This Question.") + +class Generate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, function_name, mode: str = None): + prompt = MBPP_PROMPT_IO.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class IOSolveGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.cot_generate = Generate(self.llm) + + async def __call__(self, problem, function_name): + solution = await self.cot_generate(problem, function_name, mode="code_fill") + return solution["solution"], self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + # llm_config = ModelsConfig.default().get("deepseek-chat") + # llm_config = ModelsConfig.default().get("gpt-35-turbo") + graph = IOSolveGraph(name="Io", llm_config=llm_config, dataset="MBPP") + # result = await graph("Write a function to round every number of a given list of numbers and print the total sum multiplied by the length of the list.\n\ndef round_and_sum(list1):", "round_and_sum") + # print(result) + + file_path = "examples/ags/data/mbpp-new-new.jsonl" + samples = 86 # 86/341 + path = "examples/ags/data/baselines/general/mbpp" + score = await mbpp_evaluation(graph, file_path, samples, path, test=True) + return score + + import asyncio + asyncio.run(main()) diff --git a/examples/ags/experiments/baselines/medprompt.py b/examples/ags/experiments/baselines/medprompt.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/ags/experiments/baselines/medprompt_gsm8k.py b/examples/ags/experiments/baselines/medprompt_gsm8k.py index ea07cddc3..e6c903590 100644 --- a/examples/ags/experiments/baselines/medprompt_gsm8k.py +++ b/examples/ags/experiments/baselines/medprompt_gsm8k.py @@ -63,7 +63,7 @@ class MdEnsemble(Operator): Link: https://arxiv.org/abs/2311.16452 """ - def __init__(self, name: str = "MdEnsemble", llm: LLM = LLM(), vote_count: int = 3): + def __init__(self, llm: LLM, name: str = "MdEnsemble", vote_count: int = 3): super().__init__(name, llm) self.vote_count = vote_count @@ -107,11 +107,11 @@ class MedPromptGraph(SolveGraph): def __init__(self, name: str, llm_config, dataset: str, vote_count: int = 3): super().__init__(name, llm_config, dataset) self.cot_generate = CoTGenerate(self.llm) - self.md_ensemble = MdEnsemble(self.llm, vote_count=vote_count) + self.md_ensemble = MdEnsemble(llm=self.llm, vote_count=vote_count) async def __call__(self, problem): solutions = [] - for i in range(2): + for i in range(3): solution = await self.cot_generate(problem, mode="context_fill") solutions.append(solution["solution"]) solution = await self.md_ensemble(solutions, problem, mode="context_fill") @@ -119,10 +119,10 @@ class MedPromptGraph(SolveGraph): if __name__ == "__main__": async def main(): - llm_config = ModelsConfig.default().get("deepseek-coder") - # llm_config = ModelsConfig.default().get("gpt-4o-mini") + # llm_config = ModelsConfig.default().get("deepseek-coder") + llm_config = ModelsConfig.default().get("gpt-4o-mini") # llm_config = 
ModelsConfig.default().get("gpt-35-turbo-1106") - graph = MedPromptGraph(name="MedPrompt", llm_config=llm_config, dataset="Gsm8K", vote_count=2) + graph = MedPromptGraph(name="MedPrompt", llm_config=llm_config, dataset="Gsm8K", vote_count=5) file_path = "examples/ags/data/gsm8k.jsonl" samples = 264 path = "examples/ags/data/baselines/general" diff --git a/examples/ags/experiments/baselines/medprompt_humaneval.py b/examples/ags/experiments/baselines/medprompt_humaneval.py new file mode 100644 index 000000000..8c8952718 --- /dev/null +++ b/examples/ags/experiments/baselines/medprompt_humaneval.py @@ -0,0 +1,124 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.humaneval import humaneval_evaluation +from examples.ags.scripts.operator_an import GenerateOp +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import Dict, Any, List, Tuple +from collections import Counter + +import random + +HUMANEVAL_PROMPT_GPT = """ +{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases. +""" + +MD_ENSEMBLE_PROMPT = """ +Given the question described as follows: {question} +Several solutions have been generated to address the given question. They are as follows: +{solutions} + +Carefully evaluate these solutions and identify the solution that is more capable of solving the problem compared to other solutions, as this is crucial for problem-solving. + +In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the solution. Do not include any additional text or explanation in the "solution_letter" field. +""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="Python Solution For This Question.") + +class CoTGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, function_name, mode: str = None): + prompt = HUMANEVAL_PROMPT_GPT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + + +class MdEnsembleOp(BaseModel): + thought: str = Field( + default="", + description="Step-by-step analysis of the solutions to determine the best one.", + ) + solution_letter: str = Field(default="", description="The letter of the chosen best solution (only one letter).") + + +class MdEnsemble(Operator): + """ + Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? 
Case Study in Medicine + Link: https://arxiv.org/abs/2311.16452 + """ + + def __init__(self, llm: LLM, name: str = "MdEnsemble", vote_count: int = 5): + super().__init__(name, llm) + self.vote_count = vote_count + + @staticmethod + def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]: + shuffled_solutions = solutions.copy() + random.shuffle(shuffled_solutions) + answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)} + return shuffled_solutions, answer_mapping + + async def __call__(self, solutions: List[str], problem: str, mode: str = None): + print(f"solution count: {len(solutions)}") + all_responses = [] + + for _ in range(self.vote_count): + shuffled_solutions, answer_mapping = self.shuffle_answers(solutions) + + solution_text = "" + for index, solution in enumerate(shuffled_solutions): + solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n" + + prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(MdEnsembleOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + + answer = response.get("solution_letter", "A") + answer = answer.strip().upper() + + if answer in answer_mapping: + original_index = answer_mapping[answer] + all_responses.append(original_index) + + most_frequent_index = Counter(all_responses).most_common(1)[0][0] + final_answer = solutions[most_frequent_index] + return {"solution": final_answer} + +class MedPromptGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str, vote_count: int = 5): + super().__init__(name, llm_config, dataset) + self.cot_generate = CoTGenerate(self.llm) + self.md_ensemble = MdEnsemble(self.llm, vote_count=vote_count) + + async def __call__(self, problem, function_name): + solutions = [] + for i in range(3): + solution = await self.cot_generate(problem, function_name, mode="code_fill") + solutions.append(solution["solution"]) + solution = await self.md_ensemble(solutions, problem, mode="context_fill") + return solution["solution"], self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + graph = MedPromptGraph(name="MedPrompt", llm_config=llm_config, dataset="HumanEval", vote_count=5) + file_path = "examples/ags/data/human-eval.jsonl" + samples = 33 + path = "examples/ags/data/baselines/general/humaneval" + score, cost = await humaneval_evaluation(graph, file_path, samples, path, test=True) + return score, cost + + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/examples/ags/experiments/baselines/medprompt_math.py b/examples/ags/experiments/baselines/medprompt_math.py new file mode 100644 index 000000000..5b24622cd --- /dev/null +++ b/examples/ags/experiments/baselines/medprompt_math.py @@ -0,0 +1,129 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.math import math_evaluation +from examples.ags.scripts.operator_an import GenerateOp +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import Dict, Any, List, Tuple +from collections import Counter + +import random + +GENERATE_COT_PROMPT = """ +{question}\nPlease reason step by step. 
At the end, provide the final answer in the format "\\boxed{{}}", where is a math answer(an expression or number), without any additional information or explanation. +""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="solution for the problem") + +class CoTGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, mode: str = None): + prompt = GENERATE_COT_PROMPT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + + +MD_ENSEMBLE_PROMPT = """ +You are given a problem: +{question} + +Here is a list of possible solutions to the problem: +{solutions} + +Using the inputs above, your goal is to choose the best solution to the problem. +The main consideration is that the solution can fully solve the problem in a correct and robust manner. +Provide your final decision by writing the chosen solution letter. + +Please follow the required format in your response. +""" + +class MdEnsembleOp(BaseModel): + thought: str = Field( + default="", + description="Step-by-step analysis of the solutions to determine the best one.", + ) + solution_letter: str = Field(default="", description="The letter of the chosen best solution (only one letter).") + + +class MdEnsemble(Operator): + """ + Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine + Link: https://arxiv.org/abs/2311.16452 + """ + + def __init__(self, name: str = "MdEnsemble", llm: LLM = LLM(), vote_count: int = 3): + super().__init__(name, llm) + self.vote_count = vote_count + + @staticmethod + def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]: + shuffled_solutions = solutions.copy() + random.shuffle(shuffled_solutions) + answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)} + return shuffled_solutions, answer_mapping + + async def __call__(self, solutions: List[str], problem: str, mode: str = None): + print(f"solution count: {len(solutions)}") + all_responses = [] + + for _ in range(self.vote_count): + shuffled_solutions, answer_mapping = self.shuffle_answers(solutions) + + solution_text = "" + for index, solution in enumerate(shuffled_solutions): + solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n" + + prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(MdEnsembleOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + + answer = response.get("solution_letter", "") + answer = answer.strip().upper() + + if answer in answer_mapping: + original_index = answer_mapping[answer] + all_responses.append(original_index) + + most_frequent_index = Counter(all_responses).most_common(1)[0][0] + final_answer = solutions[most_frequent_index] + return {"solution": final_answer} + +class MedPromptGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str, vote_count: int = 3): + super().__init__(name, llm_config, dataset) + self.cot_generate = CoTGenerate(llm=self.llm) + self.md_ensemble = MdEnsemble(llm=self.llm, vote_count=vote_count) + + async def __call__(self, problem): + solutions = [] + for i in range(2): + solution = 
await self.cot_generate(problem, mode="context_fill") + solutions.append(solution["solution"]) + solution = await self.md_ensemble(solutions, problem, mode="context_fill") + return solution, self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + graph = MedPromptGraph(name="MedPrompt", llm_config=llm_config, dataset="Gsm8K", vote_count=2) + file_path = "examples/ags/data/math_test.jsonl" + # samples = None + samples = 0 + path = "examples/ags/data/baselines/general/math" + score = await math_evaluation(graph, file_path, samples, path,test=True) + return score + + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/examples/ags/experiments/baselines/medpromt_mbpp.py b/examples/ags/experiments/baselines/medpromt_mbpp.py new file mode 100644 index 000000000..b3db3b447 --- /dev/null +++ b/examples/ags/experiments/baselines/medpromt_mbpp.py @@ -0,0 +1,127 @@ +from examples.ags.benchmark.mbpp import mbpp_evaluation +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.scripts.operator_an import GenerateOp +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import Dict, Any, List, Tuple +from collections import Counter +import random + +MBPP_PROMPT_COT = """ +{question}\nPlease provide a step-by-step explanation in text, followed by your Python function, ensure the output code is self-contained, meaning it should have the correct function name and return statement, without any additional text.""" + + +MD_ENSEMBLE_PROMPT = """ +Given the question described as follows: {question} +Several solutions have been generated to address the given question. They are as follows: +{solutions} + +Carefully evaluate these solutions and identify the solution that is more capable of solving the problem compared to other solutions, as this is crucial for problem-solving. + +In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the solution. Do not include any additional text or explanation in the "solution_letter" field. +""" + + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="Python Solution For This Question.") + +class CoTGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, function_name, mode: str = None): + prompt = MBPP_PROMPT_COT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class MdEnsembleOp(BaseModel): + thought: str = Field( + default="", + description="Step-by-step analysis of the solutions to determine the best one.", + ) + solution_letter: str = Field(default="", description="The letter of the chosen best solution (only one letter).") + + +class MdEnsemble(Operator): + """ + Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? 
Case Study in Medicine + Link: https://arxiv.org/abs/2311.16452 + """ + + def __init__(self, llm: LLM, name: str = "MdEnsemble", vote_count: int = 5): + super().__init__(name, llm) + self.vote_count = vote_count + + @staticmethod + def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]: + shuffled_solutions = solutions.copy() + random.shuffle(shuffled_solutions) + answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)} + return shuffled_solutions, answer_mapping + + async def __call__(self, solutions: List[str], problem: str, mode: str = None): + print(f"solution count: {len(solutions)}") + all_responses = [] + + for _ in range(self.vote_count): + shuffled_solutions, answer_mapping = self.shuffle_answers(solutions) + + solution_text = "" + for index, solution in enumerate(shuffled_solutions): + solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n" + + prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(MdEnsembleOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + + answer = response.get("solution_letter", "A") + answer = answer.strip().upper() + + if answer in answer_mapping: + original_index = answer_mapping[answer] + all_responses.append(original_index) + + most_frequent_index = Counter(all_responses).most_common(1)[0][0] + final_answer = solutions[most_frequent_index] + return {"solution": final_answer} + +class MedPromptGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str, vote_count: int = 5): + super().__init__(name, llm_config, dataset) + self.cot_generate = CoTGenerate(self.llm) + self.md_ensemble = MdEnsemble(self.llm, vote_count=vote_count) + + async def __call__(self, problem, function_name): + solutions = [] + for i in range(3): + solution = await self.cot_generate(problem, function_name, mode="code_fill") + solutions.append(solution["solution"]) + solution = await self.md_ensemble(solutions, problem, mode="context_fill") + return solution["solution"], self.llm.cost_manager.total_cost + + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + # llm_config = ModelsConfig.default().get("deepseek-chat") + # llm_config = ModelsConfig.default().get("gpt-35-turbo") + graph = MedPromptGraph(name="MedPrompt", llm_config=llm_config, dataset="MBPP") + file_path = "examples/ags/data/mbpp-new-new.jsonl" + samples = 86 # 86/341 + path = "examples/ags/data/baselines/general/mbpp" + score = await mbpp_evaluation(graph, file_path, samples, path, test=True) + return score + + import asyncio + asyncio.run(main()) + diff --git a/examples/ags/experiments/baselines/multi_persona_gsm8k.py b/examples/ags/experiments/baselines/multi_persona_gsm8k.py index 6737c9405..41152d13d 100644 --- a/examples/ags/experiments/baselines/multi_persona_gsm8k.py +++ b/examples/ags/experiments/baselines/multi_persona_gsm8k.py @@ -22,7 +22,7 @@ FINAL_DECISION_PROMPT = """ Considering all the thinking processes and answers: {all_thinking} {all_answers} -Please reason carefully and provide the final answer. To ensure accuracy, only provide the answer in the solution, without any steps. +Please reason carefully and provide the final answer. 
To ensure accuracy, At the end, provide the final answer in solution field with the format "Answer is ", where is a single number, without any additional information or explanation. """ class DebateOp(BaseModel): @@ -73,10 +73,9 @@ class MultiPersonaGraph(SolveGraph): self.debate_agents = [ DebateAgent(self.llm, f"Debate Agent {i}", role) for i, role in enumerate([ - 'Math Competition Champion', - 'Elementary School Math Teacher', - 'Math Professor', - 'Computer Scientist' + 'Innovative Math Thinker - Math PhD', + 'Critical Reasoning Expert - Math Professor', + 'Computational Thinking Specialist - Math And Computer Science Researcher' ]) ] self.final_decision_agent = FinalDecisionAgent(self.llm) @@ -107,12 +106,12 @@ class MultiPersonaGraph(SolveGraph): if __name__ == "__main__": async def main(): - llm_config = ModelsConfig.default().get("deepseek-coder") + llm_config = ModelsConfig.default().get("gpt-4o-mini") graph = MultiPersonaGraph(name="multi-persona", llm_config=llm_config, dataset="Gsm8K") file_path = "examples/ags/data/gsm8k.jsonl" - samples = 1 + samples = 264 path = "examples/ags/data/baselines/general" - score, cost = await gsm8k_evaluation(graph, file_path, samples, path) + score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=True) return score, cost import asyncio diff --git a/examples/ags/experiments/baselines/multi_persona_hotpotqa.py b/examples/ags/experiments/baselines/multi_persona_hotpotqa.py new file mode 100644 index 000000000..76288eab9 --- /dev/null +++ b/examples/ags/experiments/baselines/multi_persona_hotpotqa.py @@ -0,0 +1,133 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.hotpotqa import hotpotqa_evaluation +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import List + +DEBATE_INITIAL_PROMPT = """ +Given a question and context, please think step by step and then solve this task. + +Question: {question} +Context: {relevant_context} +""" + +DEBATE_PROMPT = """ +Given a question and context, + +Question: {question} +Context: {relevant_context} + +Considering the solutions provided by other agents as additional suggestions. Please think carefully and provide an updated answer. +""" + +FINAL_DECISION_PROMPT = """ +Given a question and context, + +Question: {question} +Context: {relevant_context} + +Considering all the thinking processes and answers: +{all_thinking} +{all_answers} +Please reason carefully and provide the final answer. Give the final answer in solution field. You MUST Keep the answer very concise in a few words, without any additional information. +""" + +class DebateOp(BaseModel): + thinking: str = Field(default="", description="thinking process") + answer: str = Field(default="", description="answer") + +class FinalDecisionOp(BaseModel): + thinking: str = Field(default="", description="final thinking process") + solution: str = Field(default="", description="final answer") + +class DebateAgent(Operator): + def __init__(self, llm: LLM, name: str, role: str): + super().__init__(name, llm) + self.role = role + + async def __call__(self, question: str, relevant_context: str, context: List[str] = None, mode: str = None): + role_prompt = f"You are a {self.role}. 
Based on your professional knowledge and thinking style,"
+        if context is None:
+            prompt = role_prompt + DEBATE_INITIAL_PROMPT.format(question=question, relevant_context=relevant_context)
+        else:
+            prompt = role_prompt + DEBATE_PROMPT.format(question=question, relevant_context=relevant_context) + "\n".join(context)
+
+
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(DebateOp).fill(**fill_kwargs)
+        return node.instruct_content.model_dump()
+
+class FinalDecisionAgent(Operator):
+    def __init__(self, llm: LLM, name: str = "FinalDecision"):
+        super().__init__(name, llm)
+
+    async def __call__(self, question: str, relevant_context: str, all_thinking: List[str], all_answers: List[str], mode: str = None):
+        prompt = FINAL_DECISION_PROMPT.format(
+            question = question,
+            relevant_context = relevant_context,
+            all_thinking="\n".join(all_thinking),
+            all_answers="\n".join(all_answers)
+        )
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(FinalDecisionOp).fill(**fill_kwargs)
+        return node.instruct_content.model_dump()
+
+class MultiPersonaGraph(SolveGraph):
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.debate_agents = [
+            DebateAgent(self.llm, f"Debate Agent {i}", role)
+            for i, role in enumerate([
+                'Comprehensive Knowledge Maven - Information Scientist',
+                'Analytical Insight Specialist - Cognitive Psychologist',
+                'Fact Verification Expert - Data Analyst'
+            ])
+        ]
+        self.final_decision_agent = FinalDecisionAgent(self.llm)
+
+    async def __call__(self, question, relevant_context):
+        max_round = 2
+        all_thinking = [[] for _ in range(max_round)]
+        all_answers = [[] for _ in range(max_round)]
+
+        for r in range(max_round):
+            for i, agent in enumerate(self.debate_agents):
+                if r == 0:
+                    result = await agent(question, relevant_context, mode="context_fill")
+                else:
+                    context = [f"{agent.role}'s previous round thinking: {all_thinking[r-1][i]}"] + \
+                              [f"{self.debate_agents[j].role}'s thinking: {all_thinking[r-1][j]}" for j in range(len(self.debate_agents)) if j != i]
+                    result = await agent(question, relevant_context, context, mode="context_fill")
+                all_thinking[r].append(result["thinking"])
+                all_answers[r].append(result["answer"])
+
+        final_result = await self.final_decision_agent(
+            question,
+            relevant_context,
+            [f"{agent.role}'s final thinking: {thinking}" for agent, thinking in zip(self.debate_agents, all_thinking[-1])],
+            [f"{agent.role}'s final answer: {answer}" for agent, answer in zip(self.debate_agents, all_answers[-1])],
+            mode="context_fill"
+        )
+        return final_result["solution"], self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    async def main():
+        llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        graph = MultiPersonaGraph(name="multi-persona", llm_config=llm_config, dataset="HotpotQA")
+
+        file_path = "examples/ags/data/hotpotqa.jsonl" # relative path is problematic, fix later
+        samples = 250 # 250 for validation, 1000 for test
+        path = "examples/ags/data/baselines/general/hotpotqa" # relative path is problematic, fix later
+
+        score = await hotpotqa_evaluation(graph, file_path, samples, path, test=True)
+        return score
+
+    import asyncio
+    asyncio.run(main())
\ No newline at end of file
diff --git a/examples/ags/experiments/baselines/multi_persona_humaneval.py b/examples/ags/experiments/baselines/multi_persona_humaneval.py
new file mode 100644
index 000000000..b10581014
--- /dev/null
+++ 
b/examples/ags/experiments/baselines/multi_persona_humaneval.py @@ -0,0 +1,117 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.humaneval import humaneval_evaluation +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import List + +DEBATE_INITIAL_PROMPT = """ +{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases. +""" + +DEBATE_PROMPT = """ +{question} +Considering the solutions provided by other agents as additional suggestions. Please think carefully and provide an updated python function without any additional text or test cases. +""" + +FINAL_DECISION_PROMPT = """ +{question} +Considering all the thinking processes and answers: +{all_thinking} +{all_answers} +Please reason carefully and provide the final answer. Make sure the code output is wrapped with ```python``` without any additional text or test cases. +""" + +class DebateOp(BaseModel): + thinking: str = Field(default="", description="think") + answer: str = Field(default="", description="answer") + +class FinalDecisionOp(BaseModel): + solution: str = Field(default="", description="final answer") + +class DebateAgent(Operator): + def __init__(self, llm: LLM, name: str, role: str): + super().__init__(name, llm) + self.role = role + + async def __call__(self, problem: str, function_name: str, context: List[str] = None, mode: str = None): + role_prompt = f"You are a {self.role}. Based on your professional knowledge and thinking style," + if context is None: + prompt = role_prompt + DEBATE_INITIAL_PROMPT.format(question=problem) + else: + prompt = role_prompt + DEBATE_PROMPT.format(question=problem) + "\n".join(context) + + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name":function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(DebateOp).fill(**fill_kwargs) + return node.instruct_content.model_dump() + +class FinalDecisionAgent(Operator): + def __init__(self, llm: LLM, name: str = "FinalDecision"): + super().__init__(name, llm) + + async def __call__(self, problem: str, function_name, all_thinking: List[str], all_answers: List[str], mode: str = None): + prompt = FINAL_DECISION_PROMPT.format( + question=problem, + all_thinking="\n".join(all_thinking), + all_answers="\n".join(all_answers) + ) + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name":function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(FinalDecisionOp).fill(**fill_kwargs) + return node.instruct_content.model_dump() + +class MultiPersonaGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.debate_agents = [ + DebateAgent(self.llm, f"Debate Agent {i}", role) + for i, role in enumerate([ + 'Innovative CS Thinker - ICPC Competitor', + 'Critical Reasoning Expert - Math Professor', + 'Computational Thinking Specialist - Computer Science Researcher' + ]) + ] + self.final_decision_agent = FinalDecisionAgent(self.llm) + + async def __call__(self, problem, function_name): + max_round = 2 + all_thinking = [[] for _ in range(max_round)] + all_answers = [[] for _ in range(max_round)] + + for r in range(max_round): + for i, agent in enumerate(self.debate_agents): + if r == 0: + result = await 
agent(problem, function_name, mode="context_fill") + else: + context = [f"{agent.role}'s previous round thinking: {all_thinking[r-1][i]}"] + \ + [f"{self.debate_agents[j].role}'s thinking: {all_thinking[r-1][j]}" for j in range(len(self.debate_agents)) if j != i] + result = await agent(problem, context, mode="context_fill") + all_thinking[r].append(result["thinking"]) + all_answers[r].append(result["answer"]) + + final_result = await self.final_decision_agent( + problem, + function_name , + [f"{agent.role}'s final thinking: {thinking}" for agent, thinking in zip(self.debate_agents, all_thinking[-1])], + [f"{agent.role}'s final answer: {answer}" for agent, answer in zip(self.debate_agents, all_answers[-1])], + mode="code_fill" + ) + return final_result['solution'], self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + graph = MultiPersonaGraph(name="multi-persona", llm_config=llm_config, dataset="HumanEval") + file_path = "examples/ags/data/human-eval.jsonl" + samples = 33 + path = "examples/ags/data/baselines/general/humaneval" + score, cost = await humaneval_evaluation(graph, file_path, samples, path, test=True) + return score, cost + + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/examples/ags/experiments/baselines/multi_persona_math.py b/examples/ags/experiments/baselines/multi_persona_math.py new file mode 100644 index 000000000..bf42088dc --- /dev/null +++ b/examples/ags/experiments/baselines/multi_persona_math.py @@ -0,0 +1,122 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.math import math_evaluation +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import List + +DEBATE_INITIAL_PROMPT = """ +{question}\nPlease reason step by step, the reason process can be put in the thinking field. At the end, provide the final answer in the answer field with the format "\\boxed{{}}", where is a math answer(an expression or number), without any additional information or explanation. +Make sure the output is wrapped with correct xml tags! +""" + +DEBATE_PROMPT = """ +{question} +Considering the solutions provided by other agents as additional suggestions, the reason process can be put in the thinking field. Please think carefully and provide an updated answer in the answer field with the format "\\boxed{{}}", where is a math answer(an expression or number), without any additional information or explanation. +Make sure the output is wrapped with correct xml tags! +""" + +FINAL_DECISION_PROMPT = """ +{question} +Considering all the thinking processes and answers: +{all_thinking} +{all_answers} + +The thinking process can be put in the thinking field. +Please reason carefully and provide the final answer in the answer field with the format "\\boxed{{}}", where is a math answer(an expression or number), without any additional information or explanation. +Make sure the output is wrapped with correct xml tags! 
+""" + +class DebateOp(BaseModel): + thinking: str = Field(default="", description="thinking process") + answer: str = Field(default="", description="answer") + +class FinalDecisionOp(BaseModel): + thinking: str = Field(default="", description="final thinking process") + solution: str = Field(default="", description="final answer") + +class DebateAgent(Operator): + def __init__(self, llm: LLM, name: str, role: str): + super().__init__(name, llm) + self.role = role + + async def __call__(self, problem: str, context: List[str] = None, mode: str = None): + role_prompt = f"You are a {self.role}. Based on your professional knowledge and thinking style," + if context is None: + prompt = role_prompt + DEBATE_INITIAL_PROMPT.format(question=problem) + else: + prompt = role_prompt + DEBATE_PROMPT.format(question=problem) + "\n".join(context) + + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(DebateOp).fill(**fill_kwargs) + return node.instruct_content.model_dump() + +class FinalDecisionAgent(Operator): + def __init__(self, llm: LLM, name: str = "FinalDecision"): + super().__init__(name, llm) + + async def __call__(self, problem: str, all_thinking: List[str], all_answers: List[str], mode: str = None): + prompt = FINAL_DECISION_PROMPT.format( + question=problem, + all_thinking="\n".join(all_thinking), + all_answers="\n".join(all_answers) + ) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(FinalDecisionOp).fill(**fill_kwargs) + return node.instruct_content.model_dump() + +class MultiPersonaGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.debate_agents = [ + DebateAgent(self.llm, f"Debate Agent {i}", role) + for i, role in enumerate([ + 'Innovative Math Thinker - Math PhD', + 'Critical Reasoning Expert - Math Professor', + 'Computational Thinking Specialist - Math And Computer Science Researcher' + ]) + ] + self.final_decision_agent = FinalDecisionAgent(self.llm) + + async def __call__(self, problem): + max_round = 2 + all_thinking = [[] for _ in range(max_round)] + all_answers = [[] for _ in range(max_round)] + + for r in range(max_round): + for i, agent in enumerate(self.debate_agents): + if r == 0: + result = await agent(problem, mode="context_fill") + else: + context = [f"{agent.role}'s previous round thinking: {all_thinking[r-1][i]}"] + \ + [f"{self.debate_agents[j].role}'s thinking: {all_thinking[r-1][j]}" for j in range(len(self.debate_agents)) if j != i] + result = await agent(problem, context, mode="context_fill") + all_thinking[r].append(result["thinking"]) + all_answers[r].append(result["answer"]) + + final_result = await self.final_decision_agent( + problem, + [f"{agent.role}'s final thinking: {thinking}" for agent, thinking in zip(self.debate_agents, all_thinking[-1])], + [f"{agent.role}'s final answer: {answer}" for agent, answer in zip(self.debate_agents, all_answers[-1])], + mode="context_fill" + ) + return final_result, self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + graph = MultiPersonaGraph(name="multi-persona", llm_config=llm_config, dataset="MATH") + file_path = "examples/ags/data/math_test.jsonl" + samples = 0 + path = "examples/ags/data/baselines/general/math" + score = await math_evaluation(graph, file_path, samples, path,test=True) + return 
score + + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/examples/ags/experiments/baselines/multi_persona_mbpp.py b/examples/ags/experiments/baselines/multi_persona_mbpp.py new file mode 100644 index 000000000..58eb7aead --- /dev/null +++ b/examples/ags/experiments/baselines/multi_persona_mbpp.py @@ -0,0 +1,118 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.mbpp import mbpp_evaluation +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import List + +DEBATE_INITIAL_PROMPT = """ +{question}\nPlease provide a step-by-step explanation in text, followed by your Python function, ensure the output code is self-contained, meaning it should have the correct function name and return statement, without any additional text.""" + + +DEBATE_PROMPT = """ +{question} +Considering the solutions provided by other agents as additional suggestions. Please think carefully and provide an updated self-contained python function which meaning it should have the correct function name and return statement, but it shouldn't have any additional text or test cases. +""" + +FINAL_DECISION_PROMPT = """ +{question} +Considering all the thinking processes and answers: +{all_thinking} +{all_answers} +Please reason carefully and provide the final answer. Make sure the output code is self-contained, meaning it should have the correct function name and return statement, without any additional text.""" + + +class DebateOp(BaseModel): + thinking: str = Field(default="", description="think") + answer: str = Field(default="", description="answer") + +class FinalDecisionOp(BaseModel): + solution: str = Field(default="", description="final answer") + +class DebateAgent(Operator): + def __init__(self, llm: LLM, name: str, role: str): + super().__init__(name, llm) + self.role = role + + async def __call__(self, problem: str, function_name: str, context: List[str] = None, mode: str = None): + role_prompt = f"You are a {self.role}. 
Based on your professional knowledge and thinking style," + if context is None: + prompt = role_prompt + DEBATE_INITIAL_PROMPT.format(question=problem) + else: + prompt = role_prompt + DEBATE_PROMPT.format(question=problem) + "\n".join(context) + + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name":function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(DebateOp).fill(**fill_kwargs) + return node.instruct_content.model_dump() + +class FinalDecisionAgent(Operator): + def __init__(self, llm: LLM, name: str = "FinalDecision"): + super().__init__(name, llm) + + async def __call__(self, problem: str, function_name, all_thinking: List[str], all_answers: List[str], mode: str = None): + prompt = FINAL_DECISION_PROMPT.format( + question=problem, + all_thinking="\n".join(all_thinking), + all_answers="\n".join(all_answers) + ) + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name":function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(FinalDecisionOp).fill(**fill_kwargs) + return node.instruct_content.model_dump() + +class MultiPersonaGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.debate_agents = [ + DebateAgent(self.llm, f"Debate Agent {i}", role) + for i, role in enumerate([ + 'Innovative CS Thinker - ICPC Competitor', + 'Critical Reasoning Expert - Math Professor', + 'Computational Thinking Specialist - Computer Science Researcher' + ]) + ] + self.final_decision_agent = FinalDecisionAgent(self.llm) + + async def __call__(self, problem, function_name): + max_round = 2 + all_thinking = [[] for _ in range(max_round)] + all_answers = [[] for _ in range(max_round)] + + for r in range(max_round): + for i, agent in enumerate(self.debate_agents): + if r == 0: + result = await agent(problem, function_name, mode="context_fill") + else: + context = [f"{agent.role}'s previous round thinking: {all_thinking[r-1][i]}"] + \ + [f"{self.debate_agents[j].role}'s thinking: {all_thinking[r-1][j]}" for j in range(len(self.debate_agents)) if j != i] + result = await agent(problem, context, mode="context_fill") + all_thinking[r].append(result["thinking"]) + all_answers[r].append(result["answer"]) + + final_result = await self.final_decision_agent( + problem, + function_name , + [f"{agent.role}'s final thinking: {thinking}" for agent, thinking in zip(self.debate_agents, all_thinking[-1])], + [f"{agent.role}'s final answer: {answer}" for agent, answer in zip(self.debate_agents, all_answers[-1])], + mode="code_fill" + ) + return final_result['solution'], self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + graph = MultiPersonaGraph(name="multi-persona", llm_config=llm_config, dataset="MBPP") + file_path = "examples/ags/data/mbpp-new-new.jsonl" + samples = 86 # 86/341 + path = "examples/ags/data/baselines/general/mbpp" + score, cost = await mbpp_evaluation(graph, file_path, samples, path, test=True) + print(f"per cost:{cost/341}") + return score + + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/examples/ags/experiments/baselines/self_consistency_gsm8k.py b/examples/ags/experiments/baselines/self_consistency_gsm8k.py index 1fecf1cc1..3355290fc 100644 --- a/examples/ags/experiments/baselines/self_consistency_gsm8k.py +++ b/examples/ags/experiments/baselines/self_consistency_gsm8k.py @@ -12,7 +12,7 @@ from collections 
import Counter import random GSM8K_PROMPT_GPT = """ -{question}\nPlease reason step by step, and to ensure accuracy, provide the correct answer in the final, without any additional text. +{question}\nPlease reason step by step. At the end, provide the final answer in the format "Answer is ", where is a single number, without any additional information or explanation. """ GSM8K_PROMPT_DS = """ @@ -36,15 +36,17 @@ class CoTGenerate(Operator): return response SC_ENSEMBLE_PROMPT = """ -Given the question descripted as follows: {question} -some solutions are generated to solve the question as follows: +Given the question described as follows: {question} +Several solutions have been generated to address the given question. They are as follows: {solutions} -Evaluate these solutions and select the most consistent solution based on majority consensus. -Give your answer with a single id of solution (without anything else). +Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution. + +In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field. """ class ScEnsembleOp(BaseModel): + thought: str = Field(default="", description="The thought of the most consistent solution.") solution_letter: str = Field(default="", description="The letter of most consistent solution.") @@ -73,7 +75,7 @@ class ScEnsemble(Operator): node = await ActionNode.from_pydantic(ScEnsembleOp).fill(**fill_kwargs) response = node.instruct_content.model_dump() - answer = response.get("solution_letter", "") + answer = response.get("solution_letter", "A") answer = answer.strip().upper() return {"solution": solutions[answer_mapping[answer]]} @@ -87,7 +89,7 @@ class SelfConsistencyGraph(SolveGraph): async def __call__(self, problem): solutions = [] - for i in range(2): + for i in range(5): solution = await self.cot_generate(problem, mode="context_fill") solutions.append(solution["solution"]) solution = await self.sc_ensemble(solutions, problem, mode="context_fill") @@ -100,9 +102,9 @@ if __name__ == "__main__": # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106") graph = SelfConsistencyGraph(name="SelfConsistency", llm_config=llm_config, dataset="Gsm8K") file_path = "examples/ags/data/gsm8k.jsonl" - samples = 1 + samples = 264 path = "examples/ags/data/baselines/general" - score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=False) + score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=True) return score, cost import asyncio diff --git a/examples/ags/experiments/baselines/self_consistency_humaneval.py b/examples/ags/experiments/baselines/self_consistency_humaneval.py new file mode 100644 index 000000000..c151715d0 --- /dev/null +++ b/examples/ags/experiments/baselines/self_consistency_humaneval.py @@ -0,0 +1,103 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.humaneval import humaneval_evaluation +from examples.ags.scripts.operator_an import GenerateOp +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import List + 
+HUMANEVAL_PROMPT_GPT = """ +{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases. +""" + +SC_ENSEMBLE_PROMPT = """ +Given the question described as follows: {question} +Several solutions have been generated to address the given question. They are as follows: +{solutions} + +Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution. + +In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field. +""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="Python Solution For This Question.") + +class CoTGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, function_name, mode: str = None): + prompt = HUMANEVAL_PROMPT_GPT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class ScEnsembleOp(BaseModel): + thought: str = Field(default="", description="The thought of the most consistent solution.") + solution_letter: str = Field(default="", description="The letter of most consistent solution.") + +class ScEnsemble(Operator): + """ + Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models + Link: https://arxiv.org/abs/2203.11171 + Paper: Universal Self-Consistency for Large Language Model Generation + Link: https://arxiv.org/abs/2311.17311 + """ + + def __init__(self, llm, name: str = "ScEnsemble"): + super().__init__(name, llm) + + async def __call__(self, solutions: List[str], problem: str, mode: str = None): + answer_mapping = {} + solution_text = "" + for index, solution in enumerate(solutions): + answer_mapping[chr(65 + index)] = index + solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n" + + prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(ScEnsembleOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + + answer = response.get("solution_letter", "A") + answer = answer.strip().upper() + + return {"solution": solutions[answer_mapping[answer]]} + + +class SelfConsistencyGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.cot_generate = CoTGenerate(llm=self.llm) + self.sc_ensemble = ScEnsemble(llm=self.llm) + + async def __call__(self, problem, function_name): + solutions = [] + for i in range(5): + solution = await self.cot_generate(problem, function_name, mode="code_fill") + solutions.append(solution["solution"]) + solution = await self.sc_ensemble(solutions, problem, mode="context_fill") + return solution["solution"], self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + # llm_config = ModelsConfig.default().get("gpt-4o-mini") + # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106") + 
llm_config = ModelsConfig.default().get("deepseek-chat") + graph = SelfConsistencyGraph(name="SelfConsistency", llm_config=llm_config, dataset="HumanEval") + file_path = "examples/ags/data/human-eval.jsonl" + samples = 33 # 33/131 + path = "examples/ags/data/baselines/general/humaneval" + score = await humaneval_evaluation(graph, file_path, samples, path,test=True) + return score + + import asyncio + asyncio.run(main()) diff --git a/examples/ags/experiments/baselines/self_consistency_math.py b/examples/ags/experiments/baselines/self_consistency_math.py new file mode 100644 index 000000000..774dbd8d4 --- /dev/null +++ b/examples/ags/experiments/baselines/self_consistency_math.py @@ -0,0 +1,108 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.math import math_evaluation +from examples.ags.scripts.operator_an import GenerateOp +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import Dict, Any, List, Tuple +from collections import Counter + +import random + +GENERATE_COT_PROMPT = """ +{question}\nPlease reason step by step. At the end, provide the final answer in the format "\\boxed{{}}", where is a math answer(an expression or number), without any additional information or explanation. +""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="solution for the problem") + +class CoTGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, mode: str = None): + prompt = GENERATE_COT_PROMPT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +SC_ENSEMBLE_PROMPT = """ +Given the question described as follows: {question} +Several solutions have been generated to address the given question. They are as follows: +{solutions} + +Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution. + +In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field. 
+""" + +class ScEnsembleOp(BaseModel): + thought: str = Field(default="", description="The thought of the most consistent solution.") + solution_letter: str = Field(default="", description="The letter of most consistent solution.") + + +class ScEnsemble(Operator): + """ + Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models + Link: https://arxiv.org/abs/2203.11171 + Paper: Universal Self-Consistency for Large Language Model Generation + Link: https://arxiv.org/abs/2311.17311 + """ + + def __init__(self, name: str = "ScEnsemble", llm: LLM = LLM()): + super().__init__(name, llm) + + async def __call__(self, solutions: List[str], problem: str, mode: str = None): + answer_mapping = {} + solution_text = "" + for index, solution in enumerate(solutions): + answer_mapping[chr(65 + index)] = index + solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n" + + prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(ScEnsembleOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + + answer = response.get("solution_letter", "A") + answer = answer.strip().upper() + + return {"solution": solutions[answer_mapping[answer]]} + + +class SelfConsistencyGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.cot_generate = CoTGenerate(llm=self.llm) + self.sc_ensemble = ScEnsemble(llm=self.llm) + + async def __call__(self, problem): + solutions = [] + for i in range(5): + solution = await self.cot_generate(problem, mode="context_fill") + solutions.append(solution["solution"]) + solution = await self.sc_ensemble(solutions, problem, mode="context_fill") + return solution, self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + # llm_config = ModelsConfig.default().get("deepseek-coder") + llm_config = ModelsConfig.default().get("gpt-4o-mini") + # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106") + graph = SelfConsistencyGraph(name="SelfConsistency", llm_config=llm_config, dataset="Gsm8K") + file_path = "examples/ags/data/math_test.jsonl" + # samples = None + samples = 0 + path = "examples/ags/data/baselines/general/math" + score = await math_evaluation(graph, file_path, samples, path,test=True) + return score + + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/examples/ags/experiments/baselines/self_consistency_mbpp.py b/examples/ags/experiments/baselines/self_consistency_mbpp.py new file mode 100644 index 000000000..2691433b7 --- /dev/null +++ b/examples/ags/experiments/baselines/self_consistency_mbpp.py @@ -0,0 +1,102 @@ +from examples.ags.benchmark.mbpp import mbpp_evaluation +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.scripts.operator_an import GenerateOp +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import List +MBPP_PROMPT_COT = """ +{question}\nPlease provide a step-by-step explanation in text, followed by your Python function, ensure the output code is self-contained, meaning it should have the correct function name and return statement, without any additional text.""" + + +SC_ENSEMBLE_PROMPT = """ +Given the question described as follows: 
{question} +Several solutions have been generated to address the given question. They are as follows: +{solutions} + +Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution. + +In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field. +""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="Python Solution For This Question.") + +class CoTGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, function_name, mode: str = None): + prompt = MBPP_PROMPT_COT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class ScEnsembleOp(BaseModel): + thought: str = Field(default="", description="The thought of the most consistent solution.") + solution_letter: str = Field(default="", description="The letter of most consistent solution.") + +class ScEnsemble(Operator): + """ + Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models + Link: https://arxiv.org/abs/2203.11171 + Paper: Universal Self-Consistency for Large Language Model Generation + Link: https://arxiv.org/abs/2311.17311 + """ + + def __init__(self, llm, name: str = "ScEnsemble"): + super().__init__(name, llm) + + async def __call__(self, solutions: List[str], problem: str, mode: str = None): + answer_mapping = {} + solution_text = "" + for index, solution in enumerate(solutions): + answer_mapping[chr(65 + index)] = index + solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n" + + prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(ScEnsembleOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + + answer = response.get("solution_letter", "A") + answer = answer.strip().upper() + + return {"solution": solutions[answer_mapping[answer]]} + +class SelfConsistencyGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.cot_generate = CoTGenerate(llm=self.llm) + self.sc_ensemble = ScEnsemble(llm=self.llm) + + async def __call__(self, problem, function_name): + solutions = [] + for i in range(5): + solution = await self.cot_generate(problem, function_name, mode="code_fill") + solutions.append(solution["solution"]) + solution = await self.sc_ensemble(solutions, problem, mode="context_fill") + return solution["solution"], self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + # llm_config = ModelsConfig.default().get("deepseek-chat") + # llm_config = ModelsConfig.default().get("gpt-35-turbo") + graph = SelfConsistencyGraph(name="SelfConsistency", llm_config=llm_config, dataset="MBPP") + file_path = "examples/ags/data/mbpp-new-new.jsonl" + samples = 86 # 86/341 + path = 
"examples/ags/data/baselines/general/mbpp" + score = await mbpp_evaluation(graph, file_path, samples, path, test=True) + return score + + import asyncio + asyncio.run(main()) + diff --git a/examples/ags/experiments/baselines/self_refine_gsm8k.py b/examples/ags/experiments/baselines/self_refine_gsm8k.py index 4ab23be94..880396903 100644 --- a/examples/ags/experiments/baselines/self_refine_gsm8k.py +++ b/examples/ags/experiments/baselines/self_refine_gsm8k.py @@ -8,36 +8,38 @@ from pydantic import BaseModel, Field from typing import Dict, Any GSM8K_PROMPT_GPT = """ -{question}\nPlease reason step by step, and put your final answer in the end. Wrap content using xml tags. -""" - -GSM8K_PROMPT_DS = """ -{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}. +{question}\nPlease reason step by step. At the end, provide the final answer in the format "Answer is ", where is a single number, without any additional information or explanation. """ REVIEW_PROMPT = """ -For the question described as {question}, -please review the following solution: {solution}, and criticize on where might be wrong. You should provide a review result in boolean format. -If you believe the solution is capable of resolving the issue, return True; otherwise, return False, and include your feedback. +Given a problem and a thoughtful solution, your task is to using critical thinking (questioning) to review the solution's correctness and provide a review result in boolean format. + +problem: {problem} +solution: {solution} + +If you are more than 95 percent confident that the final answer is incorrect, please return False and give a feedback for the error. Otherwise, please return True and give a explanation for the correctness. """ REVISE_PROMPT = """ -For the question described as {question}, \nand an error solution: {solution}, \nwith the feedback: {feedback}, -Given the previous solution and feedback, carefully refine the solution to solve the question and ensure it aligns with the original format. +Given a problem and a thoughtful solution which is just reviewed as incorrect, your task is to revise the solution to solve the question and ensure the final answer in the format "Answer is ", where is a single number. + +problem: {problem} +solution: {solution} +feedback: {feedback} """ class GenerateOp(BaseModel): solution: str = Field(default="", description="solution for the problem") class ReviewOp(BaseModel): - review_result: bool = Field( - default=False, - description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'", - ) feedback: str = Field( default="", description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.", ) + review_result: bool = Field( + default=False, + description="The Review Result (Bool). 
If you think this solution looks good for you, return 'true'; If not, return 'false'", + ) class ReviseOp(BaseModel): @@ -48,7 +50,7 @@ class CoTGenerate(Operator): def __init__(self, llm: LLM, name: str = "Generate"): super().__init__(name, llm) - async def __call__(self, problem, mode: str = None): + async def __call__(self, problem, mode: str = "context_fill"): prompt = GSM8K_PROMPT_GPT.format(question=problem) fill_kwargs = {"context": prompt, "llm": self.llm} if mode: @@ -61,8 +63,8 @@ class Review(Operator): def __init__(self, llm: LLM, name: str = "Review"): super().__init__(name, llm) - async def __call__(self, problem, solution, mode: str = None): - prompt = REVIEW_PROMPT.format(question=problem, solution=solution) + async def __call__(self, problem, solution, mode: str = "context_fill"): + prompt = REVIEW_PROMPT.format(problem=problem, solution=solution) fill_kwargs = {"context": prompt, "llm": self.llm} if mode: fill_kwargs["mode"] = mode @@ -71,11 +73,11 @@ class Review(Operator): return response class Revise(Operator): - def __init__(self, name: str = "Revise", llm: LLM = LLM()): + def __init__(self, llm: LLM, name: str = "Revise"): super().__init__(name, llm) - async def __call__(self, problem, solution, feedback, mode: str = None): - prompt = REVISE_PROMPT.format(question=problem, solution=solution, feedback=feedback) + async def __call__(self, problem, solution, feedback, mode: str = "context_fill"): + prompt = REVISE_PROMPT.format(problem=problem, solution=solution, feedback=feedback) fill_kwargs = {"context": prompt, "llm": self.llm} if mode: fill_kwargs["mode"] = mode @@ -92,7 +94,7 @@ class SelfRefineGraph(SolveGraph): async def __call__(self, problem): solution = await self.cot_generate(problem, mode="context_fill") - for i in range(5): + for i in range(3): review = await self.review(problem, solution) if review["review_result"]: break @@ -101,14 +103,12 @@ class SelfRefineGraph(SolveGraph): if __name__ == "__main__": async def main(): - llm_config = ModelsConfig.default().get("deepseek-coder") - # llm_config = ModelsConfig.default().get("gpt-4o-mini") - # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106") + llm_config = ModelsConfig.default().get("gpt-4o-mini") graph = SelfRefineGraph(name="self-refine", llm_config=llm_config, dataset="Gsm8K") file_path = "examples/ags/data/gsm8k.jsonl" - samples = 10 + samples = 264 path = "examples/ags/data/baselines/general" - score, cost = await gsm8k_evaluation(graph, file_path, samples, path) + score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=True) return score, cost import asyncio diff --git a/examples/ags/experiments/baselines/self_refine_humaneval.py b/examples/ags/experiments/baselines/self_refine_humaneval.py new file mode 100644 index 000000000..44fdd08af --- /dev/null +++ b/examples/ags/experiments/baselines/self_refine_humaneval.py @@ -0,0 +1,119 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.humaneval import humaneval_evaluation +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import Dict, Any + +HUMANEVAL_PROMPT_GPT = """ +{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases. 
+""" + + +REVIEW_PROMPT = """ +Given a problem and a thoughtful solution, your task is to using critical thinking (questioning) to review the solution's correctness and provide a review result in boolean format. + +problem: {problem} +solution: {solution} + +If you are more than 95 percent confident that the final answer is incorrect, please return False and give a feedback for the error. Otherwise, please return True and give a explanation for the correctness. +""" + +REVISE_PROMPT = """ +Given a problem and a thoughtful solution which is just reviewed as incorrect, your task is to revise the solution to solve the question and ensure the final code solution is wrapped with ```python```. + +problem: {problem} +solution: {solution} +feedback: {feedback} + +Ensure the output code is self-contained, and without any additional text or test cases. +""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="solution for the problem") + +class ReviewOp(BaseModel): + review_result: bool = Field( + default=False, + description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'", + ) + feedback: str = Field( + default="", + description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.", + ) + + +class ReviseOp(BaseModel): + solution: str = Field(default="", description="Based on the feedback, revised solution for this problem") + + +class CoTGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, function_name, mode: str = None): + prompt = HUMANEVAL_PROMPT_GPT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class Review(Operator): + def __init__(self, llm: LLM, name: str = "Review"): + super().__init__(name, llm) + + async def __call__(self, problem, solution, mode: str = None): + prompt = REVIEW_PROMPT.format(problem=problem, solution=solution) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(ReviewOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class Revise(Operator): + def __init__(self, llm: LLM, name: str = "Revise"): + super().__init__(name, llm) + + async def __call__(self, problem, solution, feedback, mode: str = None): + prompt = REVISE_PROMPT.format(problem=problem, solution=solution, feedback=feedback) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(ReviseOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class SelfRefineGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + llm_config.temperature = 0.0 + super().__init__(name, llm_config, dataset) + self.cot_generate = CoTGenerate(self.llm) + self.review = Review(self.llm) + self.revise = Revise(self.llm) + + async def __call__(self, problem, function_name): + solution = await self.cot_generate(problem, function_name, mode="code_fill") + for i in range(3): + review = await self.review(problem, solution, mode="context_fill") + if review["review_result"]: + break + solution = await 
self.revise(problem, solution, review["feedback"], mode="code_fill") + return solution["solution"], self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + graph = SelfRefineGraph(name="self-refine", llm_config=llm_config, dataset="HumanEval") + file_path = "examples/ags/data/human-eval.jsonl" + samples = 33 + path = "examples/ags/data/baselines/general/humaneval" + score, cost = await humaneval_evaluation(graph, file_path, samples, path, test=True) + return score, cost + + import asyncio + asyncio.run(main()) diff --git a/examples/ags/experiments/baselines/self_refine_math.py b/examples/ags/experiments/baselines/self_refine_math.py new file mode 100644 index 000000000..bf700ace6 --- /dev/null +++ b/examples/ags/experiments/baselines/self_refine_math.py @@ -0,0 +1,119 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.math import math_evaluation +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import Dict, Any + +GENERATE_COT_PROMPT = """ +{question}\nPlease reason step by step. At the end, provide the final answer in the format "\\boxed{{<answer>}}", where <answer> is a math answer (an expression or number), without any additional information or explanation. +""" + +REVIEW_PROMPT = """ +Given a problem and a thoughtful solution, your task is to use critical thinking (questioning) to review the solution's correctness and provide a review result in boolean format. + +problem: {problem} +solution: {solution} + +If you are more than 95 percent confident that the final answer is incorrect, please return False and give feedback for the error. Otherwise, please return True and give an explanation for the correctness. +""" + +REVISE_PROMPT = """ +Given a problem and a thoughtful solution which was just reviewed as incorrect, your task is to revise the solution to solve the question and ensure the final answer is in the format "\\boxed{{<answer>}}", where <answer> is a math answer (an expression or number), without any additional information or explanation. + +problem: {problem} +solution: {solution} +feedback: {feedback} +""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="solution for the problem") + +class ReviewOp(BaseModel): + feedback: str = Field( + default="", + description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.", + ) + review_result: bool = Field( + default=False, + description="The Review Result (Bool). 
If you think this solution looks good for you, return 'true'; If not, return 'false'", + ) + + +class ReviseOp(BaseModel): + solution: str = Field(default="", description="Based on the feedback, revised solution for this problem") + + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="solution for the problem") + +class CoTGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, mode: str = None): + prompt = GENERATE_COT_PROMPT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class Review(Operator): + def __init__(self, llm: LLM, name: str = "Review"): + super().__init__(name, llm) + + async def __call__(self, problem, solution, mode: str = "context_fill"): + prompt = REVIEW_PROMPT.format(problem=problem, solution=solution) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(ReviewOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class Revise(Operator): + def __init__(self, llm: LLM, name: str = "Revise"): + super().__init__(name, llm) + + async def __call__(self, problem, solution, feedback, mode: str = "context_fill"): + prompt = REVISE_PROMPT.format(problem=problem, solution=solution, feedback=feedback) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(ReviseOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class SelfRefineGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.cot_generate = CoTGenerate(self.llm) + self.review = Review(self.llm) + self.revise = Revise(self.llm) + + async def __call__(self, problem): + solution = await self.cot_generate(problem, mode="context_fill") + for i in range(3): + review = await self.review(problem, solution) + if review["review_result"]: + break + solution = await self.revise(problem, solution, review["feedback"]) + return solution, self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + graph = SelfRefineGraph(name="self-refine", llm_config=llm_config, dataset="Gsm8K") + file_path = "examples/ags/data/math_test.jsonl" + # samples = None + samples = 10 + path = "examples/ags/data/baselines/general/math" + score = await math_evaluation(graph, file_path, samples, path,test=False) + return score + + import asyncio + asyncio.run(main()) diff --git a/examples/ags/experiments/baselines/self_refine_mbpp.py b/examples/ags/experiments/baselines/self_refine_mbpp.py new file mode 100644 index 000000000..70cb3125c --- /dev/null +++ b/examples/ags/experiments/baselines/self_refine_mbpp.py @@ -0,0 +1,117 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.mbpp import mbpp_evaluation +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import Dict, Any + +MBPP_PROMPT_COT = """ +{question}\nPlease provide a step-by-step 
explanation in text, followed by your Python function; ensure the output code is self-contained, meaning it should have the correct function name and return statement, without any additional text.""" + + +REVIEW_PROMPT = """ +Given a problem and a thoughtful solution, your task is to use critical thinking (questioning) to review the solution's correctness and provide a review result in boolean format. + +problem: {problem} +solution: {solution} + +If you are more than 95 percent confident that the final answer is incorrect, please return False and give feedback for the error. Otherwise, please return True and give an explanation for the correctness. +""" + +REVISE_PROMPT = """ +Given a problem and a thoughtful solution which was just reviewed as incorrect, your task is to revise the solution to solve the question and ensure the final code solution is wrapped with ```python```. + +problem: {problem} +solution: {solution} +feedback: {feedback} + +Ensure the output code is self-contained, meaning it should have the correct function name and return statement, without any additional text. +""" + +class GenerateOp(BaseModel): + solution: str = Field(default="", description="solution for the problem") + +class ReviewOp(BaseModel): + feedback: str = Field( + default="", + description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.", + ) + review_result: bool = Field( + default=False, + description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'", + ) + +class ReviseOp(BaseModel): + solution: str = Field(default="", description="Based on the feedback, revised solution for this problem") + +class CoTGenerate(Operator): + def __init__(self, llm: LLM, name: str = "Generate"): + super().__init__(name, llm) + + async def __call__(self, problem, function_name, mode: str = None): + prompt = MBPP_PROMPT_COT.format(question=problem) + fill_kwargs = {"context": prompt, "llm": self.llm, "function_name": function_name} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + + +class Review(Operator): + def __init__(self, llm: LLM, name: str = "Review"): + super().__init__(name, llm) + + async def __call__(self, problem, solution, mode: str = None): + prompt = REVIEW_PROMPT.format(problem=problem, solution=solution) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(ReviewOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class Revise(Operator): + def __init__(self, llm: LLM, name: str = "Revise"): + super().__init__(name, llm) + + async def __call__(self, problem, solution, feedback, mode: str = None): + prompt = REVISE_PROMPT.format(problem=problem, solution=solution, feedback=feedback) + fill_kwargs = {"context": prompt, "llm": self.llm} + if mode: + fill_kwargs["mode"] = mode + node = await ActionNode.from_pydantic(ReviseOp).fill(**fill_kwargs) + response = node.instruct_content.model_dump() + return response + +class SelfRefineGraph(SolveGraph): + def __init__(self, name: str, llm_config, dataset: str): + super().__init__(name, llm_config, dataset) + self.cot_generate = CoTGenerate(self.llm) + self.review = Review(self.llm) + self.revise = Revise(self.llm) + + async def __call__(self, problem, function_name): + solution = await 
self.cot_generate(problem, function_name, mode="code_fill") + for i in range(3): + review = await self.review(problem, solution, mode="context_fill") + if review["review_result"]: + break + solution = await self.revise(problem, solution, review["feedback"], mode="code_fill") + return solution["solution"], self.llm.cost_manager.total_cost + +if __name__ == "__main__": + async def main(): + llm_config = ModelsConfig.default().get("gpt-4o-mini") + graph = SelfRefineGraph(name="self-refine", llm_config=llm_config, dataset="MBPP") + file_path = "examples/ags/data/mbpp-new-new.jsonl" + samples = 86# 86/341 + path = "examples/ags/data/baselines/general/mbpp" + score, cost = await mbpp_evaluation(graph, file_path, samples, path, test=True) + print(f"per cost:{cost/341}") + return score + + import asyncio + asyncio.run(main()) diff --git a/examples/ags/scripts/evaluator.py b/examples/ags/scripts/evaluator.py index 4d17afb6b..19894e1be 100644 --- a/examples/ags/scripts/evaluator.py +++ b/examples/ags/scripts/evaluator.py @@ -20,13 +20,13 @@ from sympy.parsing.latex import parse_latex from sympy.parsing.sympy_parser import parse_expr from tqdm.asyncio import tqdm_asyncio -from examples.ags.benchmark.gsm8k import gsm8k_evaluation +from examples.ags.benchmark.gsm8k import optimize_gsm8k_evaluation from examples.ags.benchmark.utils import generate_random_indices -from examples.ags.benchmark.math import math_evaluation -from examples.ags.benchmark.humaneval import humaneval_evaluation -from examples.ags.benchmark.mbpp import mbpp_evaluation -from examples.ags.benchmark.drop import drop_evaluation -from examples.ags.benchmark.hotpotqa import hotpotqa_evaluation +from examples.ags.benchmark.math import optimize_math_evaluation +from examples.ags.benchmark.humaneval import optimize_humaneval_evaluation +from examples.ags.benchmark.mbpp import optimize_mbpp_evaluation +from examples.ags.benchmark.drop import optimize_drop_evaluation +from examples.ags.benchmark.hotpotqa import optimize_hotpotqa_evaluation DatasetType = Literal["HumanEval", "MBPP", "Gsm8K", "MATH", "HotpotQA", "DROP"] @@ -41,119 +41,158 @@ class Evaluator: def validation_evaluate(self, dataset: DatasetType, graph, params: dict, path): """ - Evaluates on validation dataset. + dataset: dataset type + graph: graph class + params: params for graph + path: path to save results """ if dataset == "Gsm8K": - return self._gsm8k_eval(graph, params, path) + return self._gsm8k_eval(graph, params, path, test=False) elif dataset == "MATH": - return self._math_eval(graph, params, path) + return self._math_eval(graph, params, path, test=False) elif dataset == "HumanEval": - return self._humaneval_eval(graph, params, path) + return self._humaneval_eval(graph, params, path, test=False) elif dataset == "HotpotQA": - return self._hotpotqa_eval(graph, params, path) + return self._hotpotqa_eval(graph, params, path, test=False) elif dataset == "MBPP": - return self._mbpp_eval(graph, params, path) + return self._mbpp_eval(graph, params, path, test=False) elif dataset == "DROP": - return self._drop_eval(graph, params, path) + return self._drop_eval(graph, params, path, test=False) - def test_evaluate(self, dataset: DatasetType): + def test_evaluate(self, dataset: DatasetType, graph, params: dict, path): """ Evaluates on test dataset. 
""" + if dataset == "Gsm8K": + return self._gsm8k_eval(graph, params, path, test=True) + elif dataset == "MATH": + return self._math_eval(graph, params, path, test=True) + elif dataset == "HumanEval": + return self._humaneval_eval(graph, params, path, test=True) + elif dataset == "HotpotQA": + return self._hotpotqa_eval(graph, params, path, test=True) + elif dataset == "MBPP": + return self._mbpp_eval(graph, params, path, test=True) + elif dataset == "DROP": + return self._drop_eval(graph, params, path, test=True) pass - async def _gsm8k_eval(self, graph_class, params, path, samples: int = 50): + async def _gsm8k_eval(self, graph_class, params, path, test=False): """ - Evaluate on GSM8K dataset. + 评估GSM8K数据集。 """ async def load_graph(): dataset = params["dataset"] llm_config = params["llm_config"] return graph_class(name="Gsm8K", llm_config=llm_config, dataset=dataset) + + if test: + data_path = "examples/ags/data/gsm8k_test.jsonl" + else: + data_path = "examples/ags/data/gsm8k_validate.jsonl" graph = await load_graph() - file_path = "examples/ags/data/gsm8k.jsonl" - score = await gsm8k_evaluation(graph, file_path, samples, path) + score, cost = await optimize_gsm8k_evaluation(graph, data_path, path) - return score + return score, cost - async def _math_eval(self, graph_class, params, path, samples: int = 200): + async def _math_eval(self, graph_class, params, path, test=False): """ - Evaluate on MATH dataset. + 评估MATH数据集。 """ async def load_graph(): dataset = params["dataset"] llm_config = params["llm_config"] return graph_class(name="MATH", llm_config=llm_config, dataset=dataset) + + if test: + data_path = "examples/ags/data/math_test.jsonl" + else: + data_path = "examples/ags/data/math_validate.jsonl" graph = await load_graph() - file_path = "examples/ags/w_action_node/data/math.jsonl" # 替换为实际的 MATH.jsonl 路径 - score = await math_evaluation(graph, file_path, samples, path) + score, cost = await optimize_math_evaluation(graph, data_path, path) - return score + return score, cost - async def _humaneval_eval(self, graph_class, params, path, samples: int = 1): + async def _humaneval_eval(self, graph_class, params, path, test=False): """ - Evaluate on HumanEval dataset. + 评估HumanEval数据集。 """ async def load_graph(): dataset = params["dataset"] llm_config = params["llm_config"] return graph_class(name="HumanEval", llm_config=llm_config, dataset=dataset) + + if test: + data_path = "examples/ags/data/human-eval_test.jsonl" + else: + data_path = "examples/ags/data/human-eval_validate.jsonl" graph = await load_graph() - file_path = "examples/ags/scripts/data/human-eval-new.jsonl" - score = await humaneval_evaluation(graph, file_path, samples, path) + score, cost = await optimize_humaneval_evaluation(graph, data_path, path) - return score + return score, cost - async def _hotpotqa_eval(self, graph_class, params, path, samples: int = 20): + async def _hotpotqa_eval(self, graph_class, params, path, test=False): """ - Evaluate on HotpotQA dataset. 
+ Evaluate on the HotpotQA dataset. """ async def load_graph(): dataset = params["dataset"] llm_config = params["llm_config"] return graph_class(name="HotpotQA", llm_config=llm_config, dataset=dataset) + + if test: + data_path = "examples/ags/data/hotpotqa_test.jsonl" + else: + data_path = "examples/ags/data/hotpotqa_validate.jsonl" graph = await load_graph() - file_path = "examples/ags/scripts/data/hotpotqa.jsonl" - score = await hotpotqa_evaluation(graph, file_path, samples, path) + score, cost = await optimize_hotpotqa_evaluation(graph, data_path, path) - return score + return score, cost - async def _mbpp_eval(self, graph_class, params, path, samples: int = 1): + async def _mbpp_eval(self, graph_class, params, path, test=False): """ - Evaluate on MBPP dataset. + Evaluate on the MBPP dataset. """ async def load_graph(): dataset = params["dataset"] llm_config = params["llm_config"] return graph_class(name="MBPP", llm_config=llm_config, dataset=dataset) + + if test: + data_path = "examples/ags/data/mbpp_test.jsonl" + else: + data_path = "examples/ags/data/mbpp_validate.jsonl" graph = await load_graph() - file_path = "examples/ags/scripts/data/mbpp-new.jsonl" - score = await mbpp_evaluation(graph, file_path, samples, path) + score, cost = await optimize_mbpp_evaluation(graph, data_path, path) - return score + return score, cost - async def _drop_eval(self, graph_class, params, path): + async def _drop_eval(self, graph_class, params, path, test=False): """ - Evaluate on DROP dataset. + Evaluate on the DROP dataset. """ async def load_graph(): dataset = params["dataset"] llm_config = params["llm_config"] return graph_class(name="DROP", llm_config=llm_config, dataset=dataset) + + if test: + data_path = "examples/ags/data/drop_test.json" + else: + data_path = "examples/ags/data/drop_validate.json" graph = await load_graph() - file_path = "examples/ags/scripts/data/drop_dataset_dev.json" - score = await drop_evaluation(graph, file_path, path) + score, cost = await optimize_drop_evaluation(graph, data_path, path) - return score + return score, cost diff --git a/examples/ags/scripts/operator.py b/examples/ags/scripts/operator.py index b4d3f40e2..a5fd0168c 100644 --- a/examples/ags/scripts/operator.py +++ b/examples/ags/scripts/operator.py @@ -347,6 +347,7 @@ class ScEnsemble(Operator): return {"solution": solutions[answer_mapping[answer]]} # {"final_solution": "xxx"} + class Rephrase(Operator): """ Paper: Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering @@ -403,15 +404,7 @@ class Test(Operator): return "no error" async def __call__( - self, - problem_id, - problem, - rephrase_problem, - solution, - test_cases, - entry_point, - test_loop: int = 3, - mode: str = None, + self, problem_id, problem, rephrase_problem, solution, test_cases, entry_point, test_loop: int = 3 ): solution = solution["final_solution"] for _ in range(test_loop): @@ -427,10 +420,7 @@ class Test(Operator): exec_pass=f"executed unsuccessfully, error: \n {result}", test_fail="executed unsucessfully", ) - fill_kwargs = {"context": prompt, "llm": self.llm} - if mode: - fill_kwargs["mode"] = mode - node = await ActionNode.from_pydantic(ReflectionTestOp).fill(**fill_kwargs) + node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) response = node.instruct_content.model_dump() solution = response["refined_solution"] else: @@ -441,10 +431,7 @@ class Test(Operator): exec_pass="executed successfully", test_fail=result, ) - fill_kwargs = {"context": prompt, "llm": self.llm} - if mode: - fill_kwargs["mode"] = mode - node = await 
ActionNode.from_pydantic(ReflectionTestOp).fill(**fill_kwargs) + node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) response = node.instruct_content.model_dump() solution = response["refined_solution"] @@ -455,22 +442,24 @@ class PythonInterpreterOp(Operator): def __init__(self, name: str = "PythonInterpreterOp", llm: LLM = LLM()): super().__init__(name, llm) - async def run_code(self, code, timeout=600): - with open("solve_code.py", "w", encoding="utf-8") as f: # TODO: this kind of dependency - f.write(code) + async def exec_code(self, code, timeout=600): try: - process = Popen([sys.executable, "solve_code.py"], stdout=PIPE, stderr=PIPE) - stdout, stderr = process.communicate(timeout=timeout) - if process.returncode != 0: - return "Error", stderr.decode("utf-8", errors="ignore") + # Create a new global namespace + global_namespace = {} + + # Execute the code with exec + exec(code, global_namespace) + + # Assume the code defines a function named 'solve' + if 'solve' in global_namespace: + result = global_namespace['solve']() + return "Success", str(result) else: - return "Success", stdout.decode("utf-8", errors="ignore") - except TimeoutExpired: - process.terminate() - stdout, stderr = process.communicate() - return "Timeout", "Code execution timed out. Please try optimizing the code, algorithm, or other techniques to reduce execution time." + return "Error", "Function 'solve' not found" except Exception as e: - return "Error", str(e) + exc_type, exc_value, exc_traceback = sys.exc_info() + tb_str = traceback.format_exception(exc_type, exc_value, exc_traceback) + return "Error", f"Execution error: {str(e)}\n{''.join(tb_str)}" def extract_code_block(self, code_block): match = re.search(r"```python(.*?)```", code_block, re.DOTALL) @@ -491,9 +480,9 @@ class PythonInterpreterOp(Operator): response = node.instruct_content.model_dump() code = self.extract_code_block(response["code"]) - status, output = await self.run_code(code) + status, output = await self.exec_code(code) if status == "Success": return {"code": code, "output": output} - return {"code": code, "output": "code execution error, no result!"} + return {"code": code, "output": "Code execution error, no result!"} diff --git a/samples_eval_results.json b/samples_eval_results.json new file mode 100644 index 000000000..cbca304c0 --- /dev/null +++ b/samples_eval_results.json @@ -0,0 +1 @@ +{"date": "2024-07-15 15:06", "hash": "fe585eb4df8c88d844eeb463ea4d0302", "eval": {"HumanEval/0": [{"task_id": "HumanEval/0", "solution": "from typing import List\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n for i in range(len(numbers)):\n for j in range(i + 1, len(numbers)):\n if abs(numbers[i] - numbers[j]) < threshold:\n return True\n return False", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/1": [{"task_id": "HumanEval/1", "solution": "from typing import List\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n paren_string = paren_string.replace(' ', '')\n result = []\n stack = []\n current_group = []\n\n for char in paren_string:\n if char == '(':\n if stack:\n current_group.append(char)\n stack.append(char)\n elif char == ')':\n stack.pop()\n current_group.append(char)\n if not stack:\n result.append(''.join(current_group))\n current_group = []\n\n return result", "base_status": "fail", "plus_status": null, "base_fail_tests": [["(()()) ((())) () ((())()())"]], "plus_fail_tests": []}], "HumanEval/3": [{"task_id": "HumanEval/3", "solution": "from typing import List\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account falls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n balance = 0\n for operation in operations:\n balance += operation\n if balance < 0:\n return True\n return False", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/2": [{"task_id": "HumanEval/2", "solution": "def truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n return number - int(number)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/5": [{"task_id": "HumanEval/5", "solution": "from typing import List\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n \"\"\" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]\n \"\"\"\n if not numbers:\n return []\n \n result = []\n for i in range(len(numbers) - 1):\n result.append(numbers[i])\n result.append(delimeter)\n result.append(numbers[-1])\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/4": [{"task_id": "HumanEval/4", "solution": "from typing import List\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n if not numbers:\n return 0.0\n mean_value = sum(numbers) / len(numbers)\n absolute_deviations = [abs(x - mean_value) for x in numbers]\n return sum(absolute_deviations) / len(absolute_deviations)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/7": [{"task_id": "HumanEval/7", "solution": "from typing import List\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n \"\"\" 
Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']\n \"\"\"\n return [s for s in strings if substring in s]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/6": [{"task_id": "HumanEval/6", "solution": "from typing import List\ndef parse_nested_parens(paren_string: str) -> List[int]:\n \"\"\" Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. (()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]\n \"\"\"\n groups = paren_string.split()\n max_depths = []\n for group in groups:\n max_depth = 0\n current_depth = 0\n for char in group:\n if char == '(':\n current_depth += 1\n if current_depth > max_depth:\n max_depth = current_depth\n elif char == ')':\n current_depth -= 1\n max_depths.append(max_depth)\n return max_depths", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/8": [{"task_id": "HumanEval/8", "solution": "from typing import List, Tuple\ndef sum_product(numbers: List[int]) -> Tuple[int, int]:\n \"\"\" For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n Empty sum should be equal to 0 and empty product should be equal to 1.\n >>> sum_product([])\n (0, 1)\n >>> sum_product([1, 2, 3, 4])\n (10, 24)\n \"\"\"\n if not numbers:\n return (0, 1)\n total_sum = sum(numbers)\n total_product = 1\n for num in numbers:\n total_product *= num\n return (total_sum, total_product)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/9": [{"task_id": "HumanEval/9", "solution": "from typing import List\ndef rolling_max(numbers: List[int]) -> List[int]:\n \"\"\" From a given list of integers, generate a list of rolling maximum element found until given moment\n in the sequence.\n >>> rolling_max([1, 2, 3, 2, 3, 4, 2])\n [1, 2, 3, 3, 3, 4, 4]\n \"\"\"\n max_so_far = float('-inf')\n result = []\n for num in numbers:\n if num > max_so_far:\n max_so_far = num\n result.append(max_so_far)\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/11": [{"task_id": "HumanEval/11", "solution": "from typing import List\ndef string_xor(a: str, b: str) -> str:\n \"\"\" Input are two strings a and b consisting only of 1s and 0s.\n Perform binary XOR on these inputs and return result also as a string.\n >>> string_xor('010', '110')\n '100'\n \"\"\"\n result = []\n for char_a, char_b in zip(a, b):\n xor_result = '1' if char_a != char_b else '0'\n result.append(xor_result)\n return ''.join(result)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/10": [{"task_id": "HumanEval/10", "solution": "def is_palindrome(string: str) -> bool:\n \"\"\" Test if given string is a palindrome \"\"\"\n return string == string[::-1]\ndef make_palindrome(string: str) -> str:\n \"\"\" Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the 
palindromic suffix.\n \"\"\"\n for i in range(len(string)):\n if is_palindrome(string[i:]) and i != 0:\n return string + string[:i][::-1]\n return string + string[::-1]", "base_status": "fail", "plus_status": null, "base_fail_tests": [["x"]], "plus_fail_tests": []}], "HumanEval/13": [{"task_id": "HumanEval/13", "solution": "def greatest_common_divisor(a: int, b: int) -> int:\n \"\"\" Return a greatest common divisor of two integers a and b\n >>> greatest_common_divisor(3, 5)\n 1\n >>> greatest_common_divisor(25, 15)\n 5\n \"\"\"\n while b:\n a, b = b, a % b\n return a", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/12": [{"task_id": "HumanEval/12", "solution": "from typing import List, Optional\ndef longest(strings: List[str]) -> Optional[str]:\n \"\"\" Out of list of strings, return the longest one. Return the first one in case of multiple\n strings of the same length. Return None in case the input list is empty.\n >>> longest([])\n None\n >>> longest(['a', 'b', 'c'])\n 'a'\n >>> longest(['a', 'bb', 'ccc'])\n 'ccc'\n \"\"\"\n if not strings:\n return None\n \n longest_string = strings[0]\n for string in strings:\n if len(string) > len(longest_string):\n longest_string = string\n \n return longest_string", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/14": [{"task_id": "HumanEval/14", "solution": "from typing import List\ndef all_prefixes(string: str) -> List[str]:\n \"\"\" Return list of all prefixes from shortest to longest of the input string\n >>> all_prefixes('abc')\n ['a', 'ab', 'abc']\n \"\"\"\n return [string[:i] for i in range(1, len(string) + 1)]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/16": [{"task_id": "HumanEval/16", "solution": "def count_distinct_characters(string: str) -> int:\n \"\"\" Given a string, find out how many distinct characters (regardless of case)\n >>> count_distinct_characters('xyzXYZ')\n 3\n >>> count_distinct_characters('Jerry')\n 4\n \"\"\"\n return len(set(string.lower()))", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/15": [{"task_id": "HumanEval/15", "solution": "def string_sequence(n: int) -> str:\n \"\"\" Return a string containing space-delimited numbers starting from 0 upto n inclusive.\n >>> string_sequence(0)\n '0'\n >>> string_sequence(5)\n '0 1 2 3 4 5'\n \"\"\"\n return ' '.join(str(i) for i in range(n + 1))", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/18": [{"task_id": "HumanEval/18", "solution": "def how_many_times(string: str, substring: str) -> int:\n count = start = 0\n while True:\n start = string.find(substring, start) + 1\n if start > 0:\n count += 1\n else:\n return count", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/17": [{"task_id": "HumanEval/17", "solution": "from typing import List\ndef parse_music(music_string: str) -> List[int]:\n \"\"\" Input to this function is a string representing musical notes in a special ASCII format.\n Your task is to parse this string and return list of integers corresponding to how many beats does each\n not last.\n\n Here is a legend:\n 'o' - whole note, lasts four beats\n 'o|' - half note, lasts two beats\n '.|' - quater note, lasts one beat\n\n >>> parse_music('o o| .| o| o| .| .| .| .| o o')\n [4, 2, 1, 2, 2, 1, 1, 1, 1, 4, 4]\n \"\"\"\n notes = 
music_string.split()\n beats = []\n for note in notes:\n if note == 'o':\n beats.append(4)\n elif note == 'o|':\n beats.append(2)\n elif note == '.|':\n beats.append(1)\n return beats", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/19": [{"task_id": "HumanEval/19", "solution": "from typing import List\ndef sort_numbers(numbers: str) -> str:\n \"\"\" Input is a space-delimited string of numberals from 'zero' to 'nine'.\n Valid choices are 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight' and 'nine'.\n Return the string with numbers sorted from smallest to largest\n >>> sort_numbers('three one five')\n 'one three five'\n \"\"\"\n num_dict = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9}\n num_list = numbers.split()\n sorted_list = sorted(num_list, key=lambda x: num_dict[x])\n return ' '.join(sorted_list)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/20": [{"task_id": "HumanEval/20", "solution": "from typing import List, Tuple\ndef find_closest_elements(numbers: List[float]) -> Tuple[float, float]:\n \"\"\" From a supplied list of numbers (of length at least two) select and return two that are the closest to each\n other and return them in order (smaller number, larger number).\n >>> find_closest_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.2])\n (2.0, 2.2)\n >>> find_closest_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.0])\n (2.0, 2.0)\n \"\"\"\n numbers.sort() # Sort the list to ensure the smallest difference is between consecutive elements\n min_diff = float('inf')\n closest_pair = (0.0, 0.0)\n for i in range(len(numbers) - 1):\n diff = numbers[i + 1] - numbers[i]\n if diff < min_diff:\n min_diff = diff\n closest_pair = (numbers[i], numbers[i + 1])\n return closest_pair", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/21": [{"task_id": "HumanEval/21", "solution": "from typing import List\ndef rescale_to_unit(numbers: List[float]) -> List[float]:\n \"\"\" Given list of numbers (of at least two elements), apply a linear transform to that list,\n such that the smallest number will become 0 and the largest will become 1\n >>> rescale_to_unit([1.0, 2.0, 3.0, 4.0, 5.0])\n [0.0, 0.25, 0.5, 0.75, 1.0]\n \"\"\"\n if len(numbers) < 2:\n raise ValueError(\"List must contain at least two elements\")\n \n min_num = min(numbers)\n max_num = max(numbers)\n \n if min_num == max_num:\n raise ValueError(\"All numbers in the list must be distinct\")\n \n rescaled_numbers = [(x - min_num) / (max_num - min_num) for x in numbers]\n return rescaled_numbers", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/22": [{"task_id": "HumanEval/22", "solution": "from typing import List, Any\ndef filter_integers(values: List[Any]) -> List[int]:\n \"\"\" Filter given list of any python values only for integers\n >>> filter_integers(['a', 3.14, 5])\n [5]\n >>> filter_integers([1, 2, 3, 'abc', {}, []])\n [1, 2, 3]\n \"\"\"\n return [value for value in values if isinstance(value, int)]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/24": [{"task_id": "HumanEval/24", "solution": "def largest_divisor(n: int) -> int:\n \"\"\" For a given number n, find the largest number that divides n evenly, smaller than n\n >>> largest_divisor(15)\n 5\n \"\"\"\n for i in range(n-1, 0, -1):\n if n % 
i == 0:\n return i", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/23": [{"task_id": "HumanEval/23", "solution": "def strlen(string: str) -> int:\n \"\"\" Return length of given string\n >>> strlen('')\n 0\n >>> strlen('abc')\n 3\n \"\"\"\n return len(string)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/25": [{"task_id": "HumanEval/25", "solution": "from typing import List\ndef factorize(n: int) -> List[int]:\n \"\"\" Return list of prime factors of given integer in the order from smallest to largest.\n Each of the factors should be listed number of times corresponding to how many times it appeares in factorization.\n Input number should be equal to the product of all factors\n >>> factorize(8)\n [2, 2, 2]\n >>> factorize(25)\n [5, 5]\n >>> factorize(70)\n [2, 5, 7]\n \"\"\"\n factors = []\n divisor = 2\n while n > 1:\n while n % divisor == 0:\n factors.append(divisor)\n n //= divisor\n divisor += 1\n return factors", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/26": [{"task_id": "HumanEval/26", "solution": "from typing import List\ndef remove_duplicates(numbers: List[int]) -> List[int]:\n \"\"\" From a list of integers, remove all elements that occur more than once.\n Keep order of elements left the same as in the input.\n >>> remove_duplicates([1, 2, 3, 2, 4])\n [1, 3, 4]\n \"\"\"\n from collections import Counter\n count = Counter(numbers)\n return [num for num in numbers if count[num] == 1]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/28": [{"task_id": "HumanEval/28", "solution": "from typing import List\ndef concatenate(strings: List[str]) -> str:\n \"\"\" Concatenate list of strings into a single string\n >>> concatenate([])\n ''\n >>> concatenate(['a', 'b', 'c'])\n 'abc'\n \"\"\"\n return ''.join(strings)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/27": [{"task_id": "HumanEval/27", "solution": "def flip_case(string: str) -> str:\n \"\"\" For a given string, flip lowercase characters to uppercase and uppercase to lowercase.\n >>> flip_case('Hello')\n 'hELLO'\n \"\"\"\n return ''.join(char.lower() if char.isupper() else char.upper() for char in string)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/29": [{"task_id": "HumanEval/29", "solution": "from typing import List\ndef filter_by_prefix(strings: List[str], prefix: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that start with a given prefix.\n >>> filter_by_prefix([], 'a')\n []\n >>> filter_by_prefix(['abc', 'bcd', 'cde', 'array'], 'a')\n ['abc', 'array']\n \"\"\"\n return [s for s in strings if s.startswith(prefix)]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/30": [{"task_id": "HumanEval/30", "solution": "def get_positive(l: list):\n \"\"\"Return only positive numbers in the list.\n >>> get_positive([-1, 2, -4, 5, 6])\n [2, 5, 6]\n >>> get_positive([5, 3, -5, 2, -3, 3, 9, 0, 123, 1, -10])\n [5, 3, 2, 3, 9, 123, 1]\n \"\"\"\n return [x for x in l if x > 0]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/32": [{"task_id": "HumanEval/32", "solution": "import math\ndef poly(xs: list, x: float):\n \"\"\"\n Evaluates polynomial with coefficients 
xs at point x.\n return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n\n \"\"\"\n return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)])\ndef find_zero(xs: list):\n \"\"\" xs are coefficients of a polynomial.\n find_zero find x such that poly(x) = 0.\n find_zero returns only only zero point, even if there are many.\n Moreover, find_zero only takes list xs having even number of coefficients\n and largest non zero coefficient as it guarantees\n a solution.\n >>> round(find_zero([1, 2]), 2) # f(x) = 1 + 2x\n -0.5\n >>> round(find_zero([-6, 11, -6, 1]), 2) # (x - 1) * (x - 2) * (x - 3) = -6 + 11x - 6x^2 + x^3\n 1.0\n \"\"\"\n def f(x):\n return poly(xs, x)\n\n # Initial guess for the root\n x0 = 0.0\n tolerance = 1e-5\n max_iterations = 1000\n\n for _ in range(max_iterations):\n fx0 = f(x0)\n if abs(fx0) < tolerance:\n return x0\n # Using the derivative of the polynomial for the Newton-Raphson method\n df = sum([coeff * i * math.pow(x0, i - 1) for i, coeff in enumerate(xs) if i > 0])\n if df == 0:\n break\n x0 = x0 - fx0 / df\n\n return x0", "base_status": "fail", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/31": [{"task_id": "HumanEval/31", "solution": "def is_prime(n):\n \"\"\"Return true if a given number is prime, and false otherwise.\n >>> is_prime(6)\n False\n >>> is_prime(101)\n True\n >>> is_prime(11)\n True\n >>> is_prime(13441)\n True\n >>> is_prime(61)\n True\n >>> is_prime(4)\n False\n >>> is_prime(1)\n False\n \"\"\"\n if n <= 1:\n return False\n if n == 2:\n return True\n if n % 2 == 0:\n return False\n for i in range(3, int(n**0.5) + 1, 2):\n if n % i == 0:\n return False\n return True", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/34": [{"task_id": "HumanEval/34", "solution": "def unique(l: list):\n \"\"\"Return sorted unique elements in a list\n >>> unique([5, 3, 5, 2, 3, 3, 9, 0, 123])\n [0, 2, 3, 5, 9, 123]\n \"\"\"\n return sorted(set(l))", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/33": [{"task_id": "HumanEval/33", "solution": "def sort_third(l: list):\n \"\"\"This function takes a list l and returns a list l' such that\n l' is identical to l in the indicies that are not divisible by three, while its values at the indicies that are divisible by three are equal\n to the values of the corresponding indicies of l, but sorted.\n >>> sort_third([1, 2, 3])\n [1, 2, 3]\n >>> sort_third([5, 6, 3, 4, 8, 9, 2])\n [2, 6, 3, 4, 8, 9, 5]\n \"\"\"\n indices = [i for i in range(len(l)) if i % 3 == 0]\n sorted_values = sorted([l[i] for i in indices])\n for i, value in zip(indices, sorted_values):\n l[i] = value\n return l", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/35": [{"task_id": "HumanEval/35", "solution": "def max_element(l: list):\n \"\"\"Return maximum element in the list.\n >>> max_element([1, 2, 3])\n 3\n >>> max_element([5, 3, -5, 2, -3, 3, 9, 0, 123, 1, -10])\n 123\n \"\"\"\n return max(l)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/36": [{"task_id": "HumanEval/36", "solution": "def fizz_buzz(n: int):\n count = 0\n for i in range(n):\n if (i % 11 == 0 or i % 13 == 0) and '7' in str(i):\n count += 1\n return count", "base_status": "fail", "plus_status": null, "base_fail_tests": [[78]], "plus_fail_tests": []}], "HumanEval/37": [{"task_id": "HumanEval/37", "solution": "def sort_even(l: list):\n 
\"\"\"This function takes a list l and returns a list l' such that\n l' is identical to l in the odd indicies, while its values at the even indicies are equal\n to the values of the even indicies of l, but sorted.\n >>> sort_even([1, 2, 3])\n [1, 2, 3]\n >>> sort_even([5, 6, 3, 4])\n [3, 6, 5, 4]\n \"\"\"\n even_indices_values = [l[i] for i in range(0, len(l), 2)]\n even_indices_values.sort()\n result = l[:]\n for i, val in zip(range(0, len(l), 2), even_indices_values):\n result[i] = val\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/38": [{"task_id": "HumanEval/38", "solution": "def decode_cyclic(s: str):\n \"\"\"\n takes as input string encoded with encode_cyclic function. Returns decoded string.\n \"\"\"\n # split string to groups. Each of length 3.\n groups = [s[(3 * i):min((3 * i + 3), len(s))] for i in range((len(s) + 2) // 3)]\n # cycle elements in each group back to original position. Unless group has fewer elements than 3.\n groups = [(group[-1] + group[:-1]) if len(group) == 3 else group for group in groups]\n return \"\".join(groups)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/40": [{"task_id": "HumanEval/40", "solution": "def triples_sum_to_zero(l: list):\n \"\"\"\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list that\n sum to zero, and False otherwise.\n\n >>> triples_sum_to_zero([1, 3, 5, 0])\n False\n >>> triples_sum_to_zero([1, 3, -2, 1])\n True\n >>> triples_sum_to_zero([1, 2, 3, 7])\n False\n >>> triples_sum_to_zero([2, 4, -5, 3, 9, 7])\n True\n >>> triples_sum_to_zero([1])\n False\n \"\"\"\n l.sort()\n for i in range(len(l) - 2):\n left, right = i + 1, len(l) - 1\n while left < right:\n current_sum = l[i] + l[left] + l[right]\n if current_sum == 0:\n return True\n elif current_sum < 0:\n left += 1\n else:\n right -= 1\n return False", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/39": [{"task_id": "HumanEval/39", "solution": "def is_prime(num):\n if num < 2:\n return False\n for i in range(2, int(num**0.5) + 1):\n if num % i == 0:\n return False\n return True\ndef prime_fib(n: int):\n \"\"\"\n prime_fib returns n-th number that is a Fibonacci number and it's also prime.\n >>> prime_fib(1)\n 2\n >>> prime_fib(2)\n 3\n >>> prime_fib(3)\n 5\n >>> prime_fib(4)\n 13\n >>> prime_fib(5)\n 89\n \"\"\"\n a, b = 0, 1\n count = 0\n while True:\n a, b = b, a + b\n if is_prime(b):\n count += 1\n if count == n:\n return b", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/42": [{"task_id": "HumanEval/42", "solution": "def incr_list(l: list):\n \"\"\"Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9, 0, 123])\n [6, 4, 6, 3, 4, 4, 10, 1, 124]\n \"\"\"\n return [x + 1 for x in l]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/41": [{"task_id": "HumanEval/41", "solution": "def car_race_collision(n: int):\n \"\"\"\n Imagine a road that's a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set of n cars\n are driving right to left. The two sets of cars start out being very far from\n each other. All cars move in the same speed. 
Two cars are said to collide\n when a car that's moving left to right hits a car that's moving right to left.\n However, the cars are infinitely sturdy and strong; as a result, they continue moving\n in their trajectory as if they did not collide.\n\n This function outputs the number of such collisions.\n \"\"\"\n return n * n", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/43": [{"task_id": "HumanEval/43", "solution": "def pairs_sum_to_zero(l):\n \"\"\"\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n sum to zero, and False otherwise.\n >>> pairs_sum_to_zero([1, 3, 5, 0])\n False\n >>> pairs_sum_to_zero([1, 3, -2, 1])\n False\n >>> pairs_sum_to_zero([1, 2, 3, 7])\n False\n >>> pairs_sum_to_zero([2, 4, -5, 3, 5, 7])\n True\n >>> pairs_sum_to_zero([1])\n False\n \"\"\"\n seen = set()\n for num in l:\n if -num in seen:\n return True\n seen.add(num)\n return False", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/44": [{"task_id": "HumanEval/44", "solution": "def change_base(x: int, base: int):\n \"\"\"Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than 10.\n >>> change_base(8, 3)\n '22'\n >>> change_base(8, 2)\n '1000'\n >>> change_base(7, 2)\n '111'\n \"\"\"\n if x == 0:\n return '0'\n digits = []\n while x:\n digits.append(str(x % base))\n x //= base\n return ''.join(digits[::-1])", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/45": [{"task_id": "HumanEval/45", "solution": "def triangle_area(a, h):\n \"\"\"Given length of a side and high return area for a triangle.\n >>> triangle_area(5, 3)\n 7.5\n \"\"\"\n return 0.5 * a * h", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/46": [{"task_id": "HumanEval/46", "solution": "def fib4(n: int):\n if n == 0:\n return 0\n elif n == 1:\n return 0\n elif n == 2:\n return 2\n elif n == 3:\n return 0\n else:\n a, b, c, d = 0, 0, 2, 0\n for i in range(4, n + 1):\n next_value = a + b + c + d\n a, b, c, d = b, c, d, next_value\n return d", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/47": [{"task_id": "HumanEval/47", "solution": "def median(l: list):\n \"\"\"Return median of elements in the list l.\n >>> median([3, 1, 2, 4, 5])\n 3\n >>> median([-10, 4, 6, 1000, 10, 20])\n 15.0\n \"\"\"\n sorted_l = sorted(l)\n n = len(sorted_l)\n if n % 2 == 1:\n return sorted_l[n // 2]\n else:\n return (sorted_l[n // 2 - 1] + sorted_l[n // 2]) / 2", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/48": [{"task_id": "HumanEval/48", "solution": "def is_palindrome(text: str):\n \"\"\"\n Checks if given string is a palindrome\n >>> is_palindrome('')\n True\n >>> is_palindrome('aba')\n True\n >>> is_palindrome('aaaaa')\n True\n >>> is_palindrome('zbcd')\n False\n \"\"\"\n return text == text[::-1]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/49": [{"task_id": "HumanEval/49", "solution": "def modp(n: int, p: int):\n \"\"\"Return 2^n modulo p (be aware of numerics).\n >>> modp(3, 5)\n 3\n >>> modp(1101, 101)\n 2\n >>> modp(0, 101)\n 1\n >>> modp(3, 11)\n 8\n >>> modp(100, 101)\n 1\n \"\"\"\n result = 1\n base = 2\n while n > 0:\n 
if n % 2 == 1:\n result = (result * base) % p\n base = (base * base) % p\n n //= 2\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/50": [{"task_id": "HumanEval/50", "solution": "def decode_shift(s: str):\n \"\"\"\n takes as input string encoded with encode_shift function. Returns decoded string.\n \"\"\"\n return \"\".join([chr(((ord(ch) - 5 - ord(\"a\")) % 26) + ord(\"a\")) for ch in s])", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/52": [{"task_id": "HumanEval/52", "solution": "def below_threshold(l: list, t: int):\n \"\"\"Return True if all numbers in the list l are below threshold t.\n >>> below_threshold([1, 2, 4, 10], 100)\n True\n >>> below_threshold([1, 20, 4, 10], 5)\n False\n \"\"\"\n return all(x < t for x in l)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/51": [{"task_id": "HumanEval/51", "solution": "def remove_vowels(text):\n \"\"\"\n remove_vowels is a function that takes string and returns string without vowels.\n >>> remove_vowels('')\n ''\n >>> remove_vowels(\"abcdef\\nghijklm\")\n 'bcdf\\nghjklm'\n >>> remove_vowels('abcdef')\n 'bcdf'\n >>> remove_vowels('aaaaa')\n ''\n >>> remove_vowels('aaBAA')\n 'B'\n >>> remove_vowels('zbcd')\n 'zbcd'\n \"\"\"\n vowels = 'aeiouAEIOU'\n return ''.join(char for char in text if char not in vowels)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/54": [{"task_id": "HumanEval/54", "solution": "def same_chars(s0: str, s1: str):\n \"\"\"\n Check if two words have the same characters.\n >>> same_chars('eabcdzzzz', 'dddzzzzzzzddeddabc')\n True\n >>> same_chars('abcd', 'dddddddabc')\n True\n >>> same_chars('dddddddabc', 'abcd')\n True\n >>> same_chars('eabcd', 'dddddddabc')\n False\n >>> same_chars('abcd', 'dddddddabce')\n False\n >>> same_chars('eabcdzzzz', 'dddzzzzzzzddddabc')\n False\n \"\"\"\n return sorted(s0) == sorted(s1)", "base_status": "fail", "plus_status": null, "base_fail_tests": [["eabcdzzzz", "dddzzzzzzzddeddabc"]], "plus_fail_tests": []}], "HumanEval/53": [{"task_id": "HumanEval/53", "solution": "def add(x: int, y: int):\n \"\"\"Add two numbers x and y\n >>> add(2, 3)\n 5\n >>> add(5, 7)\n 12\n \"\"\"\n return x + y", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/55": [{"task_id": "HumanEval/55", "solution": "def fib(n: int):\n \"\"\"Return n-th Fibonacci number.\n >>> fib(10)\n 55\n >>> fib(1)\n 1\n >>> fib(8)\n 21\n \"\"\"\n if n <= 0:\n return 0\n elif n == 1:\n return 1\n else:\n a, b = 0, 1\n for _ in range(2, n + 1):\n a, b = b, a + b\n return b", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/56": [{"task_id": "HumanEval/56", "solution": "def correct_bracketing(brackets: str):\n \"\"\" brackets is a string of \"<\" and \">\".\n return True if every opening bracket has a corresponding closing bracket.\n\n >>> correct_bracketing(\"<\")\n False\n >>> correct_bracketing(\"<>\")\n True\n >>> correct_bracketing(\"<<><>>\")\n True\n >>> correct_bracketing(\"><<><\")\n False\n \"\"\"\n balance = 0\n for bracket in brackets:\n if bracket == '<':\n balance += 1\n elif bracket == '>':\n balance -= 1\n if balance < 0:\n return False\n return balance == 0", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/57": [{"task_id": 
"HumanEval/57", "solution": "def monotonic(l: list):\n \"\"\"Return True if list elements are monotonically increasing or decreasing.\n >>> monotonic([1, 2, 4, 20])\n True\n >>> monotonic([1, 20, 4, 10])\n False\n >>> monotonic([4, 1, 0, -10])\n True\n \"\"\"\n return all(x <= y for x, y in zip(l, l[1:])) or all(x >= y for x, y in zip(l, l[1:]))", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/58": [{"task_id": "HumanEval/58", "solution": "def common(l1: list, l2: list):\n \"\"\"Return sorted unique common elements for two lists.\n >>> common([1, 4, 3, 34, 653, 2, 5], [5, 7, 1, 5, 9, 653, 121])\n [1, 5, 653]\n >>> common([5, 3, 2, 8], [3, 2])\n [2, 3]\n\n \"\"\"\n return sorted(set(l1) & set(l2))", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/59": [{"task_id": "HumanEval/59", "solution": "def largest_prime_factor(n: int):\n \"\"\"Return the largest prime factor of n. Assume n > 1 and is not a prime.\n >>> largest_prime_factor(13195)\n 29\n >>> largest_prime_factor(2048)\n 2\n \"\"\"\n def is_prime(num):\n if num <= 1:\n return False\n if num <= 3:\n return True\n if num % 2 == 0 or num % 3 == 0:\n return False\n i = 5\n while i * i <= num:\n if num % i == 0 or num % (i + 2) == 0:\n return False\n i += 6\n return True\n\n max_prime = -1\n while n % 2 == 0:\n max_prime = 2\n n //= 2\n for i in range(3, int(n**0.5) + 1, 2):\n while n % i == 0:\n max_prime = i\n n //= i\n if n > 2:\n max_prime = n\n return max_prime", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/60": [{"task_id": "HumanEval/60", "solution": "def sum_to_n(n: int):\n \"\"\"sum_to_n is a function that sums numbers from 1 to n.\n >>> sum_to_n(30)\n 465\n >>> sum_to_n(100)\n 5050\n >>> sum_to_n(5)\n 15\n >>> sum_to_n(10)\n 55\n >>> sum_to_n(1)\n 1\n \"\"\"\n return sum(range(1, n + 1))", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/61": [{"task_id": "HumanEval/61", "solution": "def correct_bracketing(brackets: str):\n \"\"\" brackets is a string of \"(\" and \")\".\n return True if every opening bracket has a corresponding closing bracket.\n\n >>> correct_bracketing(\"(\")\n False\n >>> correct_bracketing(\"()\")\n True\n >>> correct_bracketing(\"(()())\")\n True\n >>> correct_bracketing(\")(()\")\n False\n \"\"\"\n stack = []\n for bracket in brackets:\n if bracket == '(':\n stack.append(bracket)\n elif bracket == ')':\n if not stack:\n return False\n stack.pop()\n return not stack", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/62": [{"task_id": "HumanEval/62", "solution": "def derivative(xs: list):\n \"\"\" xs represent coefficients of a polynomial.\n xs[0] + xs[1] * x + xs[2] * x^2 + ....\n Return derivative of this polynomial in the same form.\n >>> derivative([3, 1, 2, 4, 5])\n [1, 4, 12, 20]\n >>> derivative([1, 2, 3])\n [2, 6]\n \"\"\"\n return [xs[i] * i for i in range(1, len(xs))]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}, {"task_id": "HumanEval/62", "solution": "def derivative(xs):\n # Check if the list is empty\n if len(xs) == 0:\n return []\n # Initialize the resulting list\n result = []\n # Iterate through the coefficients\n for i in range(1, len(xs)):\n # Calculate the derivative of each term and append it to the result\n result.append(xs[i] * i)\n return result", "base_status": "pass", 
"plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/63": [{"task_id": "HumanEval/63", "solution": "def fibfib(n: int):\n if n == 0:\n return 0\n elif n == 1:\n return 0\n elif n == 2:\n return 1\n else:\n a, b, c = 0, 0, 1\n for i in range(3, n + 1):\n a, b, c = b, c, a + b + c\n return c", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/64": [{"task_id": "HumanEval/64", "solution": "def vowels_count(s):\n vowels = 'aeiouAEIOU'\n count = 0\n s = s.lower()\n for i, char in enumerate(s):\n if char in vowels:\n count += 1\n elif char == 'y' and i == len(s) - 1:\n count += 1\n return count", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/65": [{"task_id": "HumanEval/65", "solution": "def circular_shift(x, shift):\n x_str = str(x)\n num_digits = len(x_str)\n if shift > num_digits:\n return x_str[::-1]\n else:\n shifted_str = x_str[-shift:] + x_str[:-shift]\n return shifted_str", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/66": [{"task_id": "HumanEval/66", "solution": "def digitSum(s):\n \"\"\"Task\n Write a function that takes a string as input and returns the sum of the upper characters only'\n ASCII codes.\n\n Examples:\n digitSum(\"\") => 0\n digitSum(\"abAB\") => 131\n digitSum(\"abcCd\") => 67\n digitSum(\"helloE\") => 69\n digitSum(\"woArBld\") => 131\n digitSum(\"aAaaaXa\") => 153\n \"\"\"\n return sum(ord(char) for char in s if char.isupper())", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/67": [{"task_id": "HumanEval/67", "solution": "def fruit_distribution(s, n):\n # Split the string to extract the number of apples and oranges\n parts = s.split()\n apples = int(parts[0])\n oranges = int(parts[3])\n \n # Calculate the number of mangoes\n mangoes = n - apples - oranges\n \n return mangoes", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/68": [{"task_id": "HumanEval/68", "solution": "def pluck(arr):\n \"\"\"\n Given an array representing a branch of a tree that has non-negative integer nodes\n your task is to pluck one of the nodes and return it.\n The plucked node should be the node with the smallest even value.\n If multiple nodes with the same smallest even value are found return the node that has smallest index.\n\n The plucked node should be returned in a list, [ smalest_value, its index ],\n If there are no even values or the given array is empty, return []\n \"\"\"\n if not arr:\n return []\n\n smallest_even_value = float('inf')\n smallest_index = -1\n\n for index, value in enumerate(arr):\n if value % 2 == 0 and value < smallest_even_value:\n smallest_even_value = value\n smallest_index = index\n\n if smallest_index == -1:\n return []\n else:\n return [smallest_even_value, smallest_index]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/69": [{"task_id": "HumanEval/69", "solution": "def search(lst):\n from collections import Counter\n freq = Counter(lst)\n result = -1\n for num, count in freq.items():\n if count >= num:\n result = max(result, num)\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/70": [{"task_id": "HumanEval/70", "solution": "def strange_sort_list(lst):\n '''\n Given list of integers, return list in strange order.\n Strange sorting, 
is when you start with the minimum value,\n then maximum of the remaining integers, then minimum and so on.\n\n Examples:\n strange_sort_list([1, 2, 3, 4]) == [1, 4, 2, 3]\n strange_sort_list([5, 5, 5, 5]) == [5, 5, 5, 5]\n strange_sort_list([]) == []\n '''\n if not lst:\n return []\n \n sorted_lst = sorted(lst)\n result = []\n left = 0\n right = len(sorted_lst) - 1\n toggle = True\n \n while left <= right:\n if toggle:\n result.append(sorted_lst[left])\n left += 1\n else:\n result.append(sorted_lst[right])\n right -= 1\n toggle = not toggle\n \n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/71": [{"task_id": "HumanEval/71", "solution": "def triangle_area(a, b, c):\n '''\n Given the lengths of the three sides of a triangle. Return the area of\n the triangle rounded to 2 decimal points if the three sides form a valid triangle.\n Otherwise return -1\n Three sides make a valid triangle when the sum of any two sides is greater\n than the third side.\n Example:\n triangle_area(3, 4, 5) == 6.00\n triangle_area(1, 2, 10) == -1\n '''\n \n # Check if the sides form a valid triangle\n if a + b > c and a + c > b and b + c > a:\n # Calculate the semi-perimeter\n s = (a + b + c) / 2\n # Calculate the area using Heron's formula\n area = (s * (s - a) * (s - b) * (s - c)) ** 0.5\n # Return the area rounded to 2 decimal points\n return round(area, 2)\n else:\n return -1", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/72": [{"task_id": "HumanEval/72", "solution": "def will_it_fly(q, w):\n '''\n Write a function that returns True if the object q will fly, and False otherwise.\n The object q will fly if it's balanced (it is a palindromic list) and the sum of its elements is less than or equal the maximum possible weight w.\n\n Example:\n will_it_fly([1, 2], 5) \u279e False \n # 1+2 is less than the maximum possible weight, but it's unbalanced.\n\n will_it_fly([3, 2, 3], 1) \u279e False\n # it's balanced, but 3+2+3 is more than the maximum possible weight.\n\n will_it_fly([3, 2, 3], 9) \u279e True\n # 3+2+3 is less than the maximum possible weight, and it's balanced.\n\n will_it_fly([3], 5) \u279e True\n # 3 is less than the maximum possible weight, and it's balanced.\n '''\n \n # Check if the list is palindromic\n is_palindromic = q == q[::-1]\n \n # Calculate the sum of the elements\n total_weight = sum(q)\n \n # Check if the total weight is less than or equal to the maximum possible weight\n is_within_weight_limit = total_weight <= w\n \n # The object will fly if it's balanced and within the weight limit\n return is_palindromic and is_within_weight_limit", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/73": [{"task_id": "HumanEval/73", "solution": "def smallest_change(arr):\n n = len(arr)\n changes = 0\n for i in range(n // 2):\n if arr[i] != arr[n - i - 1]:\n changes += 1\n return changes", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/74": [{"task_id": "HumanEval/74", "solution": "def total_match(lst1, lst2):\n '''\n Write a function that accepts two lists of strings and returns the list that has \n total number of chars in the all strings of the list less than the other list.\n\n if the two lists have the same number of chars, return the first list.\n\n Examples\n total_match([], []) \u279e []\n total_match(['hi', 'admin'], ['hI', 'Hi']) \u279e ['hI', 'Hi']\n 
total_match(['hi', 'admin'], ['hi', 'hi', 'admin', 'project']) \u279e ['hi', 'admin']\n total_match(['hi', 'admin'], ['hI', 'hi', 'hi']) \u279e ['hI', 'hi', 'hi']\n total_match(['4'], ['1', '2', '3', '4', '5']) \u279e ['4']\n '''\n \n def total_chars(lst):\n return sum(len(s) for s in lst)\n\n total_chars_lst1 = total_chars(lst1)\n total_chars_lst2 = total_chars(lst2)\n\n if total_chars_lst1 < total_chars_lst2:\n return lst1\n else:\n return lst2", "base_status": "fail", "plus_status": null, "base_fail_tests": [[["hi", "admin"], ["hI", "hi", "hii"]]], "plus_fail_tests": []}], "HumanEval/75": [{"task_id": "HumanEval/75", "solution": "def is_prime(n):\n if n <= 1:\n return False\n for i in range(2, int(n**0.5) + 1):\n if n % i == 0:\n return False\n return True\ndef is_multiply_prime(a):\n primes = [i for i in range(2, 100) if is_prime(i)]\n for i in range(len(primes)):\n for j in range(i, len(primes)):\n for k in range(j, len(primes)):\n if primes[i] * primes[j] * primes[k] == a:\n return True\n return False", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/76": [{"task_id": "HumanEval/76", "solution": "def is_simple_power(x, n):\n if n == 1:\n return x == 1\n power = 1\n while power < x:\n power *= n\n return power == x", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/77": [{"task_id": "HumanEval/77", "solution": "def iscube(a):\n \"\"\"\n Write a function that takes an integer a and returns True \n if this integer is a cube of some integer number.\n Note: you may assume the input is always valid.\n Examples:\n iscube(1) ==> True\n iscube(2) ==> False\n iscube(-1) ==> True\n iscube(64) ==> True\n iscube(0) ==> True\n iscube(180) ==> False\n \"\"\"\n cube_root = abs(a) ** (1.0 / 3.0)\n return round(cube_root) ** 3 == abs(a)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/78": [{"task_id": "HumanEval/78", "solution": "def hex_key(num):\n \"\"\"You have been tasked to write a function that receives \n a hexadecimal number as a string and counts the number of hexadecimal \n digits that are primes (prime number, or a prime, is a natural number \n greater than 1 that is not a product of two smaller natural numbers).\n Hexadecimal digits are 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F.\n Prime numbers are 2, 3, 5, 7, 11, 13, 17,...\n So you have to determine a number of the following digits: 2, 3, 5, 7, \n B (=decimal 11), D (=decimal 13).\n Note: you may assume the input is always correct or empty string, \n and symbols A,B,C,D,E,F are always uppercase.\n Examples:\n For num = \"AB\" the output should be 1.\n For num = \"1077E\" the output should be 2.\n For num = \"ABED1A33\" the output should be 4.\n For num = \"123456789ABCDEF0\" the output should be 6.\n For num = \"2020\" the output should be 2.\n \"\"\"\n prime_hex_digits = {'2', '3', '5', '7', 'B', 'D'}\n count = 0\n for digit in num:\n if digit in prime_hex_digits:\n count += 1\n return count", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/79": [{"task_id": "HumanEval/79", "solution": "def decimal_to_binary(decimal):\n \"\"\"You will be given a number in decimal form and your task is to convert it to\n binary format. The function should return a string, with each character representing a binary\n number. 
Each character in the string will be '0' or '1'.\n\n There will be an extra couple of characters 'db' at the beginning and at the end of the string.\n The extra characters are there to help with the format.\n\n Examples:\n decimal_to_binary(15) # returns \"db1111db\"\n decimal_to_binary(32) # returns \"db100000db\"\n \"\"\"\n binary_str = bin(decimal)[2:]\n return 'db' + binary_str + 'db'", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/80": [{"task_id": "HumanEval/80", "solution": "def is_happy(s):\n if len(s) < 3:\n return False\n \n for i in range(len(s) - 2):\n if len(set(s[i:i+3])) != 3:\n return False\n \n return True", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/81": [{"task_id": "HumanEval/81", "solution": "def numerical_letter_grade(grades):\n letter_grades = []\n for gpa in grades:\n if gpa == 4.0:\n letter_grades.append('A+')\n elif gpa > 3.7:\n letter_grades.append('A')\n elif gpa > 3.3:\n letter_grades.append('A-')\n elif gpa > 3.0:\n letter_grades.append('B+')\n elif gpa > 2.7:\n letter_grades.append('B')\n elif gpa > 2.3:\n letter_grades.append('B-')\n elif gpa > 2.0:\n letter_grades.append('C+')\n elif gpa > 1.7:\n letter_grades.append('C')\n elif gpa > 1.3:\n letter_grades.append('C-')\n elif gpa > 1.0:\n letter_grades.append('D+')\n elif gpa > 0.7:\n letter_grades.append('D')\n elif gpa > 0.0:\n letter_grades.append('D-')\n elif gpa == 0.0:\n letter_grades.append('E')\n return letter_grades", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/82": [{"task_id": "HumanEval/82", "solution": "def prime_length(string):\n def is_prime(n):\n if n <= 1:\n return False\n for i in range(2, int(n**0.5) + 1):\n if n % i == 0:\n return False\n return True\n\n return is_prime(len(string))", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/83": [{"task_id": "HumanEval/83", "solution": "def starts_one_ends(n):\n \"\"\"\n Given a positive integer n, return the count of the numbers of n-digit\n positive integers that start or end with 1.\n \"\"\"\n if n == 1:\n return 2 # The numbers 1 and 10\n else:\n return 2 * (10 ** (n - 1)) - (10 ** (n - 2))", "base_status": "fail", "plus_status": null, "base_fail_tests": [[1]], "plus_fail_tests": []}], "HumanEval/85": [{"task_id": "HumanEval/85", "solution": "def add(lst):\n \"\"\"Given a non-empty list of integers lst. 
add the even elements that are at odd indices.\n\n Examples:\n add([4, 2, 6, 7]) ==> 2 \n \"\"\"\n return sum(x for i, x in enumerate(lst) if i % 2 != 0 and x % 2 == 0)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/84": [{"task_id": "HumanEval/84", "solution": "def solve(N):\n \"\"\"Given a positive integer N, return the total sum of its digits in binary.\n \n Example\n For N = 1000, the sum of digits will be 1 the output should be \"1\".\n For N = 150, the sum of digits will be 6 the output should be \"110\".\n For N = 147, the sum of digits will be 12 the output should be \"1100\".\n \n Variables:\n @N integer\n Constraints: 0 \u2264 N \u2264 10000.\n Output:\n a string of binary number\n \"\"\"\n \n # Convert the number to a string to iterate over each digit\n digits = str(N)\n \n # Calculate the sum of the digits\n sum_of_digits = sum(int(digit) for digit in digits)\n \n # Convert the sum to binary and return it as a string\n return bin(sum_of_digits)[2:]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/87": [{"task_id": "HumanEval/87", "solution": "def get_row(lst, x):\n coordinates = []\n for i, row in enumerate(lst):\n for j, value in enumerate(row):\n if value == x:\n coordinates.append((i, j))\n coordinates.sort(key=lambda coord: (coord[0], -coord[1]))\n return coordinates", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/86": [{"task_id": "HumanEval/86", "solution": "def anti_shuffle(s):\n \"\"\"\n Write a function that takes a string and returns an ordered version of it.\n Ordered version of string, is a string where all words (separated by space)\n are replaced by a new word where all the characters arranged in\n ascending order based on ascii value.\n Note: You should keep the order of words and blank spaces in the sentence.\n\n For example:\n anti_shuffle('Hi') returns 'Hi'\n anti_shuffle('hello') returns 'ehllo'\n anti_shuffle('Hello World!!!') returns 'Hello !!!Wdlor'\n \"\"\"\n words = s.split(' ')\n ordered_words = [''.join(sorted(word)) for word in words]\n return ' '.join(ordered_words)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/89": [{"task_id": "HumanEval/89", "solution": "def encrypt(s):\n encrypted_string = ''\n for char in s:\n if char.isalpha():\n shift = 4 # 2 multiplied by 2\n if char.islower():\n encrypted_char = chr((ord(char) - ord('a') + shift) % 26 + ord('a'))\n else:\n encrypted_char = chr((ord(char) - ord('A') + shift) % 26 + ord('A'))\n encrypted_string += encrypted_char\n else:\n encrypted_string += char\n return encrypted_string", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/88": [{"task_id": "HumanEval/88", "solution": "def sort_array(array):\n \"\"\"\n Given an array of non-negative integers, return a copy of the given array after sorting,\n you will sort the given array in ascending order if the sum( first index value, last index value) is odd,\n or sort it in descending order if the sum( first index value, last index value) is even.\n\n Note:\n * don't change the given array.\n\n Examples:\n * sort_array([]) => []\n * sort_array([5]) => [5]\n * sort_array([2, 4, 3, 0, 1, 5]) => [0, 1, 2, 3, 4, 5]\n * sort_array([2, 4, 3, 0, 1, 5, 6]) => [6, 5, 4, 3, 2, 1, 0]\n \"\"\"\n if not array:\n return []\n\n first_value = array[0]\n last_value = array[-1]\n sum_values = 
first_value + last_value\n\n if sum_values % 2 == 0:\n return sorted(array, reverse=True)\n else:\n return sorted(array)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/90": [{"task_id": "HumanEval/90", "solution": "def next_smallest(lst):\n if len(lst) < 2:\n return None\n \n unique_sorted = sorted(set(lst))\n if len(unique_sorted) < 2:\n return None\n \n return unique_sorted[1]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/91": [{"task_id": "HumanEval/91", "solution": "def is_bored(S):\n \"\"\"\n You'll be given a string of words, and your task is to count the number\n of boredoms. A boredom is a sentence that starts with the word \"I\".\n Sentences are delimited by '.', '?' or '!'.\n \n For example:\n >>> is_bored(\"Hello world\")\n 0\n >>> is_bored(\"The sky is blue. The sun is shining. I love this weather\")\n 1\n \"\"\"\n sentences = re.split(r'[.!?]', S)\n count = 0\n for sentence in sentences:\n words = sentence.strip().split()\n if words and words[0].lower() == 'i':\n count += 1\n return count", "base_status": "fail", "plus_status": null, "base_fail_tests": [["Hello world"]], "plus_fail_tests": []}], "HumanEval/93": [{"task_id": "HumanEval/93", "solution": "def encode(message):\n def swap_case(char):\n if char.isupper():\n return char.lower()\n else:\n return char.upper()\n\n def replace_vowel(char):\n vowels = 'aeiouAEIOU'\n if char in vowels:\n if char.islower():\n return chr(((ord(char) - ord('a') + 2) % 26) + ord('a'))\n else:\n return chr(((ord(char) - ord('A') + 2) % 26) + ord('A'))\n else:\n return char\n\n encoded_message = ''\n for char in message:\n swapped_char = swap_case(char)\n replaced_char = replace_vowel(swapped_char)\n encoded_message += replaced_char\n\n return encoded_message", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/92": [{"task_id": "HumanEval/92", "solution": "def any_int(x, y, z):\n '''\n Create a function that takes 3 numbers.\n Returns true if one of the numbers is equal to the sum of the other two, and all numbers are integers.\n Returns false in any other cases.\n \n Examples\n any_int(5, 2, 7) \u279e True\n \n any_int(3, 2, 2) \u279e False\n\n any_int(3, -2, 1) \u279e True\n \n any_int(3.6, -2.2, 2) \u279e False\n \n '''\n if isinstance(x, int) and isinstance(y, int) and isinstance(z, int):\n return x == y + z or y == x + z or z == x + y\n else:\n return False", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/94": [{"task_id": "HumanEval/94", "solution": "def skjkasdkd(lst):\n def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\n def sum_of_digits(n):\n return sum(int(digit) for digit in str(n))\n\n largest_prime = None\n for num in lst:\n if is_prime(num):\n if largest_prime is None or num > largest_prime:\n largest_prime = num\n\n if largest_prime is not None:\n return sum_of_digits(largest_prime)\n else:\n return 0", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}, {"task_id": "HumanEval/94", "solution": "import math", "base_status": "fail", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/95": [{"task_id": "HumanEval/95", "solution": "def check_dict_case(dict):\n \"\"\"\n Given a 
dictionary, return True if all keys are strings in lower \n case or all keys are strings in upper case, else return False.\n The function should return False if the given dictionary is empty.\n Examples:\n check_dict_case({\"a\":\"apple\", \"b\":\"banana\"}) should return True.\n check_dict_case({\"a\":\"apple\", \"A\":\"banana\", \"B\":\"banana\"}) should return False.\n check_dict_case({\"a\":\"apple\", 8:\"banana\", \"a\":\"apple\"}) should return False.\n check_dict_case({\"Name\":\"John\", \"Age\":\"36\", \"City\":\"Houston\"}) should return False.\n check_dict_case({\"STATE\":\"NC\", \"ZIP\":\"12345\" }) should return True.\n \"\"\"\n if not dict:\n return False\n \n lower_case = all(isinstance(key, str) and key.islower() for key in dict)\n upper_case = all(isinstance(key, str) and key.isupper() for key in dict)\n \n return lower_case or upper_case", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/96": [{"task_id": "HumanEval/96", "solution": "def count_up_to(n):\n def is_prime(num):\n if num <= 1:\n return False\n for i in range(2, int(num**0.5) + 1):\n if num % i == 0:\n return False\n return True\n\n primes = []\n for i in range(2, n):\n if is_prime(i):\n primes.append(i)\n return primes", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/97": [{"task_id": "HumanEval/97", "solution": "def multiply(a, b):\n \"\"\"Complete the function that takes two integers and returns \n the product of their unit digits.\n Assume the input is always valid.\n Examples:\n multiply(148, 412) should return 16.\n multiply(19, 28) should return 72.\n multiply(2020, 1851) should return 0.\n multiply(14,-15) should return 20.\n \"\"\"\n unit_digit_a = abs(a) % 10\n unit_digit_b = abs(b) % 10\n return unit_digit_a * unit_digit_b", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/98": [{"task_id": "HumanEval/98", "solution": "def count_upper(s):\n \"\"\"\n Given a string s, count the number of uppercase vowels in even indices.\n \n For example:\n count_upper('aBCdEf') returns 1\n count_upper('abcdefg') returns 0\n count_upper('dBBE') returns 0\n \"\"\"\n vowels = 'AEIOU'\n count = 0\n for i in range(0, len(s), 2):\n if s[i] in vowels:\n count += 1\n return count", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/100": [{"task_id": "HumanEval/100", "solution": "def make_a_pile(n):\n \"\"\"\n Given a positive integer n, you have to make a pile of n levels of stones.\n The first level has n stones.\n The number of stones in the next level is:\n - the next odd number if n is odd.\n - the next even number if n is even.\n Return the number of stones in each level in a list, where element at index\n i represents the number of stones in the level (i+1).\n\n Examples:\n >>> make_a_pile(3)\n [3, 5, 7]\n \"\"\"\n pile = []\n current_stones = n\n for i in range(n):\n pile.append(current_stones)\n if n % 2 == 0:\n current_stones += 2\n else:\n current_stones += 2\n return pile", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/99": [{"task_id": "HumanEval/99", "solution": "def closest_integer(value):\n '''\n Create a function that takes a value (string) representing a number\n and returns the closest integer to it. 
If the number is equidistant\n from two integers, round it away from zero.\n\n Examples\n >>> closest_integer(\"10\")\n 10\n >>> closest_integer(\"15.3\")\n 15\n\n Note:\n Rounding away from zero means that if the given number is equidistant\n from two integers, the one you should return is the one that is the\n farthest from zero. For example closest_integer(\"14.5\") should\n return 15 and closest_integer(\"-14.5\") should return -15.\n '''\n number = float(value)\n if number % 1 == 0.5:\n if number > 0:\n return int(number + 0.5)\n else:\n return int(number - 0.5)\n else:\n return round(number)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/101": [{"task_id": "HumanEval/101", "solution": "def words_string(s):\n \"\"\"\n You will be given a string of words separated by commas or spaces. Your task is\n to split the string into words and return an array of the words.\n \n For example:\n words_string(\"Hi, my name is John\") == [\"Hi\", \"my\", \"name\", \"is\", \"John\"]\n words_string(\"One, two, three, four, five, six\") == [\"One\", \"two\", \"three\", \"four\", \"five\", \"six\"]\n \"\"\"\n import re\n words = re.split(r'[,\\s]+', s)\n return words", "base_status": "fail", "plus_status": null, "base_fail_tests": [["One,, two, three, four, five, six,"]], "plus_fail_tests": []}], "HumanEval/102": [{"task_id": "HumanEval/102", "solution": "def choose_num(x, y):\n \"\"\"This function takes two positive numbers x and y and returns the\n biggest even integer number that is in the range [x, y] inclusive. If \n there's no such number, then the function should return -1.\n\n For example:\n choose_num(12, 15) = 14\n choose_num(13, 12) = -1\n \"\"\"\n if x > y:\n return -1\n \n for num in range(y, x-1, -1):\n if num % 2 == 0:\n return num\n \n return -1", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/104": [{"task_id": "HumanEval/104", "solution": "def unique_digits(x):\n \"\"\"Given a list of positive integers x. return a sorted list of all \n elements that hasn't any even digit.\n\n Note: Returned list should be sorted in increasing order.\n \n For example:\n >>> unique_digits([15, 33, 1422, 1])\n [1, 15, 33]\n >>> unique_digits([152, 323, 1422, 10])\n []\n \"\"\"\n def has_even_digit(num):\n for digit in str(num):\n if int(digit) % 2 == 0:\n return True\n return False\n\n result = [num for num in x if not has_even_digit(num)]\n return sorted(result)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/103": [{"task_id": "HumanEval/103", "solution": "def rounded_avg(n, m):\n if n > m:\n return -1\n avg = round((n + m) / 2)\n return bin(avg)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/106": [{"task_id": "HumanEval/106", "solution": "def f(n):\n \"\"\" Implement the function f that takes n as a parameter,\n and returns a list of size n, such that the value of the element at index i is the factorial of i if i is even\n or the sum of numbers from 1 to i otherwise.\n i starts from 1.\n the factorial of i is the multiplication of the numbers from 1 to i (1 * 2 * ... 
* i).\n Example:\n f(5) == [1, 2, 6, 24, 15]\n \"\"\"\n def factorial(num):\n result = 1\n for i in range(1, num + 1):\n result *= i\n return result\n\n def sum_of_numbers(num):\n return sum(range(1, num + 1))\n\n result_list = []\n for i in range(1, n + 1):\n if i % 2 == 0:\n result_list.append(factorial(i))\n else:\n result_list.append(sum_of_numbers(i))\n return result_list", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/105": [{"task_id": "HumanEval/105", "solution": "def by_length(arr):\n # Define a dictionary to map numbers to their corresponding names\n num_to_name = {1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five', 6: 'Six', 7: 'Seven', 8: 'Eight', 9: 'Nine'}\n \n # Filter the array to include only numbers between 1 and 9 inclusive\n filtered_arr = [num for num in arr if 1 <= num <= 9]\n \n # Sort the filtered array\n sorted_arr = sorted(filtered_arr)\n \n # Reverse the sorted array\n reversed_arr = sorted_arr[::-1]\n \n # Replace each digit by its corresponding name\n result = [num_to_name[num] for num in reversed_arr]\n \n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/107": [{"task_id": "HumanEval/107", "solution": "def even_odd_palindrome(n):\n \"\"\"\n Given a positive integer n, return a tuple that has the number of even and odd\n integer palindromes that fall within the range(1, n), inclusive.\n\n Example 1:\n\n Input: 3\n Output: (1, 2)\n Explanation:\n Integer palindrome are 1, 2, 3. one of them is even, and two of them are odd.\n\n Example 2:\n\n Input: 12\n Output: (4, 6)\n Explanation:\n Integer palindrome are 1, 2, 3, 4, 5, 6, 7, 8, 9, 11. four of them are even, and 6 of them are odd.\n\n Note:\n 1. 1 <= n <= 10^3\n 2. returned tuple has the number of even and odd integer palindromes respectively.\n \"\"\"\n def is_palindrome(num):\n return str(num) == str(num)[::-1]\n\n even_count = 0\n odd_count = 0\n\n for i in range(1, n + 1):\n if is_palindrome(i):\n if i % 2 == 0:\n even_count += 1\n else:\n odd_count += 1\n\n return (even_count, odd_count)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/108": [{"task_id": "HumanEval/108", "solution": "def count_nums(arr):\n \"\"\"\n Write a function count_nums which takes an array of integers and returns\n the number of elements which has a sum of digits > 0.\n If a number is negative, then its first signed digit will be negative:\n e.g. 
-123 has signed digits -1, 2, and 3.\n >>> count_nums([]) == 0\n >>> count_nums([-1, 11, -11]) == 1\n >>> count_nums([1, 1, 2]) == 3\n \"\"\"\n def sum_of_digits(num):\n if num == 0:\n return 0\n sum_digits = 0\n num = abs(num)\n while num > 0:\n sum_digits += num % 10\n num //= 10\n if num < 0:\n sum_digits = -sum_digits\n return sum_digits\n\n count = 0\n for num in arr:\n if sum_of_digits(num) > 0:\n count += 1\n return count", "base_status": "fail", "plus_status": null, "base_fail_tests": [[[-1, -2, 0]]], "plus_fail_tests": []}], "HumanEval/109": [{"task_id": "HumanEval/109", "solution": "def move_one_ball(arr):\n if not arr:\n return True\n\n n = len(arr)\n sorted_arr = sorted(arr)\n\n for i in range(n):\n if arr[i:] + arr[:i] == sorted_arr:\n return True\n\n return False", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/110": [{"task_id": "HumanEval/110", "solution": "def exchange(lst1, lst2):\n \"\"\"In this problem, you will implement a function that takes two lists of numbers,\n and determines whether it is possible to perform an exchange of elements\n between them to make lst1 a list of only even numbers.\n There is no limit on the number of exchanged elements between lst1 and lst2.\n If it is possible to exchange elements between the lst1 and lst2 to make\n all the elements of lst1 to be even, return \"YES\".\n Otherwise, return \"NO\".\n For example:\n exchange([1, 2, 3, 4], [1, 2, 3, 4]) => \"YES\"\n exchange([1, 2, 3, 4], [1, 5, 3, 4]) => \"NO\"\n It is assumed that the input lists will be non-empty.\n \"\"\"\n \n # Check if there are enough even numbers in lst2 to replace all odd numbers in lst1\n odd_count_lst1 = sum(1 for num in lst1 if num % 2 != 0)\n even_count_lst2 = sum(1 for num in lst2 if num % 2 == 0)\n \n if even_count_lst2 >= odd_count_lst1:\n return \"YES\"\n else:\n return \"NO\"", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}, {"task_id": "HumanEval/110", "solution": "def exchange(lst1, lst2):\n # Check if all elements in lst1 are already even\n if all(num % 2 == 0 for num in lst1):\n return 'YES'\n \n # Count the number of odd numbers in lst1\n count_odd = sum(1 for num in lst1 if num % 2 != 0)\n\n # Count the number of even numbers in lst2\n count_even = sum(1 for num in lst2 if num % 2 == 0)\n\n # If the count of odd numbers in lst1 is 0 or the count of even numbers in lst2 is greater than or equal to the count of odd numbers in lst1,\n # it is possible to exchange elements to make lst1 a list of only even numbers\n if count_odd == 0 or count_even >= count_odd:\n return 'YES'\n\n # Iterate through lst2 and check if any odd number can be swapped\n for num in lst2:\n if num % 2 == 1:\n return 'YES'\n\n return 'NO'", "base_status": "fail", "plus_status": null, "base_fail_tests": [[[1, 2, 3, 4], [1, 5, 3, 4]]], "plus_fail_tests": []}], "HumanEval/111": [{"task_id": "HumanEval/111", "solution": "def histogram(test):\n \"\"\"Given a string representing a space separated lowercase letters, return a dictionary\n of the letter with the most repetition and containing the corresponding count.\n If several letters have the same occurrence, return all of them.\n \n Example:\n histogram('a b c') == {'a': 1, 'b': 1, 'c': 1}\n histogram('a b b a') == {'a': 2, 'b': 2}\n histogram('a b c a b') == {'a': 2, 'b': 2}\n histogram('b b b b a') == {'b': 4}\n histogram('') == {}\n\n \"\"\"\n if not test:\n return {}\n\n words = test.split()\n freq = {}\n for word in words:\n if word in 
freq:\n freq[word] += 1\n else:\n freq[word] = 1\n\n max_freq = max(freq.values())\n result = {k: v for k, v in freq.items() if v == max_freq}\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/112": [{"task_id": "HumanEval/112", "solution": "def reverse_delete(s, c):\n result = ''.join([char for char in s if char not in c])\n is_palindrome = result == result[::-1]\n return (result, is_palindrome)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/113": [{"task_id": "HumanEval/113", "solution": "def odd_count(lst):\n result = []\n for s in lst:\n odd_count = sum(1 for char in s if int(char) % 2 != 0)\n result.append(\"the number of odd elements {0}n the str{0}ng {0} of the {0}nput.\".format(odd_count))\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/115": [{"task_id": "HumanEval/115", "solution": "def max_fill(grid, capacity):\n total_water = 0\n for row in grid:\n total_water += sum(row)\n return math.ceil(total_water / capacity)", "base_status": "fail", "plus_status": null, "base_fail_tests": [[[[0, 0, 1, 0], [0, 1, 0, 0], [1, 1, 1, 1]], 1]], "plus_fail_tests": []}], "HumanEval/114": [{"task_id": "HumanEval/114", "solution": "def minSubArraySum(nums):\n min_sum = float('inf')\n current_sum = 0\n for num in nums:\n current_sum = min(num, current_sum + num)\n min_sum = min(min_sum, current_sum)\n return min_sum", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/116": [{"task_id": "HumanEval/116", "solution": "def sort_array(arr):\n \"\"\"\n In this Kata, you have to sort an array of non-negative integers according to\n number of ones in their binary representation in ascending order.\n For similar number of ones, sort based on decimal value.\n\n It must be implemented like this:\n >>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]\n >>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]\n >>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]\n \"\"\"\n return sorted(arr, key=lambda x: (bin(x).count('1'), x))", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/117": [{"task_id": "HumanEval/117", "solution": "def select_words(s, n):\n def count_consonants(word):\n vowels = 'aeiouAEIOU'\n return sum(1 for char in word if char.isalpha() and char not in vowels)\n\n if not s:\n return []\n\n words = s.split()\n result = [word for word in words if count_consonants(word) == n]\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/118": [{"task_id": "HumanEval/118", "solution": "def get_closest_vowel(word):\n vowels = 'aeiouAEIOU'\n length = len(word)\n for i in range(length - 2, 0, -1):\n if word[i] in vowels and word[i - 1] not in vowels and word[i + 1] not in vowels:\n return word[i]\n return ''", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/119": [{"task_id": "HumanEval/119", "solution": "def match_parens(lst):\n '''\n You are given a list of two strings, both strings consist of open\n parentheses '(' or close parentheses ')' only.\n Your job is to check if it is possible to concatenate the two strings in\n some order, that the resulting string will be good.\n A string S is considered to be good if and only if all parentheses in S\n are balanced. 
For example: the string '(())()' is good, while the string\n '())' is not.\n Return 'Yes' if there's a way to make a good string, and return 'No' otherwise.\n\n Examples:\n match_parens(['()(', ')']) == 'Yes'\n match_parens([')', ')']) == 'No'\n '''\n\n def is_balanced(s):\n balance = 0\n for char in s:\n if char == '(':\n balance += 1\n else:\n balance -= 1\n if balance < 0:\n return False\n return balance == 0\n\n s1, s2 = lst\n if is_balanced(s1 + s2) or is_balanced(s2 + s1):\n return 'Yes'\n else:\n return 'No'", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/121": [{"task_id": "HumanEval/121", "solution": "def solution(lst):\n \"\"\"Given a non-empty list of integers, return the sum of all of the odd elements that are in even positions.\n \n Examples\n solution([5, 8, 7, 1]) ==> 12\n solution([3, 3, 3, 3, 3]) ==> 9\n solution([30, 13, 24, 321]) ==>0\n \"\"\"\n return sum(x for i, x in enumerate(lst) if i % 2 == 0 and x % 2 != 0)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/120": [{"task_id": "HumanEval/120", "solution": "def maximum(arr, k):\n \"\"\"\n Given an array arr of integers and a positive integer k, return a sorted list \n of length k with the maximum k numbers in arr.\n\n Example 1:\n\n Input: arr = [-3, -4, 5], k = 3\n Output: [-4, -3, 5]\n\n Example 2:\n\n Input: arr = [4, -4, 4], k = 2\n Output: [4, 4]\n\n Example 3:\n\n Input: arr = [-3, 2, 1, 2, -1, -2, 1], k = 1\n Output: [2]\n\n Note:\n 1. The length of the array will be in the range of [1, 1000].\n 2. The elements in the array will be in the range of [-1000, 1000].\n 3. 0 <= k <= len(arr)\n \"\"\"\n return sorted(sorted(arr, reverse=True)[:k])", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/122": [{"task_id": "HumanEval/122", "solution": "def add_elements(arr, k):\n \"\"\"\n Given a non-empty array of integers arr and an integer k, return\n the sum of the elements with at most two digits from the first k elements of arr.\n\n Example:\n\n Input: arr = [111,21,3,4000,5,6,7,8,9], k = 4\n Output: 24 # sum of 21 + 3\n\n Constraints:\n 1. 1 <= len(arr) <= 100\n 2. 1 <= k <= len(arr)\n \"\"\"\n sum_of_elements = 0\n for i in range(k):\n if 1 <= arr[i] <= 99:\n sum_of_elements += arr[i]\n return sum_of_elements", "base_status": "fail", "plus_status": null, "base_fail_tests": [[[1, -2, -3, 41, 57, 76, 87, 88, 99], 3]], "plus_fail_tests": []}], "HumanEval/123": [{"task_id": "HumanEval/123", "solution": "def get_odd_collatz(n):\n \"\"\"\n Given a positive integer n, return a sorted list that has the odd numbers in collatz sequence.\n\n The Collatz conjecture is a conjecture in mathematics that concerns a sequence defined\n as follows: start with any positive integer n. Then each term is obtained from the \n previous term as follows: if the previous term is even, the next term is one half of \n the previous term. If the previous term is odd, the next term is 3 times the previous\n term plus 1. The conjecture is that no matter what value of n, the sequence will always reach 1.\n\n Note:\n 1. Collatz(1) is [1].\n 2. 
returned list sorted in increasing order.\n\n For example:\n get_odd_collatz(5) returns [1, 5] # The collatz sequence for 5 is [5, 16, 8, 4, 2, 1], so the odd numbers are only 1, and 5.\n \"\"\"\n odd_numbers = []\n while n != 1:\n if n % 2 != 0:\n odd_numbers.append(n)\n if n % 2 == 0:\n n = n // 2\n else:\n n = 3 * n + 1\n odd_numbers.append(1)\n return sorted(odd_numbers)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/124": [{"task_id": "HumanEval/124", "solution": "def valid_date(date):\n \"\"\"You have to write a function which validates a given date string and\n returns True if the date is valid otherwise False.\n The date is valid if all of the following rules are satisfied:\n 1. The date string is not empty.\n 2. The number of days is not less than 1 or higher than 31 days for months 1,3,5,7,8,10,12. And the number of days is not less than 1 or higher than 30 days for months 4,6,9,11. And, the number of days is not less than 1 or higher than 29 for the month 2.\n 3. The months should not be less than 1 or higher than 12.\n 4. The date should be in the format: mm-dd-yyyy\n\n for example:\n valid_date('03-11-2000') => True\n\n valid_date('15-01-2012') => False\n\n valid_date('04-0-2040') => False\n\n valid_date('06-04-2020') => True\n\n valid_date('06/04/2020') => False\n \"\"\"\n if not date or len(date) != 10 or date[2] != '-' or date[5] != '-':\n return False\n\n try:\n month, day, year = map(int, date.split('-'))\n except ValueError:\n return False\n\n if month < 1 or month > 12:\n return False\n\n if day < 1:\n return False\n\n if month in [1, 3, 5, 7, 8, 10, 12]:\n if day > 31:\n return False\n elif month in [4, 6, 9, 11]:\n if day > 30:\n return False\n elif month == 2:\n if day > 29:\n return False\n\n return True", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/125": [{"task_id": "HumanEval/125", "solution": "def split_words(txt):\n if ' ' in txt:\n return txt.split()\n elif ',' in txt:\n return txt.replace(',', ' ').split()\n else:\n return sum(1 for c in txt if c.islower() and (ord(c) - ord('a')) % 2 == 1)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/126": [{"task_id": "HumanEval/126", "solution": "def is_sorted(lst):\n '''\n Given a list of numbers, return whether or not they are sorted\n in ascending order. If list has more than 1 duplicate of the same\n number, return False. 
Assume no negative numbers and only integers.\n\n Examples\n is_sorted([5]) \u279e True\n is_sorted([1, 2, 3, 4, 5]) \u279e True\n is_sorted([1, 3, 2, 4, 5]) \u279e False\n is_sorted([1, 2, 3, 4, 5, 6]) \u279e True\n is_sorted([1, 2, 3, 4, 5, 6, 7]) \u279e True\n is_sorted([1, 3, 2, 4, 5, 6, 7]) \u279e False\n is_sorted([1, 2, 2, 3, 3, 4]) \u279e True\n is_sorted([1, 2, 2, 2, 3, 4]) \u279e False\n '''\n \n for i in range(len(lst) - 1):\n if lst[i] > lst[i + 1]:\n return False\n if lst[i] == lst[i + 1]:\n if lst.count(lst[i]) > 2:\n return False\n return True", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/127": [{"task_id": "HumanEval/127", "solution": "def intersection(interval1, interval2):\n def is_prime(n):\n if n <= 1:\n return False\n for i in range(2, int(n**0.5) + 1):\n if n % i == 0:\n return False\n return True\n\n start1, end1 = interval1\n start2, end2 = interval2\n\n intersection_start = max(start1, start2)\n intersection_end = min(end1, end2)\n\n if intersection_start > intersection_end:\n return \"NO\"\n\n intersection_length = intersection_end - intersection_start + 1\n\n if is_prime(intersection_length):\n return \"YES\"\n else:\n return \"NO\"", "base_status": "fail", "plus_status": null, "base_fail_tests": [[[-1, 1], [0, 4]]], "plus_fail_tests": []}], "HumanEval/129": [{"task_id": "HumanEval/129", "solution": "def minPath(grid, k):\n import heapq\n N = len(grid)\n directions = [(0, 1), (1, 0), (0, -1), (-1, 0)]\n \n def is_valid(x, y):\n return 0 <= x < N and 0 <= y < N\n \n min_heap = []\n for i in range(N):\n for j in range(N):\n heapq.heappush(min_heap, ([grid[i][j]], i, j))\n \n while min_heap:\n path, x, y = heapq.heappop(min_heap)\n if len(path) == k:\n return path\n for dx, dy in directions:\n nx, ny = x + dx, y + dy\n if is_valid(nx, ny):\n new_path = path + [grid[nx][ny]]\n heapq.heappush(min_heap, (new_path, nx, ny))\n \n return []", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/128": [{"task_id": "HumanEval/128", "solution": "def prod_signs(arr):\n if not arr:\n return None\n \n product_of_signs = 1\n sum_of_magnitudes = 0\n \n for num in arr:\n if num > 0:\n product_of_signs *= 1\n elif num < 0:\n product_of_signs *= -1\n else:\n product_of_signs *= 0\n sum_of_magnitudes += abs(num)\n \n return product_of_signs * sum_of_magnitudes", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/131": [{"task_id": "HumanEval/131", "solution": "def digits(n):\n \"\"\"Given a positive integer n, return the product of the odd digits.\n Return 0 if all digits are even.\n For example:\n digits(1) == 1\n digits(4) == 0\n digits(235) == 15\n \"\"\"\n product = 1\n has_odd_digit = False\n while n > 0:\n digit = n % 10\n if digit % 2 != 0:\n product *= digit\n has_odd_digit = True\n n //= 10\n return product if has_odd_digit else 0", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/130": [{"task_id": "HumanEval/130", "solution": "def tri(n):\n if n == 0:\n return [1]\n elif n == 1:\n return [1, 3]\n else:\n sequence = [1, 3]\n for i in range(2, n + 1):\n if i % 2 == 0:\n sequence.append(1 + i // 2)\n else:\n sequence.append(sequence[i - 1] + sequence[i - 2] + sequence[i + 1])\n return sequence", "base_status": "fail", "plus_status": null, "base_fail_tests": [[3]], "plus_fail_tests": []}], "HumanEval/132": [{"task_id": "HumanEval/132", "solution": "def 
is_nested(string):\n stack = []\n for char in string:\n if char == '[':\n stack.append(char)\n elif char == ']':\n if stack:\n stack.pop()\n else:\n return False\n if len(stack) == 0:\n for i in range(len(string) - 1):\n if string[i] == '[' and string[i + 1] == ']':\n return False\n return True\n return False", "base_status": "fail", "plus_status": null, "base_fail_tests": [["[[]]"]], "plus_fail_tests": []}, {"task_id": "HumanEval/132", "solution": "def is_nested(string):\n stack = []\n has_opening_bracket = False\n for char in string:\n if char == '[':\n stack.append(char)\n has_opening_bracket = True\n elif char == ']':\n if len(stack) > 0 and stack[-1] == '[':\n stack.pop()\n elif has_opening_bracket:\n return True\n return False", "base_status": "fail", "plus_status": null, "base_fail_tests": [["[[]]"]], "plus_fail_tests": []}], "HumanEval/133": [{"task_id": "HumanEval/133", "solution": "def sum_squares(lst):\n \"\"\"You are given a list of numbers.\n You need to return the sum of squared numbers in the given list,\n round each element in the list to the upper int(Ceiling) first.\n Examples:\n For lst = [1,2,3] the output should be 14\n For lst = [1,4,9] the output should be 98\n For lst = [1,3,5,7] the output should be 84\n For lst = [1.4,4.2,0] the output should be 29\n For lst = [-2.4,1,1] the output should be 6\n \"\"\"\n import math\n \n # Round each element to the upper integer (ceiling)\n rounded_lst = [math.ceil(num) for num in lst]\n \n # Calculate the sum of squared numbers\n sum_of_squares = sum(num ** 2 for num in rounded_lst)\n \n return sum_of_squares", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/134": [{"task_id": "HumanEval/134", "solution": "def check_if_last_char_is_a_letter(txt):\n if not txt or txt[-1] == ' ':\n return False\n words = txt.split()\n if words:\n last_word = words[-1]\n if last_word.isalpha() and len(last_word) == 1:\n return True\n return False", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/135": [{"task_id": "HumanEval/135", "solution": "def can_arrange(arr):\n for i in range(1, len(arr)):\n if arr[i] < arr[i-1]:\n return i\n return -1", "base_status": "fail", "plus_status": null, "base_fail_tests": [[[4, 8, 5, 7, 3]]], "plus_fail_tests": []}], "HumanEval/136": [{"task_id": "HumanEval/136", "solution": "def largest_smallest_integers(lst):\n '''\n Create a function that returns a tuple (a, b), where 'a' is\n the largest of negative integers, and 'b' is the smallest\n of positive integers in a list.\n If there is no negative or positive integers, return them as None.\n\n Examples:\n largest_smallest_integers([2, 4, 1, 3, 5, 7]) == (None, 1)\n largest_smallest_integers([]) == (None, None)\n largest_smallest_integers([0]) == (None, None)\n '''\n largest_negative = None\n smallest_positive = None\n\n for num in lst:\n if num < 0:\n if largest_negative is None or num > largest_negative:\n largest_negative = num\n elif num > 0:\n if smallest_positive is None or num < smallest_positive:\n smallest_positive = num\n\n return (largest_negative, smallest_positive)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/137": [{"task_id": "HumanEval/137", "solution": "def compare_one(a, b):\n \"\"\"\n Create a function that takes integers, floats, or strings representing\n real numbers, and returns the larger variable in its given variable type.\n Return None if the values are equal.\n Note: If a 
real number is represented as a string, the floating point might be . or ,\n\n compare_one(1, 2.5) \u279e 2.5\n compare_one(1, \"2,3\") \u279e \"2,3\"\n compare_one(\"5,1\", \"6\") \u279e \"6\"\n compare_one(\"1\", 1) \u279e None\n \"\"\"\n \n def to_float_if_possible(value):\n try:\n if isinstance(value, str):\n value = value.replace(',', '.')\n return float(value)\n except ValueError:\n return value\n\n a_float = to_float_if_possible(a)\n b_float = to_float_if_possible(b)\n\n if a_float > b_float:\n return a\n elif a_float < b_float:\n return b\n else:\n return None", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/138": [{"task_id": "HumanEval/138", "solution": "def is_equal_to_sum_even(n):\n \"\"\"Evaluate whether the given number n can be written as the sum of exactly 4 positive even numbers\"\"\"\n if n < 8:\n return False\n return n % 2 == 0", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/139": [{"task_id": "HumanEval/139", "solution": "def special_factorial(n):\n \"\"\"The Brazilian factorial is defined as:\n brazilian_factorial(n) = n! * (n-1)! * (n-2)! * ... * 1!\n where n > 0\n\n For example:\n >>> special_factorial(4)\n 288\n\n The function will receive an integer as input and should return the special\n factorial of this integer.\n \"\"\"\n def factorial(num):\n if num == 0 or num == 1:\n return 1\n else:\n return num * factorial(num - 1)\n\n result = 1\n for i in range(1, n + 1):\n result *= factorial(i)\n\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/140": [{"task_id": "HumanEval/140", "solution": "def fix_spaces(text):\n \"\"\"\n Given a string text, replace all spaces in it with underscores,\n and if a string has more than 2 consecutive spaces,\n then replace all consecutive spaces with -\n \"\"\"\n import re\n \n # Replace all spaces with underscores\n text = text.replace(' ', '_')\n \n # Replace sequences of more than 2 underscores with a single hyphen\n text = re.sub(r'_+', lambda m: '-' if len(m.group(0)) > 2 else m.group(0), text)\n \n return text", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/141": [{"task_id": "HumanEval/141", "solution": "def file_name_check(file_name):\n \"\"\"Create a function which takes a string representing a file's name, and returns\n 'Yes' if the the file's name is valid, and returns 'No' otherwise.\n A file's name is considered to be valid if and only if all the following conditions \n are met:\n - There should not be more than three digits ('0'-'9') in the file's name.\n - The file's name contains exactly one dot '.'\n - The substring before the dot should not be empty, and it starts with a letter from \n the latin alphapet ('a'-'z' and 'A'-'Z').\n - The substring after the dot should be one of these: ['txt', 'exe', 'dll']\n Examples:\n file_name_check(\"example.txt\") # => 'Yes'\n file_name_check(\"1example.dll\") # => 'No' (the name should start with a latin alphapet letter)\n \"\"\"\n import re\n\n # Check if there are more than three digits\n if len(re.findall(r'\\d', file_name)) > 3:\n return 'No'\n\n # Split the file name into parts\n parts = file_name.split('.', 1)\n if len(parts) != 2:\n return 'No'\n\n name_part, extension_part = parts\n\n # Check if the name part is empty or does not start with a latin alphabet letter\n if not name_part or not re.match(r'^[a-zA-Z]', name_part):\n return 
'No'\n\n # Check if the extension part is valid\n if extension_part not in ['txt', 'exe', 'dll']:\n return 'No'\n\n return 'Yes'", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/142": [{"task_id": "HumanEval/142", "solution": "def sum_squares(lst):\n \"\"\"\n This function will take a list of integers. For all entries in the list, the function shall square the integer entry if its index is a \n multiple of 3 and will cube the integer entry if its index is a multiple of 4 and not a multiple of 3. The function will not \n change the entries in the list whose indexes are not a multiple of 3 or 4. The function shall then return the sum of all entries. \n \n Examples:\n For lst = [1,2,3] the output should be 6\n For lst = [] the output should be 0\n For lst = [-1,-5,2,-1,-5] the output should be -126\n \"\"\"\n for i in range(len(lst)):\n if i % 3 == 0:\n lst[i] = lst[i] ** 2\n elif i % 4 == 0:\n lst[i] = lst[i] ** 3\n return sum(lst)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/143": [{"task_id": "HumanEval/143", "solution": "def is_prime(n):\n if n <= 1:\n return False\n for i in range(2, int(n**0.5) + 1):\n if n % i == 0:\n return False\n return True\ndef words_in_sentence(sentence):\n words = sentence.split()\n prime_length_words = [word for word in words if is_prime(len(word))]\n return ' '.join(prime_length_words)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/144": [{"task_id": "HumanEval/144", "solution": "def simplify(x, n):\n \"\"\"Your task is to implement a function that will simplify the expression\n x * n. The function returns True if x * n evaluates to a whole number and False\n otherwise. 
Both x and n, are string representation of a fraction, and have the following format,\n / where both numerator and denominator are positive whole numbers.\n\n You can assume that x, and n are valid fractions, and do not have zero as denominator.\n\n simplify(\"1/5\", \"5/1\") = True\n simplify(\"1/6\", \"2/1\") = False\n simplify(\"7/10\", \"10/2\") = False\n \"\"\"\n \n def parse_fraction(fraction):\n numerator, denominator = map(int, fraction.split('/'))\n return numerator, denominator\n\n x_numerator, x_denominator = parse_fraction(x)\n n_numerator, n_denominator = parse_fraction(n)\n\n # Multiply the fractions\n result_numerator = x_numerator * n_numerator\n result_denominator = x_denominator * n_denominator\n\n # Check if the result is a whole number\n return result_numerator % result_denominator == 0", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/145": [{"task_id": "HumanEval/145", "solution": "def order_by_points(nums):\n \"\"\"\n Write a function which sorts the given list of integers\n in ascending order according to the sum of their digits.\n Note: if there are several items with similar sum of their digits,\n order them based on their index in original list.\n\n For example:\n >>> order_by_points([1, 11, -1, -11, -12]) == [-1, -11, 1, -12, 11]\n >>> order_by_points([]) == []\n \"\"\"\n def digit_sum(n):\n return sum(int(digit) for digit in str(abs(n)))\n\n return sorted(nums, key=lambda x: (digit_sum(x), nums.index(x)))", "base_status": "fail", "plus_status": null, "base_fail_tests": [[[1, 11, -1, -11, -12]]], "plus_fail_tests": []}], "HumanEval/146": [{"task_id": "HumanEval/146", "solution": "def specialFilter(nums):\n def is_odd_digit(digit):\n return digit in [1, 3, 5, 7, 9]\n\n count = 0\n for num in nums:\n if num > 10:\n str_num = str(abs(num))\n if is_odd_digit(int(str_num[0])) and is_odd_digit(int(str_num[-1])):\n count += 1\n return count", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/148": [{"task_id": "HumanEval/148", "solution": "def bf(planet1, planet2):\n planets = ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune']\n if planet1 not in planets or planet2 not in planets:\n return ()\n \n index1 = planets.index(planet1)\n index2 = planets.index(planet2)\n \n if index1 < index2:\n return tuple(planets[index1 + 1:index2])\n elif index1 > index2:\n return tuple(planets[index2 + 1:index1])\n else:\n return ()", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/147": [{"task_id": "HumanEval/147", "solution": "def get_max_triples(n):\n \"\"\"\n You are given a positive integer n. 
You have to create an integer array a of length n.\n For each i (1 \u2264 i \u2264 n), the value of a[i] = i * i - i + 1.\n Return the number of triples (a[i], a[j], a[k]) of a where i < j < k,\n and a[i] + a[j] + a[k] is a multiple of 3.\n\n Example :\n Input: n = 5\n Output: 1\n Explanation:\n a = [1, 3, 7, 13, 21]\n The only valid triple is (1, 7, 13).\n \"\"\"\n a = [i * i - i + 1 for i in range(1, n + 1)]\n count = 0\n for i in range(n):\n for j in range(i + 1, n):\n for k in range(j + 1, n):\n if (a[i] + a[j] + a[k]) % 3 == 0:\n count += 1\n return count", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/150": [{"task_id": "HumanEval/150", "solution": "def x_or_y(n, x, y):\n def is_prime(num):\n if num <= 1:\n return False\n for i in range(2, int(num**0.5) + 1):\n if num % i == 0:\n return False\n return True\n\n if is_prime(n):\n return x\n else:\n return y", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/149": [{"task_id": "HumanEval/149", "solution": "def sorted_list_sum(lst):\n \"\"\"Write a function that accepts a list of strings as a parameter,\n deletes the strings that have odd lengths from it,\n and returns the resulted list with a sorted order,\n The list is always a list of strings and never an array of numbers,\n and it may contain duplicates.\n The order of the list should be ascending by length of each word, and you\n should return the list sorted by that rule.\n If two words have the same length, sort the list alphabetically.\n The function should return a list of strings in sorted order.\n You may assume that all words will have the same length.\n For example:\n assert list_sort([\"aa\", \"a\", \"aaa\"]) => [\"aa\"]\n assert list_sort([\"ab\", \"a\", \"aaa\", \"cd\"]) => [\"ab\", \"cd\"]\n \"\"\"\n # Filter out strings with odd lengths\n filtered_lst = [s for s in lst if len(s) % 2 == 0]\n \n # Sort the list by length and then alphabetically\n sorted_lst = sorted(filtered_lst, key=lambda x: (len(x), x))\n \n return sorted_lst", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/152": [{"task_id": "HumanEval/152", "solution": "def compare(game, guess):\n result = []\n for score, g in zip(game, guess):\n if score == g:\n result.append(0)\n else:\n result.append(abs(score - g))\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/151": [{"task_id": "HumanEval/151", "solution": "def double_the_difference(lst):\n '''\n Given a list of numbers, return the sum of squares of the numbers\n in the list that are odd. Ignore numbers that are negative or not integers.\n \n double_the_difference([1, 3, 2, 0]) == 1 + 9 + 0 + 0 = 10\n double_the_difference([-1, -2, 0]) == 0\n double_the_difference([9, -2]) == 81\n double_the_difference([0]) == 0 \n \n If the input list is empty, return 0.\n '''\n return sum(x**2 for x in lst if isinstance(x, int) and x > 0 and x % 2 != 0)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/154": [{"task_id": "HumanEval/154", "solution": "def cycpattern_check(a, b):\n \"\"\"You are given 2 words. 
You need to return True if the second word or any of its rotations is a substring in the first word\n cycpattern_check(\"abcd\",\"abd\") => False\n cycpattern_check(\"hello\",\"ell\") => True\n cycpattern_check(\"whassup\",\"psus\") => False\n cycpattern_check(\"abab\",\"baa\") => True\n cycpattern_check(\"efef\",\"eeff\") => False\n cycpattern_check(\"himenss\",\"simen\") => True\n \"\"\"\n len_b = len(b)\n for i in range(len_b):\n rotated_b = b[i:] + b[:i]\n if rotated_b in a:\n return True\n return False", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/153": [{"task_id": "HumanEval/153", "solution": "def Strongest_Extension(class_name, extensions):\n def calculate_strength(extension):\n CAP = sum(1 for char in extension if char.isupper())\n SM = sum(1 for char in extension if char.islower())\n return CAP - SM\n\n strongest_extension = max(extensions, key=calculate_strength)\n return f'{class_name}.{strongest_extension}'", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/155": [{"task_id": "HumanEval/155", "solution": "def even_odd_count(num):\n \"\"\"Given an integer, return a tuple that has the number of even and odd digits respectively.\n\n Example:\n even_odd_count(-12) ==> (1, 1)\n even_odd_count(123) ==> (1, 2)\n \"\"\"\n num = abs(num) # Convert to positive to handle negative numbers\n even_count = 0\n odd_count = 0\n while num > 0:\n digit = num % 10\n if digit % 2 == 0:\n even_count += 1\n else:\n odd_count += 1\n num //= 10\n return (even_count, odd_count)", "base_status": "fail", "plus_status": null, "base_fail_tests": [[0]], "plus_fail_tests": []}], "HumanEval/156": [{"task_id": "HumanEval/156", "solution": "def int_to_mini_roman(number):\n roman_numerals = [\n (1000, 'm'), (900, 'cm'), (500, 'd'), (400, 'cd'),\n (100, 'c'), (90, 'xc'), (50, 'l'), (40, 'xl'),\n (10, 'x'), (9, 'ix'), (5, 'v'), (4, 'iv'), (1, 'i')\n ]\n result = ''\n for value, numeral in roman_numerals:\n while number >= value:\n result += numeral\n number -= value\n return result", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/158": [{"task_id": "HumanEval/158", "solution": "def find_max(words):\n def unique_char_count(word):\n return len(set(word))\n\n max_unique_count = 0\n max_unique_words = []\n\n for word in words:\n unique_count = unique_char_count(word)\n if unique_count > max_unique_count:\n max_unique_count = unique_count\n max_unique_words = [word]\n elif unique_count == max_unique_count:\n max_unique_words.append(word)\n\n if max_unique_words:\n return min(max_unique_words)\n else:\n return None", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/157": [{"task_id": "HumanEval/157", "solution": "def right_angle_triangle(a, b, c):\n '''\n Given the lengths of the three sides of a triangle. 
Return True if the three\n sides form a right-angled triangle, False otherwise.\n A right-angled triangle is a triangle in which one angle is right angle or \n 90 degree.\n Example:\n right_angle_triangle(3, 4, 5) == True\n right_angle_triangle(1, 2, 3) == False\n '''\n # Sort the sides to ensure the largest side is considered as the hypotenuse\n sides = sorted([a, b, c])\n return sides[0]**2 + sides[1]**2 == sides[2]**2", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/159": [{"task_id": "HumanEval/159", "solution": "def eat(number, need, remaining):\n \"\"\"\n You're a hungry rabbit, and you already have eaten a certain number of carrots,\n but now you need to eat more carrots to complete the day's meals.\n you should return an array of [ total number of eaten carrots after your meals,\n the number of carrots left after your meals ]\n if there are not enough remaining carrots, you will eat all remaining carrots, but will still be hungry.\n \n Example:\n * eat(5, 6, 10) -> [11, 4]\n * eat(4, 8, 9) -> [12, 1]\n * eat(1, 10, 10) -> [11, 0]\n * eat(2, 11, 5) -> [7, 0]\n \n Variables:\n @number : integer\n the number of carrots that you have eaten.\n @need : integer\n the number of carrots that you need to eat.\n @remaining : integer\n the number of remaining carrots thet exist in stock\n \n Constrain:\n * 0 <= number <= 1000\n * 0 <= need <= 1000\n * 0 <= remaining <= 1000\n\n Have fun :)\n \"\"\"\n total_eaten = number + min(need, remaining)\n carrots_left = max(0, remaining - need)\n return [total_eaten, carrots_left]", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/160": [{"task_id": "HumanEval/160", "solution": "def do_algebra(operator, operand):\n \"\"\"\n Given two lists operator, and operand. The first list has basic algebra operations, and \n the second list is a list of integers. 
Use the two given lists to build the algebric \n expression and return the evaluation of this expression.\n\n The basic algebra operations:\n Addition ( + ) \n Subtraction ( - ) \n Multiplication ( * ) \n Floor division ( // ) \n Exponentiation ( ** ) \n\n Example:\n operator['+', '*', '-']\n array = [2, 3, 4, 5]\n result = 2 + 3 * 4 - 5\n => result = 9\n\n Note:\n The length of operator list is equal to the length of operand list minus one.\n Operand is a list of of non-negative integers.\n Operator list has at least one operator, and operand list has at least two operands.\n\n \"\"\"\n expression = str(operand[0])\n for i in range(len(operator)):\n expression += ' ' + operator[i] + ' ' + str(operand[i + 1])\n return eval(expression)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/162": [{"task_id": "HumanEval/162", "solution": "import hashlib\ndef string_to_md5(text):\n \"\"\"\n Given a string 'text', return its md5 hash equivalent string.\n If 'text' is an empty string, return None.\n\n >>> string_to_md5('Hello world') == '3e25960a79dbc69b674cd4ec67a72c62'\n \"\"\"\n if not text:\n return None\n return hashlib.md5(text.encode()).hexdigest()", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/161": [{"task_id": "HumanEval/161", "solution": "def solve(s):\n if not any(c.isalpha() for c in s):\n return s[::-1]\n else:\n result = []\n for char in s:\n if char.isalpha():\n if char.islower():\n result.append(char.upper())\n else:\n result.append(char.lower())\n else:\n result.append(char)\n return ''.join(result)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}], "HumanEval/163": [{"task_id": "HumanEval/163", "solution": "def generate_integers(a, b):\n \"\"\"\n Given two positive integers a and b, return the even digits between a\n and b, in ascending order.\n\n For example:\n generate_integers(2, 8) => [2, 4, 6, 8]\n generate_integers(8, 2) => [2, 4, 6, 8]\n generate_integers(10, 14) => []\n \"\"\"\n start = min(a, b)\n end = max(a, b)\n even_digits = [num for num in range(start, end + 1) if num % 2 == 0 and 10 > num >= 0]\n return sorted(even_digits)", "base_status": "pass", "plus_status": null, "base_fail_tests": [], "plus_fail_tests": []}]}} \ No newline at end of file