diff --git a/examples/ags/benchmark/drop.py b/examples/ags/benchmark/drop.py index 1dbea0f50..a27c82f07 100644 --- a/examples/ags/benchmark/drop.py +++ b/examples/ags/benchmark/drop.py @@ -112,7 +112,7 @@ def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]: data = [data[i] for i in random_indices] return data -async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float]: +async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float, str]: def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str]: if "number" in answer and answer["number"]: return tuple([str(answer["number"])]), "number" @@ -133,6 +133,8 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, raise ValueError(f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}") prediction = await graph(question, passage) + cost = prediction[1] # 添加这行来获取cost + prediction = prediction[0] # 修改这行以获取实际的预测结果 def get_f1_score(prediction: str, golden_answer: str) -> float: predicted_bags = answer_to_bags(prediction) @@ -152,7 +154,7 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, max_score = score best_answer = golden_answer - return best_answer, prediction, max_score + return best_answer, prediction, max_score, cost # 修改返回值以包含cost async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], graph: Callable, max_concurrent_tasks: int = 50) -> List[List[Any]]: semaphore = asyncio.Semaphore(max_concurrent_tasks) @@ -166,27 +168,29 @@ async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], g answers = [qa_pair["answer"]] if "validated_answers" in qa_pair and qa_pair["validated_answers"]: answers.extend(qa_pair["validated_answers"]) - best_answer, prediction, score = await evaluate_problem(question, passage, answers, graph) - results.append([id, question, prediction, best_answer, score]) + best_answer, prediction, score, cost = await evaluate_problem(question, passage, answers, graph) + results.append([id, question, prediction, best_answer, score, cost]) # 修改这行以包含cost tasks = [sem_evaluate(id, annotation) for id, annotation in annotations] await tqdm_asyncio.gather(*tasks, desc="Evaluating DROP passages", total=len(annotations)) return results -def save_results_to_csv(results: List[List[Any]], path: str) -> float: - df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score"]) +def save_results_to_csv(results: List[List[Any]], path: str) -> Tuple[float, float]: + df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score", "cost"]) average_score = df["score"].mean() + total_cost = df["cost"].iloc[-1] # 添加这行来计算总cost output_file = f"{path}/{average_score:.5f}.csv" df.to_csv(output_file, index=False) print(f"Results saved to {output_file}") - return average_score + return average_score, total_cost # 修改返回值以包含total_cost -async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: +async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: data = load_data(file_path, samples) results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20) - average_score = save_results_to_csv(results, path=path) + average_score, total_cost = save_results_to_csv(results, path=path) print(f"Average score on DROP dataset: {average_score:.5f}") - return average_score + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost # 修改返回值以包含total_cost diff --git a/examples/ags/benchmark/gsm8k.py b/examples/ags/benchmark/gsm8k.py index 97dce7bff..ddc4a0169 100644 --- a/examples/ags/benchmark/gsm8k.py +++ b/examples/ags/benchmark/gsm8k.py @@ -14,7 +14,7 @@ from tqdm.asyncio import tqdm_asyncio from examples.ags.benchmark.utils import generate_random_indices def extract_number(text: str) -> Optional[float]: - """清理文本并提取单个数字""" + """Clean text and extract a single number""" matches = re.findall(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+", text) if matches: last_number = matches[-1].replace(",", "") @@ -26,7 +26,7 @@ def extract_number(text: str) -> Optional[float]: return None def loose_match_score(expected_output: str, prediction: str, tolerance: float = 1e-6) -> int: - """宽松匹配分数计算函数""" + """Loose match score calculation function""" expected_number = extract_number(expected_output) predicted_number = extract_number(prediction) @@ -48,18 +48,19 @@ async def load_data(file_path: str, samples=1) -> List[dict]: data = [data[i] for i in random_indices] return data -def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float: - """保存结果到CSV文件""" +def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]: + """Save results to CSV file""" df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"]) average_score = df["score"].mean() + total_cost = df["cost"].iloc[-1] output_file = f"{path}/{average_score:.5f}.csv" df.to_csv(output_file, index=False) print(f"Results saved to {output_file}") - return average_score + return average_score, total_cost async def evaluate_problem(input: str, graph: Callable, expected_output: str) -> Tuple[str, str, str, int, str]: - """评估单个问题""" + """Evaluate a single problem""" prompt = input max_retries = 5 retries = 0 @@ -87,7 +88,7 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) -> return input, output, expected_output, score, cost async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 20) -> List[Tuple[str, str, str, int, str]]: - """评估所有问题""" + """Evaluate all problems""" semaphore = asyncio.Semaphore(max_concurrent_tasks) async def sem_evaluate(problem): @@ -100,11 +101,11 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data)) -async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: - """GSM8K评估主函数""" +async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: + """GSM8K evaluation main function""" data = await load_data(file_path, samples) results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5) - print(results) - average_score = save_results_to_csv(results, path=path) + average_score, total_cost = save_results_to_csv(results, path=path) print(f"Average score: {average_score:.5f}") - return average_score + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost diff --git a/examples/ags/benchmark/hotpotqa.py b/examples/ags/benchmark/hotpotqa.py index 375882511..0990cc781 100644 --- a/examples/ags/benchmark/hotpotqa.py +++ b/examples/ags/benchmark/hotpotqa.py @@ -120,7 +120,7 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect while retries < max_retries: try: - prediction, supporting_sentences = await graph(input, context_str) if graph else "None" + prediction, supporting_sentences, cost = await graph(input, context_str) if graph else ("None", None, 0) predicted_bags = answer_to_bags(prediction) gold_bags = answer_to_bags(expected_output) @@ -137,9 +137,10 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect prediction = None supporting_sentences = None score = 0 + cost = 0 break - return input, prediction, expected_output, supporting_sentences, score + return input, prediction, expected_output, supporting_sentences, score, cost async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50): semaphore = asyncio.Semaphore(max_concurrent_tasks) @@ -156,21 +157,23 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren return await tqdm_asyncio.gather(*tasks, desc="Evaluating HotpotQA problems", total=len(data)) -def save_results_to_csv(results: List[Tuple[str, str, str, str, float]], path: str) -> float: +def save_results_to_csv(results: List[Tuple[str, str, str, str, float, str]], path: str) -> Tuple[float, float]: df = pd.DataFrame( - results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score"] + results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score", "cost"] ) average_score = df["score"].mean() + total_cost = df["cost"].iloc[-1] output_file = f"{path}/{average_score:.5f}.csv" df.to_csv(output_file, index=False) print(f"Results saved to {output_file}") - return average_score + return average_score, total_cost -async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: +async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: data = await load_data(file_path, samples) results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) - average_score = save_results_to_csv(results, path=path) + average_score, total_cost = save_results_to_csv(results, path=path) print(f"Average score on HotpotQA dataset: {average_score:.5f}") - return average_score + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py index fd28cff26..65a7f3f16 100644 --- a/examples/ags/benchmark/humaneval.py +++ b/examples/ags/benchmark/humaneval.py @@ -46,13 +46,15 @@ async def check_solution(solution, test_cases, entry_point): return FAIL, details -async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]: +async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]: max_retries = 5 retries = 0 while retries < max_retries: try: - solution = await graph(data["prompt"]) if graph else "None" + prediction = await graph(data["prompt"]) if graph else "None" + cost = prediction[1] # 添加这行来获取cost + solution = prediction[0] # 修改这行以获取实际的预测结果 ret = await check_solution(solution, data["test_cases"], data["entry_point"]) score = 1 if ret[0] == PASS else 0 @@ -67,11 +69,12 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, solution = None ret = (FAIL, []) score = 0 + cost = 0 # 添加这行来处理错误情况下的cost break - return data["prompt"], solution, ret[1], score + return data["prompt"], solution, ret[1], score, cost # 修改返回值以包含cost -async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]: +async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]: semaphore = asyncio.Semaphore(max_concurrent_tasks) async def sem_evaluate(problem): @@ -86,8 +89,9 @@ import os import time import json -def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -> float: +def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]: avg_score = 0 + total_cost = 0 # 添加这行来计算总cost timestamp = int(time.time()) filename = f"humaneval_results_{timestamp}.jsonl" full_path = os.path.join(path, filename) @@ -101,19 +105,23 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) - "prediction": result[1], "test_case_details": result[2], "score": result[3], + "cost": result[4], # 添加这行来包含cost } ) + "\n" ) avg_score += result[3] + total_cost += float(result[4]) # 添加这行来累加cost print(f"save to {full_path}") avg_score /= len(results) + total_cost = results[-1][4] # 使用最后一个结果的cost作为总cost - return round(avg_score, 5) + return round(avg_score, 5), round(total_cost, 5) # 修改返回值以包含total_cost -async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: +async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: data = await load_data(file_path, samples) results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) - average_score = save_results_to_jsonl(results, path=path) + average_score, total_cost = save_results_to_jsonl(results, path=path) print(f"Average score on HumanEval dataset: {average_score:.5f}") - return average_score + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost # 修改返回值以包含total_cost diff --git a/examples/ags/benchmark/math.py b/examples/ags/benchmark/math.py index 329237a51..2a6a5cf4c 100644 --- a/examples/ags/benchmark/math.py +++ b/examples/ags/benchmark/math.py @@ -221,14 +221,15 @@ async def load_data(file_path: str, samples: int = 200) -> List[dict]: data = [data[i] for i in random_indices] return data -def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float: +def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]: df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"]) average_score = df["score"].mean() + total_cost = df["cost"].iloc[-1] output_file = f"{path}/{average_score:.5f}.csv" df.to_csv(output_file, index=False) print(f"Results saved to {output_file}") - return average_score + return average_score, total_cost async def evaluate_problem(problem: dict, graph: Callable) -> Tuple[str, str, str, int, str]: input_text = problem["problem"] @@ -269,9 +270,10 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data)) -async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: +async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: data = await load_data(file_path, samples) results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) - average_score = save_results_to_csv(results, path=path) + average_score, total_cost = save_results_to_csv(results, path=path) print(f"Average score on MATH dataset: {average_score:.5f}") - return average_score + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost diff --git a/examples/ags/benchmark/mbpp.py b/examples/ags/benchmark/mbpp.py index 14075874a..b0722eb9c 100644 --- a/examples/ags/benchmark/mbpp.py +++ b/examples/ags/benchmark/mbpp.py @@ -51,13 +51,15 @@ async def check_solution(solution, test_cases, timeout=1): return FAIL, details -async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]: +async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]: max_retries = 5 retries = 0 while retries < max_retries: try: - solution = await graph(data["prompt"]) if graph else "None" + prediction = await graph(data["prompt"]) if graph else "None" + cost = prediction[1] + solution = prediction[0] ret = await check_solution(solution, data["test_list"]) score = 1 if ret[0] == PASS else 0 @@ -74,9 +76,9 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, score = 0 break - return data["prompt"], solution, ret[1], score + return data["prompt"], solution, ret[1], score, cost -async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]: +async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]: semaphore = asyncio.Semaphore(max_concurrent_tasks) async def sem_evaluate(problem): @@ -87,19 +89,20 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren return await tqdm_asyncio.gather(*tasks, desc="Evaluating MBPP problems", total=len(data)) -def save_results_to_csv(results: List[Tuple[str, str, str, int]], path: str) -> float: - df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score"]) +def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]: + df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score", "cost"]) average_score = df["score"].mean() + total_cost = df["cost"].iloc[-1] output_file = f"{path}/{average_score:.5f}.csv" df.to_csv(output_file, index=False) print(f"Results saved to {output_file}") + return average_score, total_cost - return average_score - -async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: +async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: data = await load_data(file_path, samples) results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) - average_score = save_results_to_csv(results, path=path) + average_score, total_cost = save_results_to_csv(results, path=path) print(f"Average score on MBPP dataset: {average_score:.5f}") - return average_score + print(f"Total Cost: {total_cost:.5f}") + return average_score, total_cost diff --git a/examples/ags/experiments/baselines/cot_drop.py b/examples/ags/experiments/baselines/cot_drop.py index d0d3ecb34..692856766 100644 --- a/examples/ags/experiments/baselines/cot_drop.py +++ b/examples/ags/experiments/baselines/cot_drop.py @@ -40,7 +40,7 @@ class CoTSolveGraph(SolveGraph): async def __call__(self, question: str, context: str) -> Tuple[str, str]: answer = await self.cot_generate(question, context, mode="context_fill") - return answer + return answer, self.llm.cost_manager.total_cost if __name__ == "__main__": async def main(): @@ -48,7 +48,7 @@ if __name__ == "__main__": # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106") graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="DROP") file_path = "examples/ags/data/drop_dataset_dev.json" - samples = 3 + samples = 1 path = "examples/ags/data/baselines/general/drop" score = await drop_evaluation(graph, file_path, samples, path) return score diff --git a/examples/ags/experiments/baselines/cot_gsm8k.py b/examples/ags/experiments/baselines/cot_gsm8k.py index 3e08ff47d..9acc91346 100644 --- a/examples/ags/experiments/baselines/cot_gsm8k.py +++ b/examples/ags/experiments/baselines/cot_gsm8k.py @@ -49,25 +49,18 @@ if __name__ == "__main__": graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K") file_path = "examples/ags/data/gsm8k.jsonl" samples = 1055 - # samples = 100 path = "examples/ags/data/baselines/general" - score = await gsm8k_evaluation(graph, file_path, samples, path) - return score + score, cost = await gsm8k_evaluation(graph, file_path, samples, path) + return score, cost import asyncio asyncio.run(main()) -# self consistency operator; universal self consistency; +# medprompt operator; universal self consistency; # IO指的没有任何Trick,看LLM自身的一个效果。使用 model 发布者在对应的 dataset 使用的 prompt。 # deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106 - - -GENERATE_PROMPT = """ -Generate Solution for the following problem: {problem_description} -""" - # med ensemble \ No newline at end of file