添加了cost计算

2026-06-14 15:25:17 +02:00 · 2024-09-10 18:51:18 +08:00 · 2024-09-10 18:51:18 +08:00 · ab112462a5
commit ab112462a5
parent 4ce18d7f48
8 changed files with 81 additions and 67 deletions
--- a/examples/ags/benchmark/drop.py
+++ b/examples/ags/benchmark/drop.py
@ -112,7 +112,7 @@ def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
    data = [data[i] for i in random_indices]
    return data

-async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float]:
+async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float, str]:
    def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str]:
        if "number" in answer and answer["number"]:
            return tuple([str(answer["number"])]), "number"
@ -133,6 +133,8 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str,
            raise ValueError(f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}")

    prediction = await graph(question, passage)
+    cost = prediction[1]  # 添加这行来获取cost
+    prediction = prediction[0]  # 修改这行以获取实际的预测结果

    def get_f1_score(prediction: str, golden_answer: str) -> float:
        predicted_bags = answer_to_bags(prediction)
@ -152,7 +154,7 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str,
            max_score = score
            best_answer = golden_answer

-    return best_answer, prediction, max_score
+    return best_answer, prediction, max_score, cost  # 修改返回值以包含cost

 async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], graph: Callable, max_concurrent_tasks: int = 50) -> List[List[Any]]:
    semaphore = asyncio.Semaphore(max_concurrent_tasks)
@ -166,27 +168,29 @@ async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], g
                answers = [qa_pair["answer"]]
                if "validated_answers" in qa_pair and qa_pair["validated_answers"]:
                    answers.extend(qa_pair["validated_answers"])
-                best_answer, prediction, score = await evaluate_problem(question, passage, answers, graph)
-                results.append([id, question, prediction, best_answer, score])
+                best_answer, prediction, score, cost = await evaluate_problem(question, passage, answers, graph)
+                results.append([id, question, prediction, best_answer, score, cost])  # 修改这行以包含cost

    tasks = [sem_evaluate(id, annotation) for id, annotation in annotations]
    await tqdm_asyncio.gather(*tasks, desc="Evaluating DROP passages", total=len(annotations))

    return results

-def save_results_to_csv(results: List[List[Any]], path: str) -> float:
-    df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score"])
+def save_results_to_csv(results: List[List[Any]], path: str) -> Tuple[float, float]:
+    df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score", "cost"])
    average_score = df["score"].mean()
+    total_cost = df["cost"].iloc[-1] # 添加这行来计算总cost

    output_file = f"{path}/{average_score:.5f}.csv"
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

-    return average_score
+    return average_score, total_cost  # 修改返回值以包含total_cost

-async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
    data = load_data(file_path, samples)
    results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
    print(f"Average score on DROP dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost  # 修改返回值以包含total_cost
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@ -14,7 +14,7 @@ from tqdm.asyncio import tqdm_asyncio
 from examples.ags.benchmark.utils import generate_random_indices

 def extract_number(text: str) -> Optional[float]:
-    """清理文本并提取单个数字"""
+    """Clean text and extract a single number"""
    matches = re.findall(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+", text)
    if matches:
        last_number = matches[-1].replace(",", "")
@ -26,7 +26,7 @@ def extract_number(text: str) -> Optional[float]:
        return None

 def loose_match_score(expected_output: str, prediction: str, tolerance: float = 1e-6) -> int:
-    """宽松匹配分数计算函数"""
+    """Loose match score calculation function"""
    expected_number = extract_number(expected_output)
    predicted_number = extract_number(prediction)

@ -48,18 +48,19 @@ async def load_data(file_path: str, samples=1) -> List[dict]:
    data = [data[i] for i in random_indices]
    return data
        
-def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float:
-    """保存结果到CSV文件"""
+def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
+    """Save results to CSV file"""
    df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
    average_score = df["score"].mean()
+    total_cost = df["cost"].iloc[-1]

    output_file = f"{path}/{average_score:.5f}.csv"
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
-    return average_score
+    return average_score, total_cost

 async def evaluate_problem(input: str, graph: Callable, expected_output: str) -> Tuple[str, str, str, int, str]:
-    """评估单个问题"""
+    """Evaluate a single problem"""
    prompt = input
    max_retries = 5
    retries = 0
@ -87,7 +88,7 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
    return input, output, expected_output, score, cost

 async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 20) -> List[Tuple[str, str, str, int, str]]:
-    """评估所有问题"""
+    """Evaluate all problems"""
    semaphore = asyncio.Semaphore(max_concurrent_tasks)

    async def sem_evaluate(problem):
@ -100,11 +101,11 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren

    return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))

-async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
-    """GSM8K评估主函数"""
+async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
+    """GSM8K evaluation main function"""
    data = await load_data(file_path, samples)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
-    print(results)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
    print(f"Average score: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost
--- a/examples/ags/benchmark/hotpotqa.py
+++ b/examples/ags/benchmark/hotpotqa.py
@ -120,7 +120,7 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect

    while retries < max_retries:
        try:
-            prediction, supporting_sentences = await graph(input, context_str) if graph else "None"
+            prediction, supporting_sentences, cost = await graph(input, context_str) if graph else ("None", None, 0)
            predicted_bags = answer_to_bags(prediction)
            gold_bags = answer_to_bags(expected_output)

@ -137,9 +137,10 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect
                prediction = None
                supporting_sentences = None
                score = 0
+                cost = 0
                break

-    return input, prediction, expected_output, supporting_sentences, score
+    return input, prediction, expected_output, supporting_sentences, score, cost

 async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50):
    semaphore = asyncio.Semaphore(max_concurrent_tasks)
@ -156,21 +157,23 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren

    return await tqdm_asyncio.gather(*tasks, desc="Evaluating HotpotQA problems", total=len(data))

-def save_results_to_csv(results: List[Tuple[str, str, str, str, float]], path: str) -> float:
+def save_results_to_csv(results: List[Tuple[str, str, str, str, float, str]], path: str) -> Tuple[float, float]:
    df = pd.DataFrame(
-        results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score"]
+        results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score", "cost"]
    )
    average_score = df["score"].mean()
+    total_cost = df["cost"].iloc[-1]

    output_file = f"{path}/{average_score:.5f}.csv"
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

-    return average_score
+    return average_score, total_cost

-async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
    data = await load_data(file_path, samples)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
    print(f"Average score on HotpotQA dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@ -46,13 +46,15 @@ async def check_solution(solution, test_cases, entry_point):

    return FAIL, details

-async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]:
+async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
    max_retries = 5
    retries = 0

    while retries < max_retries:
        try:
-            solution = await graph(data["prompt"]) if graph else "None"
+            prediction = await graph(data["prompt"]) if graph else "None"
+            cost = prediction[1]  # 添加这行来获取cost
+            solution = prediction[0]  # 修改这行以获取实际的预测结果
            ret = await check_solution(solution, data["test_cases"], data["entry_point"])

            score = 1 if ret[0] == PASS else 0
@ -67,11 +69,12 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str,
                solution = None
                ret = (FAIL, [])
                score = 0
+                cost = 0  # 添加这行来处理错误情况下的cost
                break

-    return data["prompt"], solution, ret[1], score
+    return data["prompt"], solution, ret[1], score, cost  # 修改返回值以包含cost

-async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]:
+async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]:
    semaphore = asyncio.Semaphore(max_concurrent_tasks)

    async def sem_evaluate(problem):
@ -86,8 +89,9 @@ import os
 import time
 import json

-def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -> float:
+def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
    avg_score = 0
+    total_cost = 0  # 添加这行来计算总cost
    timestamp = int(time.time())
    filename = f"humaneval_results_{timestamp}.jsonl"
    full_path = os.path.join(path, filename)
@ -101,19 +105,23 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -
                        "prediction": result[1],
                        "test_case_details": result[2],
                        "score": result[3],
+                        "cost": result[4],  # 添加这行来包含cost
                    }
                )
                + "\n"
            )
            avg_score += result[3]
+            total_cost += float(result[4])  # 添加这行来累加cost
    print(f"save to {full_path}")
    avg_score /= len(results)
+    total_cost = results[-1][4]  # 使用最后一个结果的cost作为总cost

-    return round(avg_score, 5)
+    return round(avg_score, 5), round(total_cost, 5)  # 修改返回值以包含total_cost

-async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
    data = await load_data(file_path, samples)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_jsonl(results, path=path)
+    average_score, total_cost = save_results_to_jsonl(results, path=path)
    print(f"Average score on HumanEval dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost  # 修改返回值以包含total_cost
--- a/examples/ags/benchmark/math.py
+++ b/examples/ags/benchmark/math.py
@ -221,14 +221,15 @@ async def load_data(file_path: str, samples: int = 200) -> List[dict]:
    data = [data[i] for i in random_indices]
    return data

-def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float:
+def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
    df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
    average_score = df["score"].mean()
+    total_cost = df["cost"].iloc[-1]

    output_file = f"{path}/{average_score:.5f}.csv"
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
-    return average_score
+    return average_score, total_cost

 async def evaluate_problem(problem: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
    input_text = problem["problem"]
@ -269,9 +270,10 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren

    return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data))

-async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
    data = await load_data(file_path, samples)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
    print(f"Average score on MATH dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost
--- a/examples/ags/benchmark/mbpp.py
+++ b/examples/ags/benchmark/mbpp.py
@ -51,13 +51,15 @@ async def check_solution(solution, test_cases, timeout=1):

    return FAIL, details

-async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]:
+async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
    max_retries = 5
    retries = 0

    while retries < max_retries:
        try:
-            solution = await graph(data["prompt"]) if graph else "None"
+            prediction = await graph(data["prompt"]) if graph else "None"
+            cost = prediction[1]
+            solution = prediction[0]
            ret = await check_solution(solution, data["test_list"])

            score = 1 if ret[0] == PASS else 0
@ -74,9 +76,9 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str,
                score = 0
                break

-    return data["prompt"], solution, ret[1], score
+    return data["prompt"], solution, ret[1], score, cost

-async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]:
+async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]:
    semaphore = asyncio.Semaphore(max_concurrent_tasks)

    async def sem_evaluate(problem):
@ -87,19 +89,20 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren

    return await tqdm_asyncio.gather(*tasks, desc="Evaluating MBPP problems", total=len(data))

-def save_results_to_csv(results: List[Tuple[str, str, str, int]], path: str) -> float:
-    df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score"])
+def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
+    df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score", "cost"])
    average_score = df["score"].mean()
+    total_cost = df["cost"].iloc[-1]

    output_file = f"{path}/{average_score:.5f}.csv"
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
+    return average_score, total_cost

-    return average_score
-
-async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
    data = await load_data(file_path, samples)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
    print(f"Average score on MBPP dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost
--- a/examples/ags/experiments/baselines/cot_drop.py
+++ b/examples/ags/experiments/baselines/cot_drop.py
@ -40,7 +40,7 @@ class CoTSolveGraph(SolveGraph):

    async def __call__(self, question: str, context: str) -> Tuple[str, str]:
        answer = await self.cot_generate(question, context, mode="context_fill")
-        return answer
+        return answer, self.llm.cost_manager.total_cost

 if __name__ == "__main__":
    async def main():
@ -48,7 +48,7 @@ if __name__ == "__main__":
        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="DROP")
        file_path = "examples/ags/data/drop_dataset_dev.json"
-        samples = 3
+        samples = 1
        path = "examples/ags/data/baselines/general/drop"
        score = await drop_evaluation(graph, file_path, samples, path)
        return score
--- a/examples/ags/experiments/baselines/cot_gsm8k.py
+++ b/examples/ags/experiments/baselines/cot_gsm8k.py
@ -49,25 +49,18 @@ if __name__ == "__main__":
        graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K")
        file_path = "examples/ags/data/gsm8k.jsonl"
        samples = 1055
-        # samples = 100
        path = "examples/ags/data/baselines/general"
-        score = await gsm8k_evaluation(graph, file_path, samples, path)
-        return score
+        score, cost = await gsm8k_evaluation(graph, file_path, samples, path)
+        return score, cost

    import asyncio
    asyncio.run(main())


-# self consistency operator; universal self consistency; 
+# medprompt operator; universal self consistency; 

 # IO指的没有任何Trick，看LLM自身的一个效果。使用 model 发布者在对应的 dataset 使用的 prompt。

 # deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106

-
-
-GENERATE_PROMPT = """
-Generate Solution for the following problem: {problem_description}
-"""
-
 # med ensemble