Update for fengwei

2026-06-05 14:55:18 +02:00 · 2024-09-16 18:13:30 +08:00 · 2024-09-16 18:13:30 +08:00 · 63f3f884c9
commit 63f3f884c9
parent 53890a5f86
2 changed files with 14 additions and 18 deletions
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@@ -52,7 +52,7 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str
    """Save results to CSV file"""
    df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
    average_score = df["score"].mean()
-    total_cost = df["cost"].iloc[-1]
+    total_cost = df["cost"].max()

    output_file = f"{path}/{average_score:.5f}.csv"
    df.to_csv(output_file, index=False)
@@ -103,7 +103,7 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
 async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
    """GSM8K evaluation main function"""
    data = await load_data(file_path, samples, test=test)
-    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
+    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=10)
    average_score, total_cost = save_results_to_csv(results, path=path)
    print(f"Average score: {average_score:.5f}")
    print(f"Total Cost: {total_cost:.5f}")
--- a/examples/ags/experiments/baselines/cot_gsm8k.py
+++ b/examples/ags/experiments/baselines/cot_gsm8k.py
@@ -9,7 +9,7 @@ from pydantic import BaseModel, Field
 from typing import Dict, Any

 GSM8K_PROMPT_GPT = """
-{question}\nPlease reason step by step, and put your final answer in the end. Wrap content using xml tags.
+{question}\nPlease reason step by step. At the end, provide the final answer in the format "Answer is <number>", where <number> is a single number, without any additional information or explanation.
 """

 GSM8K_PROMPT_DS = """
@@ -39,28 +39,24 @@ class CoTSolveGraph(SolveGraph):

    async def __call__(self, problem):
        solution = await self.cot_generate(problem, mode="context_fill")
-        return solution, self.llm.cost_manager.total_cost
+        return solution, self.llm.cost_manager.total_cost # {"solution": solution}

 if __name__ == "__main__":
    async def main():
-        # llm_config = ModelsConfig.default().get("deepseek-coder")
+        llm_config = ModelsConfig.default().get("deepseek-coder")
        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
-        llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        # llm_config = ModelsConfig.default().get("gpt-4o")
        graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K")
        file_path = "examples/ags/data/gsm8k.jsonl"
-        samples = 1055
-        path = "examples/ags/data/baselines/general"
-        score, cost = await gsm8k_evaluation(graph, file_path, samples, path)
-        return score, cost
+        samples = 264 #264 # 1055 #314  
+        # samples = 100
+        path = "examples/ags/data/baselines/general/gsm8k/"
+        score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=False)
+        return score, cost 

    import asyncio
    asyncio.run(main())
+    

-
-# medprompt operator; universal self consistency; 
-
-# IO指的没有任何Trick，看LLM自身的一个效果。使用 model 发布者在对应的 dataset 使用的 prompt。
-
-# deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106
-
-# med ensemble 
+# self consistency; medprompt 已有的Operator来实现这两个方法