Update for fengwei

This commit is contained in:
didi 2024-09-16 18:13:30 +08:00
parent 53890a5f86
commit 63f3f884c9
2 changed files with 14 additions and 18 deletions

View file

@@ -52,7 +52,7 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str
"""Save results to CSV file"""
df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
average_score = df["score"].mean()
total_cost = df["cost"].iloc[-1]
total_cost = df["cost"].max()
output_file = f"{path}/{average_score:.5f}.csv"
df.to_csv(output_file, index=False)
@@ -103,7 +103,7 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
"""GSM8K evaluation main function"""
data = await load_data(file_path, samples, test=test)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=10)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score: {average_score:.5f}")
print(f"Total Cost: {total_cost:.5f}")

View file

@@ -9,7 +9,7 @@ from pydantic import BaseModel, Field
from typing import Dict, Any
GSM8K_PROMPT_GPT = """
{question}\nPlease reason step by step, and put your final answer in the end. Wrap content using xml tags.
{question}\nPlease reason step by step. At the end, provide the final answer in the format "Answer is <number>", where <number> is a single number, without any additional information or explanation.
"""
GSM8K_PROMPT_DS = """
@@ -39,28 +39,24 @@ class CoTSolveGraph(SolveGraph):
async def __call__(self, problem):
solution = await self.cot_generate(problem, mode="context_fill")
return solution, self.llm.cost_manager.total_cost
return solution, self.llm.cost_manager.total_cost # {"solution": solution}
if __name__ == "__main__":
async def main():
# llm_config = ModelsConfig.default().get("deepseek-coder")
llm_config = ModelsConfig.default().get("deepseek-coder")
# llm_config = ModelsConfig.default().get("gpt-4o-mini")
llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
# llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
# llm_config = ModelsConfig.default().get("gpt-4o")
graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K")
file_path = "examples/ags/data/gsm8k.jsonl"
samples = 1055
path = "examples/ags/data/baselines/general"
score, cost = await gsm8k_evaluation(graph, file_path, samples, path)
return score, cost
samples = 264 #264 # 1055 #314
# samples = 100
path = "examples/ags/data/baselines/general/gsm8k/"
score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=False)
return score, cost
import asyncio
asyncio.run(main())
# medprompt operator; universal self consistency;
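# IO means running the LLM with no tricks at all, to see the model's own baseline performance. Use the prompt that the model's publisher used on the corresponding dataset.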
# deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106
# med ensemble
# self consistency; medprompt: implement these two methods using the existing Operators