mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-08 15:05:17 +02:00
Update human Eval
This commit is contained in:
parent
62ffa730e0
commit
c7c34cda7d
2 changed files with 13 additions and 5 deletions
|
|
@@ -82,10 +82,17 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
|
|||
|
||||
return await tqdm_asyncio.gather(*tasks, desc="Evaluating HumanEval problems", total=len(data))
|
||||
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
|
||||
def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -> float:
|
||||
avg_score = 0
|
||||
timestamp = int(time.time())
|
||||
filename = f"humaneval_results_{timestamp}.jsonl"
|
||||
full_path = os.path.join(path, filename)
|
||||
|
||||
with open(path, "w") as f:
|
||||
with open(full_path, "w") as f:
|
||||
for result in results:
|
||||
f.write(
|
||||
json.dumps(
|
||||
|
|
@@ -99,10 +106,10 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -
|
|||
+ "\n"
|
||||
)
|
||||
avg_score += result[3]
|
||||
print(f"Results saved to {path}")
|
||||
print(f"save to {full_path}")
|
||||
avg_score /= len(results)
|
||||
|
||||
return avg_score
|
||||
return round(avg_score, 5)
|
||||
|
||||
async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
|
||||
data = await load_data(file_path, samples)
|
||||
|
|
|
|||
|
|
@@ -38,11 +38,12 @@ class CoTSolveGraph(SolveGraph):
|
|||
|
||||
if __name__ == "__main__":
|
||||
async def main():
|
||||
# llm_config = ModelsConfig.default().get("gpt-4o-mini")
|
||||
llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
|
||||
graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="HumanEval")
|
||||
file_path = "examples/ags/data/human-eval-new.jsonl"
|
||||
samples = 1 # 33/131
|
||||
path = "examples/ags/data/baselines/general"
|
||||
samples = 131 # 33/131
|
||||
path = "examples/ags/data/baselines/general/humaneval"
|
||||
score = await humaneval_evaluation(graph, file_path, samples, path)
|
||||
return score
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue