Update humaneval benchmark & utils; fix some bugs in action node.

2026-05-06 06:12:39 +02:00 · 2024-08-01 15:33:55 +08:00 · 2024-08-01 15:33:55 +08:00 · 96bd3101ec
commit 96bd3101ec
parent d671e6ca6d
5 changed files with 35 additions and 13 deletions
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@ -15,7 +15,7 @@ from evalplus.data import get_human_eval_plus

 from examples.ags.w_action_node.graph import HumanEvalGraph
 from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock
-from examples.ags.w_action_node.utils import sort_json_by_task_id
+from examples.ags.w_action_node.utils import sort_json_by_key
 from metagpt.llm import LLM
 from metagpt.logs import logger
 from metagpt.utils.common import add_jsonl_file, read_json_file
@ -64,20 +64,18 @@ async def route_generate(mode: ModeType, id: str):
 async def sample_generate(id, result_path: str = "samples.jsonl", mode: ModeType = "ags"):
    sample_dict = await route_generate(mode, id)
    add_jsonl_file(result_path, [sample_dict])
-    sort_json_by_task_id(result_path, result_path)
+    sort_json_by_key(result_path, result_path)


 async def samples_generate(mode: ModeType, result_path: str = "samples.jsonl"):
    ids = list(get_human_eval_plus().keys())
    file_lock = asyncio.Lock()

-    @handle_exception(
-        exception_type=Exception,
-        exception_msg="Error in solve_and_write function",
-        default_return=lambda id, *args, **kwargs: id,
-    )
    async def solve_and_write(id: str, mode: ModeType) -> Optional[str]:
-        sample_dict = await route_generate(mode, id)
+        try:
+            sample_dict = await route_generate(mode, id)
+        except Exception as e:
+            return id
        async with file_lock:
            async with aiofiles.open(result_path, mode="a") as f:
                await f.write(json.dumps(sample_dict) + "\n")
@ -96,7 +94,7 @@ async def samples_generate(mode: ModeType, result_path: str = "samples.jsonl"):
            except Exception:
                logger.error(f"{task_id} fail")

-    sort_json_by_task_id(result_path, result_path)
+    sort_json_by_key(result_path, result_path)

    if not failed_tasks:
        if automatic_evalplus(result_path):
--- a/examples/ags/w_action_node/utils.py
+++ b/examples/ags/w_action_node/utils.py
@ -20,7 +20,7 @@ def extract_task_id(task_id: str) -> int:
    return int(match.group(1)) if match else 0


-def sort_json_by_task_id(input_file: str, output_file: str):
+def sort_json_by_key(input_file: str, output_file: str, key: str = "task_id"):
    """
    Read a JSONL file, sort the entries based on task_id, and write to a new JSONL file.

@ -32,7 +32,7 @@ def sort_json_by_task_id(input_file: str, output_file: str):
        data = [json.loads(line) for line in f]

    # Sort the data based on the numeric part of task_id
-    sorted_data = sorted(data, key=lambda x: extract_task_id(x["task_id"]))
+    sorted_data = sorted(data, key=lambda x: extract_task_id(x[key]))

    # Write the sorted data to a new JSONL file
    with open(output_file, "w") as f: