{
"results": {
"bbh_cot_fewshot_causal_judgement": {
"alias": "bbh_cot_fewshot_causal_judgement",
"exact_match,get-answer": 0.66,
"exact_match_stderr,get-answer": 0.06767268161329723
},
"bbh_cot_fewshot_date_understanding": {
"alias": "bbh_cot_fewshot_date_understanding",
"exact_match,get-answer": 0.84,
"exact_match_stderr,get-answer": 0.05237229365663814
},
"bbh_cot_fewshot_logical_deduction_five_objects": {
"alias": "bbh_cot_fewshot_logical_deduction_five_objects",
"exact_match,get-answer": 0.76,
"exact_match_stderr,get-answer": 0.06101187572589322
},
"bbh_cot_fewshot_navigate": {
"alias": "bbh_cot_fewshot_navigate",
"exact_match,get-answer": 0.98,
"exact_match_stderr,get-answer": 0.01999999999999999
},
"gsm8k_cot": {
"alias": "gsm8k_cot",
"exact_match,strict-match": 0.64,
"exact_match_stderr,strict-match": 0.06857142857142856,
"exact_match,flexible-extract": 0.86,
"exact_match_stderr,flexible-extract": 0.0495695759225642
},
"humaneval": {
"alias": "humaneval",
"pass@1,create_test": 0.0,
"pass@1_stderr,create_test": 0.0
},
"ifeval": {
"alias": "ifeval",
"prompt_level_strict_acc,none": 0.68,
"prompt_level_strict_acc_stderr,none": 0.06663945022680344,
"inst_level_strict_acc,none": 0.7631578947368421,
"inst_level_strict_acc_stderr,none": "N/A",
"prompt_level_loose_acc,none": 0.76,
"prompt_level_loose_acc_stderr,none": 0.0610118757258932,
"inst_level_loose_acc,none": 0.8157894736842105,
"inst_level_loose_acc_stderr,none": "N/A"
},
"mbpp": {
"alias": "mbpp",
"pass_at_1,none": 0.78,
"pass_at_1_stderr,none": 0.059178043363451366
}
},
"group_subtasks": {
"gsm8k_cot": [],
"ifeval": [],
"bbh_cot_fewshot_causal_judgement": [],
"bbh_cot_fewshot_date_understanding": [],
"bbh_cot_fewshot_logical_deduction_five_objects": [],
"bbh_cot_fewshot_navigate": [],
"humaneval": [],
"mbpp": []
},
"configs": {
"bbh_cot_fewshot_causal_judgement": {
"task": "bbh_cot_fewshot_causal_judgement",
"dataset_path": "SaylorTwift/bbh",
"dataset_name": "causal_judgement",
"test_split": "test",
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n",
"doc_to_target": "{{target}}",
"unsafe_code": false,
"description": "Answer questions about causal attribution.\n\n",
"target_delimiter": "",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n",
"split": null,
"process_docs": null,
"fewshot_indices": null,
"samples": [
{
"input": "How would a typical person answer each of the following questions about causation?\nFrank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?\nOptions:\n- Yes\n- No",
"target": "Here in this question, we are told that \"Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild.\" A typical person would assume that this passage suggests that Frank T. had no intention of shooting and injuring someone and that the bullet accidentally hit the neighbor's body; therefore, we conclude that Frank T. did not intentionally hit his neighbor. So the answer is No."
},
{
"input": "How would a typical person answer each of the following questions about causation?\nSuzy and Billy are working on a project that is very important for our nation's security. The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, there was a motion detector installed in the room where they arrived. The motion detector was set up to be triggered if at least one person appeared in the room at the same time. So the motion detector went off. Did Billy cause the motion detector to go off?\nOptions:\n- Yes\n- No",
"target": "Here in this question, we are told that the boss ordered them both to arrive at the meeting room at the same time and that the motion detector was set up to be triggered if at least one person appeared in the room at the same time.\" A typical person would assume that the person probably meant to say the detector was set up to be triggered if \"both persons\" appeared in the room at the same time, not at least one person, since otherwise the phrase \"at the same time\" would not make much sense in that sentence. Because the motion detector went off, a typical person would therefore come to the conclusion that both Suzy and Billy triggered the motion detector to go off; hence, Billy did indeed cause the motion detector to go off. So the answer is Yes."
},
{
"input": "How would a typical person answer each of the following questions about causation?\nGeorge and his sister Lena reunite at their parents' house for Thanksgiving. Whereas George just got into medical school, Lena is unhappy in her marriage and recently lost her job. Over the course of the day, George and Lena get into a number of heated arguments. Later in the afternoon they play a game of darts. They split the first two games, and the third game is close until the end. Who will win comes down to George's last shot. If he hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks of the difficult time Lena is having, and he really wants to let her win. He aims the dart at the low point region. He sets up his shot and the dart lands in the low point region. After his shot, Lena wins the game and is very happy. Did George hit the low point region intentionally?\nOptions:\n- Yes\n- No",
"target": "Here in this question, we are told that \"He aims the dart at the low point region.\" A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes."
}
],
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n",
"doc_to_choice": null,
"doc_to_target": "{{target}}",
"gen_prefix": null,
"fewshot_delimiter": "\n\n",
"target_delimiter": ""
},
"num_fewshot": 3,
"metric_list": [
{
"metric": "exact_match",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "generate_until",
"generation_kwargs": {
"max_gen_toks": 1024,
"until": [
"</s>",
"Q",
"\n\n"
],
"do_sample": false,
"temperature": 0.0
},
"repeats": 1,
"filter_list": [
{
"name": "get-answer",
"filter": [
{
"function": "regex",
"regex_pattern": "(?<=the answer is )(.*)(?=.)"
},
{
"function": "take_first"
}
]
}
],
"should_decontaminate": false,
"metadata": {
"version": 4.0,
"model": "devstral",
"base_url": "http://localhost:8089/v1/chat/completions",
"num_concurrent": 1,
"max_retries": 3,
"tokenized_requests": false,
"eos_string": "</s>"
}
},
"bbh_cot_fewshot_date_understanding": {
"task": "bbh_cot_fewshot_date_understanding",
"dataset_path": "SaylorTwift/bbh",
"dataset_name": "date_understanding",
"test_split": "test",
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n",
"doc_to_target": "{{target}}",
"unsafe_code": false,
"description": "Infer the date from context.\n\n",
"target_delimiter": "",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n",
"split": null,
"process_docs": null,
"fewshot_indices": null,
"samples": [
{
"input": "Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\nOptions:\n(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988",
"target": "If today is Christmas Eve of 1937, then today's date is December 24, 1937. 10 days before today is December 14, 1937, that is 12/14/1937. So the answer is (D)."
},
{
"input": "Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\nOptions:\n(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n(D) 11/02/2018\n(E) 11/04/2018",
"target": "If tomorrow is 11/12/2019, then today is 11/11/2019. The date one year ago from today is 11/11/2018. So the answer is (B)."
},
{
"input": "Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. What is the date tomorrow in MM/DD/YYYY?\nOptions:\n(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960",
"target": "If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B)."
}
],
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n",
"doc_to_choice": null,
"doc_to_target": "{{target}}",
"gen_prefix": null,
"fewshot_delimiter": "\n\n",
"target_delimiter": ""
},
"num_fewshot": 3,
"metric_list": [
{
"metric": "exact_match",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "generate_until",
"generation_kwargs": {
"max_gen_toks": 1024,
"until": [
"</s>",
"Q",
"\n\n"
],
"do_sample": false,
"temperature": 0.0
},
"repeats": 1,
"filter_list": [
{
"name": "get-answer",
"filter": [
{
"function": "regex",
"regex_pattern": "(?<=the answer is )(.*)(?=.)"
},
{
"function": "take_first"
}
]
}
],
"should_decontaminate": false,
"metadata": {
"version": 4.0,
"model": "devstral",
"base_url": "http://localhost:8089/v1/chat/completions",
"num_concurrent": 1,
"max_retries": 3,
"tokenized_requests": false,
"eos_string": "</s>"
}
},
"bbh_cot_fewshot_logical_deduction_five_objects": {
"task": "bbh_cot_fewshot_logical_deduction_five_objects",
"dataset_path": "SaylorTwift/bbh",
"dataset_name": "logical_deduction_five_objects",
"test_split": "test",
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n",
"doc_to_target": "{{target}}",
"unsafe_code": false,
"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n",
"target_delimiter": "",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n",
"split": null,
"process_docs": null,
"fewshot_indices": null,
"samples": [
{
"input": "The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last",
"target": "(1) Eve finished above Amy: \"(above) ? Eve ? Amy ? (below)\".\n(2) Eli finished below Amy: \"(above) ? Amy ? Eli ? (below)\".\n(3) Combining (1) and (2) we get the following ordering: \"(above) Eve Amy Eli (below)\".\nAccording to this ordering, the person who finished last (the one at the bottom of this list) is Eli.\nEli finished last. So the answer is (B)."
},
{
"input": "The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost",
"target": "(1) The green book is to the right of the white book: \"(left) ? white ? green ? (right)\".\n(2) The orange book is the rightmost: \"(left) ? white ? green orange (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white green orange (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. So the answer is (A)."
},
{
"input": "The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost",
"target": "(1) The white book is to the left of the gray book: \"(left) ? white ? gray ? (right)\".\n(2) The red book is the second from the left: \"(left) ? white red gray ? (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white red gray (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. So the answer is (C)."
}
],
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n",
"doc_to_choice": null,
"doc_to_target": "{{target}}",
"gen_prefix": null,
"fewshot_delimiter": "\n\n",
"target_delimiter": ""
},
"num_fewshot": 3,
"metric_list": [
{
"metric": "exact_match",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "generate_until",
"generation_kwargs": {
"max_gen_toks": 1024,
"until": [
"</s>",
"Q",
"\n\n"
],
"do_sample": false,
"temperature": 0.0
},
"repeats": 1,
"filter_list": [
{
"name": "get-answer",
"filter": [
{
"function": "regex",
"regex_pattern": "(?<=the answer is )(.*)(?=.)"
},
{
"function": "take_first"
}
]
}
],
"should_decontaminate": false,
"metadata": {
"version": 4.0,
"model": "devstral",
"base_url": "http://localhost:8089/v1/chat/completions",
"num_concurrent": 1,
"max_retries": 3,
"tokenized_requests": false,
"eos_string": "</s>"
}
},
"bbh_cot_fewshot_navigate": {
"task": "bbh_cot_fewshot_navigate",
"dataset_path": "SaylorTwift/bbh",
"dataset_name": "navigate",
"test_split": "test",
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n",
"doc_to_target": "{{target}}",
"unsafe_code": false,
"description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n",
"target_delimiter": "",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n",
"split": null,
"process_docs": null,
"fewshot_indices": null,
"samples": [
{
"input": "If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.\nOptions:\n- Yes\n- No",
"target": "We start at the origin (0, 0), facing the positive y-axis.\n(1) Turn left: (0, 0), facing the negative x-axis.\n(2) Turn around: (0, 0), facing the positive x-axis.\n(3) Turn left: (0, 0), facing the positive y-axis.\n(4) Take 7 steps: (0, 7), facing the positive y-axis.\n(5) Take 2 steps: (0, 9), facing the positive y-axis.\n(6) Take 4 steps: (0, 13), facing the positive y-axis.\n(7) Take 8 steps: (0, 21), facing the positive y-axis.\nSince (0, 21) is not (0, 0), we are not where we started. So the answer is No."
},
{
"input": "If you follow these instructions, do you return to the starting point? Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.\nOptions:\n- Yes\n- No",
"target": "We start at the origin (0, 0), facing the positive y-axis.\n(1) Turn around: (0, 0), facing the negative y-axis.\n(2) Take 1 step: (0, -1), facing the negative y-axis.\n(3) Take 6 steps: (0, -7), facing the negative y-axis.\n(4) Turn around: (0, -7), facing the positive y-axis.\n(5) Take 6 steps: (0, -1), facing the positive y-axis.\n(6) Take 9 steps: (0, 8), facing the positive y-axis.\n(7) Take 1 step: (0, 9), facing the positive y-axis.\nSince (0, 9) is not (0, 0), we are not where we started. So the answer is No."
},
{
"input": "If you follow these instructions, do you return to the starting point? Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.\nOptions:\n- Yes\n- No",
"target": "We start at the origin (0, 0), facing the positive y-axis.\n(1) Always face forward: (0, 0), facing the positive y-axis.\n(2) Take 2 steps right: (0, 2), facing the positive y-axis.\n(3) Take 9 steps left: (0, -7), facing the positive y-axis.\n(4) Take 7 steps right: (0, 7), facing the positive y-axis.\nSince (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes."
}
],
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n",
"doc_to_choice": null,
"doc_to_target": "{{target}}",
"gen_prefix": null,
"fewshot_delimiter": "\n\n",
"target_delimiter": ""
},
"num_fewshot": 3,
"metric_list": [
{
"metric": "exact_match",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "generate_until",
"generation_kwargs": {
"max_gen_toks": 1024,
"until": [
"</s>",
"Q",
"\n\n"
],
"do_sample": false,
"temperature": 0.0
},
"repeats": 1,
"filter_list": [
{
"name": "get-answer",
"filter": [
{
"function": "regex",
"regex_pattern": "(?<=the answer is )(.*)(?=.)"
},
{
"function": "take_first"
}
]
}
],
"should_decontaminate": false,
"metadata": {
"version": 4.0,
"model": "devstral",
"base_url": "http://localhost:8089/v1/chat/completions",
"num_concurrent": 1,
"max_retries": 3,
"tokenized_requests": false,
"eos_string": "</s>"
}
},
"gsm8k_cot": {
"task": "gsm8k_cot",
"tag": [
"chain_of_thought"
],
"dataset_path": "openai/gsm8k",
"dataset_name": "main",
"test_split": "test",
"doc_to_text": "Q: {{question}}\nA:",
"doc_to_target": "{{answer.split('####')[-1].strip() if answer is defined else target}}",
"unsafe_code": false,
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n",
"split": null,
"process_docs": null,
"fewshot_indices": null,
"samples": [
{
"question": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
"target": "There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6."
},
{
"question": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
"target": "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5."
},
{
"question": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
"target": "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39."
},
{
"question": "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
"target": "Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8."
},
{
"question": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?",
"target": "Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9."
},
{
"question": "There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?",
"target": "There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29."
},
{
"question": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?",
"target": "Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33."
},
{
"question": "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
"target": "Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8."
}
],
"doc_to_text": "Q: {{question}}\nA:",
"doc_to_choice": null,
"doc_to_target": "{{answer.split('####')[-1].strip() if answer is defined else target}}",
"gen_prefix": null,
"fewshot_delimiter": "\n\n",
"target_delimiter": " "
},
"num_fewshot": 8,
"metric_list": [
{
"aggregation": "mean",
"higher_is_better": true,
"ignore_case": true,
"ignore_punctuation": false,
"metric": "exact_match",
"regexes_to_ignore": [
",",
"\\$",
"(?s).*#### ",
"\\.$"
]
}
],
"output_type": "generate_until",
"generation_kwargs": {
"do_sample": false,
"until": [
"Q:",
"</s>",
"<|im_end|>"
]
},
"repeats": 1,
"filter_list": [
{
"filter": [
{
"function": "regex",
"regex_pattern": "The answer is (\\-?[0-9\\.\\,]+)."
},
{
"function": "take_first"
}
],
"name": "strict-match"
},
{
"filter": [
{
"function": "regex",
"group_select": -1,
"regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
},
{
"function": "take_first"
}
],
"name": "flexible-extract"
}
],
"should_decontaminate": false,
"metadata": {
"version": 3.0,
"model": "devstral",
"base_url": "http://localhost:8089/v1/chat/completions",
"num_concurrent": 1,
"max_retries": 3,
"tokenized_requests": false,
"eos_string": "</s>"
}
},
"humaneval": {
"task": "humaneval",
"dataset_path": "openai/openai_humaneval",
"test_split": "test",
"doc_to_text": "{{prompt}}",
"doc_to_target": "{{test}}\ncheck({{entry_point}})",
"unsafe_code": true,
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "default",
"split": null,
"process_docs": null,
"fewshot_indices": null,
"samples": null,
"doc_to_text": "{{prompt}}",
"doc_to_choice": null,
"doc_to_target": "{{test}}\ncheck({{entry_point}})",
"gen_prefix": null,
"fewshot_delimiter": "\n\n",
"target_delimiter": " "
},
"num_fewshot": 0,
"metric_list": [
{
"metric": "def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):\n global compute_\n assert k is not None\n if isinstance(k, int):\n k = [k]\n res = compute_.compute(\n references=references,\n predictions=predictions,\n k=k,\n )\n return res[0]\n",
"aggregation": "mean",
"higher_is_better": true,
"k": [
1
]
}
],
"output_type": "generate_until",
"generation_kwargs": {
"until": [
"\nclass",
"\ndef",
"\n#",
"\nif",
"\nprint"
],
"max_gen_toks": 1024,
"do_sample": false
},
"repeats": 1,
"filter_list": [
{
"name": "create_test",
"filter": [
{
"function": "custom",
"filter_fn": "<function build_predictions at 0x7f0f9b5b4ea0>"
}
]
}
],
"should_decontaminate": false,
"metadata": {
"version": 1.0,
"model": "devstral",
"base_url": "http://localhost:8089/v1/chat/completions",
"num_concurrent": 1,
"max_retries": 3,
"tokenized_requests": false,
"eos_string": "</s>"
}
},
"ifeval": {
"task": "ifeval",
"dataset_path": "google/IFEval",
"test_split": "train",
"doc_to_text": "prompt",
"doc_to_target": 0,
"unsafe_code": false,
"process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n",
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "default",
"split": null,
"process_docs": null,
"fewshot_indices": null,
"samples": null,
"doc_to_text": "prompt",
"doc_to_choice": null,
"doc_to_target": 0,
"gen_prefix": null,
"fewshot_delimiter": "\n\n",
"target_delimiter": " "
},
"num_fewshot": 0,
"metric_list": [
{
"metric": "prompt_level_strict_acc",
"aggregation": "mean",
"higher_is_better": true
},
{
"metric": "inst_level_strict_acc",
"aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
"higher_is_better": true
},
{
"metric": "prompt_level_loose_acc",
"aggregation": "mean",
"higher_is_better": true
},
{
"metric": "inst_level_loose_acc",
"aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
"higher_is_better": true
}
],
"output_type": "generate_until",
"generation_kwargs": {
"until": [],
"do_sample": false,
"temperature": 0.0,
"max_gen_toks": 1280
},
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 4.0,
"model": "devstral",
"base_url": "http://localhost:8089/v1/chat/completions",
"num_concurrent": 1,
"max_retries": 3,
"tokenized_requests": false,
"eos_string": "</s>"
}
},
"mbpp": {
"task": "mbpp",
"dataset_path": "google-research-datasets/mbpp",
"dataset_name": "full",
"test_split": "test",
"doc_to_text": "You are an expert Python programmer, and here is your task: {{text}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]\n",
"doc_to_target": "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}",
"unsafe_code": true,
"description": "",
"target_delimiter": "",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n",
"split": null,
"process_docs": null,
"fewshot_indices": null,
"samples": "<function list_fewshot_samples at 0x7f0f9b5b5300>",
"doc_to_text": "You are an expert Python programmer, and here is your task: {{text}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]\n",
"doc_to_choice": null,
"doc_to_target": "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}",
"gen_prefix": null,
"fewshot_delimiter": "\n\n",
"target_delimiter": ""
},
"num_fewshot": 3,
"metric_list": [
{
"metric": "def pass_at_1(\n references: Union[str, list[str]], predictions: Union[str, list[list[str]]]\n) -> float:\n if isinstance(references, str):\n references = [references]\n if isinstance(predictions[0], str):\n predictions = [[p] for p in predictions]\n return pass_at_k.compute(\n references=references,\n predictions=predictions,\n k=[1],\n )[0][\"pass@1\"]\n",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "generate_until",
"generation_kwargs": {
"until": [
"[DONE]"
],
"do_sample": false
},
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 1.0,
"model": "devstral",
"base_url": "http://localhost:8089/v1/chat/completions",
"num_concurrent": 1,
"max_retries": 3,
"tokenized_requests": false,
"eos_string": "</s>"
}
}
},
"versions": {
"bbh_cot_fewshot_causal_judgement": 4.0,
"bbh_cot_fewshot_date_understanding": 4.0,
"bbh_cot_fewshot_logical_deduction_five_objects": 4.0,
"bbh_cot_fewshot_navigate": 4.0,
"gsm8k_cot": 3.0,
"humaneval": 1.0,
"ifeval": 4.0,
"mbpp": 1.0
},
"n-shot": {
"bbh_cot_fewshot_causal_judgement": 3,
"bbh_cot_fewshot_date_understanding": 3,
"bbh_cot_fewshot_logical_deduction_five_objects": 3,
"bbh_cot_fewshot_navigate": 3,
"gsm8k_cot": 8,
"humaneval": 0,
"ifeval": 0,
"mbpp": 3
},
"higher_is_better": {
"bbh_cot_fewshot_causal_judgement": {
"exact_match": true
},
"bbh_cot_fewshot_date_understanding": {
"exact_match": true
},
"bbh_cot_fewshot_logical_deduction_five_objects": {
"exact_match": true
},
"bbh_cot_fewshot_navigate": {
"exact_match": true
},
"gsm8k_cot": {
"exact_match": true
},
"humaneval": {
"pass_at_k": true
},
"ifeval": {
"prompt_level_strict_acc": true,
"inst_level_strict_acc": true,
"prompt_level_loose_acc": true,
"inst_level_loose_acc": true
},
"mbpp": {
"pass_at_1": true
}
},
"n-samples": {
"mbpp": {
"original": 500,
"effective": 50
},
"humaneval": {
"original": 164,
"effective": 50
},
"bbh_cot_fewshot_navigate": {
"original": 250,
"effective": 50
},
"bbh_cot_fewshot_logical_deduction_five_objects": {
"original": 250,
"effective": 50
},
"bbh_cot_fewshot_date_understanding": {
"original": 250,
"effective": 50
},
"bbh_cot_fewshot_causal_judgement": {
"original": 187,
"effective": 50
},
"ifeval": {
"original": 541,
"effective": 50
},
"gsm8k_cot": {
"original": 1319,
"effective": 50
}
},
"config": {
"model": "local-chat-completions",
"model_args": {
"model": "devstral",
"base_url": "http://localhost:8089/v1/chat/completions",
"num_concurrent": 1,
"max_retries": 3,
"tokenized_requests": false,
"eos_string": "</s>"
},
"batch_size": 1,
"batch_sizes": [],
"device": "cuda:0",
"use_cache": null,
"limit": 50.0,
"bootstrap_iters": 100000,
"gen_kwargs": {},
"random_seed": 0,
"numpy_seed": 1234,
"torch_seed": 1234,
"fewshot_seed": 1234
},
"git_hash": null,
"date": 1773859592.6450324,
"pretty_env_info": "N/A (torch not installed)",
"transformers_version": "4.57.3",
"lm_eval_version": "0.4.11",
"upper_git_hash": null,
"task_hashes": {},
"model_source": "local-chat-completions",
"model_name": "devstral",
"model_name_sanitized": "devstral",
"system_instruction": null,
"system_instruction_sha": null,
"fewshot_as_multiturn": true,
"chat_template": "",
"chat_template_sha": null,
"total_evaluation_time_seconds": "2742.0724245810125"
}