diff --git "a/metrics.eval.jsonl" "b/metrics.eval.jsonl"
--- "a/metrics.eval.jsonl"
+++ "b/metrics.eval.jsonl"
@@ -193,3 +193,11 @@
{"created_at": "2025-05-02T19:04:04.844533", "global_step": 480000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4180887372013652, "acc_stderr,none": 0.014413988396996088, "acc_norm,none": 0.439419795221843, "acc_norm_stderr,none": 0.014503747823580125}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7285353535353535, "acc_stderr,none": 0.009125362970360625, "acc_norm,none": 0.7150673400673401, "acc_norm_stderr,none": 0.009262170695590656}, "boolq": {"alias": "boolq", "acc,none": 0.771559633027523, "acc_stderr,none": 0.007342834051148581}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.27354627354627353, "acc_stderr,none": 0.012762608994259371}, "copa": {"alias": "copa", "acc,none": 0.79, "acc_stderr,none": 0.04093601807403326}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.476000796654053, "acc_stderr,none": 0.004984030250507296, "acc_norm,none": 0.6428002389962159, "acc_norm_stderr,none": 0.004781950883460504}, "mmlu": {"acc,none": 0.34147557328015954, "acc_stderr,none": 0.003967972569487562, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.32688629117959617, "acc_stderr,none": 0.006777844982407219, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047182}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4, "acc_stderr,none": 0.038254602783800246}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.45588235294117646, "acc_stderr,none": 0.03495624522015473}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.48523206751054854, "acc_stderr,none": 0.032533028078777386}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4214876033057851, "acc_stderr,none": 0.04507732278775094}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.04643454608906274}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.34355828220858897, "acc_stderr,none": 0.03731133519673891}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3236994219653179, "acc_stderr,none": 0.025190181327608415}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.36012861736334406, "acc_stderr,none": 0.027264297599804015}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3765432098765432, "acc_stderr,none": 0.026959344518747787}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30964797913950454, "acc_stderr,none": 0.011808598262503318}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.035469769593931624}, "mmlu_other": {"acc,none": 0.3804312841969746, "acc_stderr,none": 0.008681854364792048, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37735849056603776, "acc_stderr,none": 0.029832808114796005}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3930635838150289, "acc_stderr,none": 0.0372424959581773}, "mmlu_global_facts": {"alias": " - 
global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.42152466367713004, "acc_stderr,none": 0.03314190222110656}, "mmlu_management": {"alias": " - management", "acc,none": 0.4077669902912621, "acc_stderr,none": 0.04865777570410769}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44871794871794873, "acc_stderr,none": 0.0325833464938688}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.42017879948914433, "acc_stderr,none": 0.017650651363078012}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684934}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140245}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.02841820861940679}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3433734939759036, "acc_stderr,none": 0.036965843170106004}, "mmlu_social_sciences": {"acc,none": 0.368540786480338, "acc_stderr,none": 0.008663159748711824, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.04372748290278008}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3282051282051282, "acc_stderr,none": 0.023807633198657262}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3445378151260504, "acc_stderr,none": 0.030868682604121626}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3944954128440367, "acc_stderr,none": 0.020954642108587485}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.041184385658062976}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.01941253924203216}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.4, "acc_stderr,none": 0.0469237132203465}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2979591836734694, "acc_stderr,none": 0.02927956741106568}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48258706467661694, "acc_stderr,none": 0.035333892347392454}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_stem": {"acc,none": 0.29844592451633367, "acc_stderr,none": 0.008081681519214772, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.362962962962963, "acc_stderr,none": 0.04153948404742399}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137282}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3194444444444444, "acc_stderr,none": 
0.03899073687357335}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534446}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.022569897074918417}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4290322580645161, "acc_stderr,none": 0.02815603653823321}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.03194740072265541}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371217}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987053}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422256}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.0443280405529152}, "mmlu_pro": {"exact_match,custom-extract": 0.18035239361702127, "exact_match_stderr,custom-extract": 0.0034727854149345185, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.32217573221757323, "exact_match_stderr,custom-extract": 0.01746419040868442}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1596958174904943, "exact_match_stderr,custom-extract": 0.01304974197804603}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10777385159010601, "exact_match_stderr,custom-extract": 0.00922067835765709}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1926829268292683, "exact_match_stderr,custom-extract": 0.019502129313734493}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24407582938388625, "exact_match_stderr,custom-extract": 0.01479407157778744}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15479876160990713, "exact_match_stderr,custom-extract": 0.011625887729987494}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2273838630806846, "exact_match_stderr,custom-extract": 0.01466394014668951}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827783}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13533151680290645, 
"exact_match_stderr,custom-extract": 0.01031401946878533}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.154700222057735, "exact_match_stderr,custom-extract": 0.009842013620867987}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.16774891774891776, "exact_match_stderr,custom-extract": 0.01229861474036573}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1523046092184369, "exact_match_stderr,custom-extract": 0.01610133043651402}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16782140107775212, "exact_match_stderr,custom-extract": 0.010372766376157203}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2518796992481203, "exact_match_stderr,custom-extract": 0.015376345973657985}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.019966103540279462, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7268770402611534, "acc_stderr,none": 0.010395730264453267, "acc_norm,none": 0.7230685527747551, "acc_norm_stderr,none": 0.010440499969334535}, "race": {"alias": "race", "acc,none": 0.3712918660287081, "acc_stderr,none": 0.01495312651508941}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44575230296827023, "acc_stderr,none": 0.01124728305057907}, "winogrande": {"alias": "winogrande", "acc,none": 0.6527229676400947, "acc_stderr,none": 0.013380909249751246}} {"created_at": "2025-05-02T20:41:11.508158", "global_step": 482000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4052901023890785, "acc_stderr,none": 0.014346869060229334, "acc_norm,none": 0.43686006825938567, "acc_norm_stderr,none": 0.014494421584256515}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7281144781144782, "acc_stderr,none": 0.009129795867310494, "acc_norm,none": 0.7138047138047138, "acc_norm_stderr,none": 0.009274470774627728}, "boolq": {"alias": "boolq", "acc,none": 0.7672782874617737, "acc_stderr,none": 0.007390731859680137}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2932022932022932, "acc_stderr,none": 0.0130332081673615}, "copa": {"alias": "copa", "acc,none": 0.78, "acc_stderr,none": 0.041633319989322626}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47450707030472017, "acc_stderr,none": 0.0049832915782890425, "acc_norm,none": 0.6401115315674168, "acc_norm_stderr,none": 0.004789865379084506}, "mmlu": {"acc,none": 0.3344252955419456, "acc_stderr,none": 0.0039490781729772245, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3241232731137088, "acc_stderr,none": 0.006754286113231627, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4, "acc_stderr,none": 0.038254602783800246}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46568627450980393, "acc_stderr,none": 0.03501038327635897}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5021097046413502, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4214876033057851, "acc_stderr,none": 0.045077322787750944}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.39814814814814814, "acc_stderr,none": 0.04732332615978815}, "mmlu_logical_fallacies": {"alias": " - 
logical_fallacies", "acc,none": 0.3558282208588957, "acc_stderr,none": 0.03761521380046734}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3236994219653179, "acc_stderr,none": 0.025190181327608422}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767864}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3247588424437299, "acc_stderr,none": 0.026596782287697043}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3765432098765432, "acc_stderr,none": 0.026959344518747784}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.29986962190352023, "acc_stderr,none": 0.011702660860193998}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.03546976959393163}, "mmlu_other": {"acc,none": 0.3723849372384937, "acc_stderr,none": 0.008638494454770823, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.029711421880107922}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.03681229633394319}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4349775784753363, "acc_stderr,none": 0.033272833702713445}, "mmlu_management": {"alias": " - management", "acc,none": 0.3592233009708738, "acc_stderr,none": 0.047504583990416946}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.03255326307272485}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4086845466155811, "acc_stderr,none": 0.017579250148153393}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.32679738562091504, "acc_stderr,none": 0.02685729466328142}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.30514705882352944, "acc_stderr,none": 0.0279715413701706}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.03740059382029321}, "mmlu_social_sciences": {"acc,none": 0.3493662658433539, "acc_stderr,none": 0.00856634801852034, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.0433913832257986}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30303030303030304, "acc_stderr,none": 0.032742879140268674}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.39378238341968913, "acc_stderr,none": 0.03526077095548237}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3128205128205128, "acc_stderr,none": 0.023507579020645365}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.029597329730978082}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.363302752293578, 
"acc_stderr,none": 0.020620603919625807}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677698}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3660130718954248, "acc_stderr,none": 0.019488025745529665}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425464}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0289205832206756}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.4527363184079602, "acc_stderr,none": 0.03519702717576915}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_stem": {"acc,none": 0.2978116079923882, "acc_stderr,none": 0.008085516897590603, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.37777777777777777, "acc_stderr,none": 0.04188307537595853}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03583496176361063}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.039420826399272135}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.34893617021276596, "acc_stderr,none": 0.031158522131357773}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.021851509822031708}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4032258064516129, "acc_stderr,none": 0.02790615082604114}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.027080372815145668}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285714}, "mmlu_pro": {"exact_match,custom-extract": 0.18276263297872342, "exact_match_stderr,custom-extract": 
0.0034949896205120033, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3054393305439331, "exact_match_stderr,custom-extract": 0.017213178087194265}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.15462610899873258, "exact_match_stderr,custom-extract": 0.01287961022947729}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1166077738515901, "exact_match_stderr,custom-extract": 0.009543534246504533}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2048780487804878, "exact_match_stderr,custom-extract": 0.01995735269083492}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23933649289099526, "exact_match_stderr,custom-extract": 0.014695587900810768}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1651186790505676, "exact_match_stderr,custom-extract": 0.0119336362631826}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2310513447432763, "exact_match_stderr,custom-extract": 0.014746599750625127}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1889763779527559, "exact_match_stderr,custom-extract": 0.02008300581197246}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13533151680290645, "exact_match_stderr,custom-extract": 0.010314019468785329}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.16506291635825315, "exact_match_stderr,custom-extract": 0.010103800165231483}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.15584415584415584, "exact_match_stderr,custom-extract": 0.011938663890309693}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1523046092184369, "exact_match_stderr,custom-extract": 0.016101330436514016}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16782140107775212, "exact_match_stderr,custom-extract": 0.010372766376157212}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2619047619047619, "exact_match_stderr,custom-extract": 0.015573948649345878}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.272, "acc_stderr,none": 0.019920483209566072, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7279651795429815, "acc_stderr,none": 0.01038276378624738, "acc_norm,none": 0.7219804134929271, "acc_norm_stderr,none": 0.010453117358332811}, "race": {"alias": "race", "acc,none": 0.3751196172248804, "acc_stderr,none": 0.014984183551431952}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44779938587512796, "acc_stderr,none": 0.011252242102001767}, "winogrande": {"alias": "winogrande", "acc,none": 0.6629834254143646, "acc_stderr,none": 0.01328495576939525}} {"created_at": "2025-05-02T22:30:13.812873", "global_step": 484000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4129692832764505, "acc_stderr,none": 0.014388344935398324, "acc_norm,none": 0.44283276450511944, "acc_norm_stderr,none": 0.014515573873348907}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.734006734006734, "acc_stderr,none": 0.009066789565615694, "acc_norm,none": 0.7192760942760943, "acc_norm_stderr,none": 0.009220526174711356}, "boolq": {"alias": "boolq", "acc,none": 0.7694189602446483, "acc_stderr,none": 0.007366917025520441}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.28501228501228504, 
"acc_stderr,none": 0.012924125439047213}, "copa": {"alias": "copa", "acc,none": 0.79, "acc_stderr,none": 0.04093601807403326}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47460665206134234, "acc_stderr,none": 0.004983342213776258, "acc_norm,none": 0.6410077673770165, "acc_norm_stderr,none": 0.0047872453779671045}, "mmlu": {"acc,none": 0.3336419313488107, "acc_stderr,none": 0.0039438524879566, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3249734325185972, "acc_stderr,none": 0.0067554867542048945, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102148}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4121212121212121, "acc_stderr,none": 0.038435669935887165}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46078431372549017, "acc_stderr,none": 0.03498501649369527}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.48945147679324896, "acc_stderr,none": 0.032539983791662855}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.45454545454545453, "acc_stderr,none": 0.04545454545454546}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.047500773411999854}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3619631901840491, "acc_stderr,none": 0.037757007291414416}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3208092485549133, "acc_stderr,none": 0.0251310002336479}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3440514469453376, "acc_stderr,none": 0.026981478043648022}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.36728395061728397, "acc_stderr,none": 0.026822801759507898}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3011734028683181, "acc_stderr,none": 0.011717148751648431}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.035650796707083106}, "mmlu_other": {"acc,none": 0.3746379143868684, "acc_stderr,none": 0.008656226181610889, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37358490566037733, "acc_stderr,none": 0.029773082713319875}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3872832369942196, "acc_stderr,none": 0.037143259063020656}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4349775784753363, "acc_stderr,none": 0.033272833702713445}, "mmlu_management": {"alias": " - management", "acc,none": 0.34951456310679613, "acc_stderr,none": 0.047211885060971716}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.43162393162393164, "acc_stderr,none": 0.0324483553531149}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.43, "acc_stderr,none": 0.04975698519562428}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4163473818646232, "acc_stderr,none": 0.017627948030430298}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.32679738562091504, 
"acc_stderr,none": 0.026857294663281413}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.30514705882352944, "acc_stderr,none": 0.027971541370170598}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.03740059382029321}, "mmlu_social_sciences": {"acc,none": 0.3457913552161196, "acc_stderr,none": 0.008543185295853226, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.0433913832257986}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.31313131313131315, "acc_stderr,none": 0.033042050878136525}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.39896373056994816, "acc_stderr,none": 0.03533999094065696}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.29743589743589743, "acc_stderr,none": 0.023177408131465956}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.02934457250063434}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3504587155963303, "acc_stderr,none": 0.020456077599824457}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467766}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.019506291693954847}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.4626865671641791, "acc_stderr,none": 0.035256751674679745}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.45, "acc_stderr,none": 0.05}, "mmlu_stem": {"acc,none": 0.2943228671106882, "acc_stderr,none": 0.008037384970229454, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.04244633238353228}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.26973684210526316, "acc_stderr,none": 0.03611780560284898}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3402777777777778, "acc_stderr,none": 0.03962135573486219}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.05021167315686781}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 
0.030976692998534443}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.022101128787415426}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4096774193548387, "acc_stderr,none": 0.027976054915347357}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959912}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.03543304234389985}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.02699145450203672}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.18392619680851063, "exact_match_stderr,custom-extract": 0.0034958263771979322, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3305439330543933, "exact_match_stderr,custom-extract": 0.01758001028718085}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16476552598225602, "exact_match_stderr,custom-extract": 0.013215216167850041}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10335689045936396, "exact_match_stderr,custom-extract": 0.009052076648374284}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2097560975609756, "exact_match_stderr,custom-extract": 0.020131503920840902}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.25118483412322273, "exact_match_stderr,custom-extract": 0.014937235759500112}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16305469556243551, "exact_match_stderr,custom-extract": 0.011873466052186893}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2273838630806846, "exact_match_stderr,custom-extract": 0.014663940146689517}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1837270341207349, "exact_match_stderr,custom-extract": 0.01986609191654633}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.14259763851044505, "exact_match_stderr,custom-extract": 0.010542707604685755}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14581791265729088, "exact_match_stderr,custom-extract": 0.009605363046932713}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17207792207792208, "exact_match_stderr,custom-extract": 0.012423857401451374}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1762894534257121, "exact_match_stderr,custom-extract": 0.010577015303223901}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 
0.2593984962406015, "exact_match_stderr,custom-extract": 0.015525545242193468}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.27, "acc_stderr,none": 0.01987435483128748, "acc_norm,none": 0.39, "acc_norm_stderr,none": 0.02183468586936921}, "piqa": {"alias": "piqa", "acc,none": 0.7290533188248096, "acc_stderr,none": 0.010369718937426843, "acc_norm,none": 0.7219804134929271, "acc_norm_stderr,none": 0.010453117358332811}, "race": {"alias": "race", "acc,none": 0.37799043062200954, "acc_stderr,none": 0.015006820447473677}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4508700102354145, "acc_stderr,none": 0.011259319269273937}, "winogrande": {"alias": "winogrande", "acc,none": 0.65982636148382, "acc_stderr,none": 0.013315218762417395}} +{"created_at": "2025-05-03T02:20:02.564155", "global_step": 486000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.40784982935153585, "acc_stderr,none": 0.01436109728844969, "acc_norm,none": 0.43856655290102387, "acc_norm_stderr,none": 0.014500682618212864}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7331649831649831, "acc_stderr,none": 0.00907591585926726, "acc_norm,none": 0.7209595959595959, "acc_norm_stderr,none": 0.009203588704032635}, "boolq": {"alias": "boolq", "acc,none": 0.7703363914373089, "acc_stderr,none": 0.0073566287371636604}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.29238329238329236, "acc_stderr,none": 0.013022531002213357}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47540330611431986, "acc_stderr,none": 0.004983740145218612, "acc_norm,none": 0.6413065126468831, "acc_norm_stderr,none": 0.004786368011500453}, "mmlu": {"acc,none": 0.3397664150405925, "acc_stderr,none": 0.003965236208796706, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3277364505844846, "acc_stderr,none": 0.006775454816409681, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047182}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4121212121212121, "acc_stderr,none": 0.038435669935887165}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.44607843137254904, "acc_stderr,none": 0.03488845451304974}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4978902953586498, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.045291468044357915}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.42592592592592593, "acc_stderr,none": 0.0478034362693679}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3312883435582822, "acc_stderr,none": 0.03697983910025589}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.025009313790069706}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.01431099954796145}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3536977491961415, "acc_stderr,none": 0.027155208103200882}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3734567901234568, "acc_stderr,none": 0.026915003011380147}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3089960886571056, "acc_stderr,none": 0.011801729777239256}, "mmlu_world_religions": 
{"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.3797875764402961, "acc_stderr,none": 0.008680271254254667, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3622641509433962, "acc_stderr,none": 0.0295822451283843}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3988439306358382, "acc_stderr,none": 0.03733626655383509}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4484304932735426, "acc_stderr,none": 0.03337883736255099}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.048257293373563895}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.452991452991453, "acc_stderr,none": 0.0326109987309862}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.45, "acc_stderr,none": 0.04999999999999999}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.40485312899106, "acc_stderr,none": 0.017553246467720256}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.02736359328468493}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.02668456434046099}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.33455882352941174, "acc_stderr,none": 0.028661996202335307}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.03740059382029321}, "mmlu_social_sciences": {"acc,none": 0.354891127721807, "acc_stderr,none": 0.008588640193149566, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30303030303030304, "acc_stderr,none": 0.032742879140268674}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.41968911917098445, "acc_stderr,none": 0.03561587327685884}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3128205128205128, "acc_stderr,none": 0.023507579020645365}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31092436974789917, "acc_stderr,none": 0.030066761582977924}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.363302752293578, "acc_stderr,none": 0.020620603919625807}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834278}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3627450980392157, "acc_stderr,none": 0.01945076843250551}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425464}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879815}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.47761194029850745, "acc_stderr,none": 0.03531987930208731}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", 
"acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_stem": {"acc,none": 0.30352045670789723, "acc_stderr,none": 0.008136846004156602, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04171654161354543}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3402777777777778, "acc_stderr,none": 0.03962135573486219}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3446808510638298, "acc_stderr,none": 0.03106898596312215}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.03780019230438014}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.02241804289111394}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4032258064516129, "acc_stderr,none": 0.027906150826041143}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0317852971064275}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712173}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.028963702570791013}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291519}, "mmlu_pro": {"exact_match,custom-extract": 0.18018617021276595, "exact_match_stderr,custom-extract": 0.0034673431817660166, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.32217573221757323, "exact_match_stderr,custom-extract": 0.01746419040868442}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1634980988593156, "exact_match_stderr,custom-extract": 0.013174274584326415}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10247349823321555, "exact_match_stderr,custom-extract": 0.009017748507579058}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.21707317073170732, "exact_match_stderr,custom-extract": 0.020384591313839226}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 
0.2476303317535545, "exact_match_stderr,custom-extract": 0.014866330095923867}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16615067079463364, "exact_match_stderr,custom-extract": 0.011963469940000507}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2200488997555012, "exact_match_stderr,custom-extract": 0.014493799859240616}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827787}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1371480472297911, "exact_match_stderr,custom-extract": 0.01037209807700217}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14285714285714285, "exact_match_stderr,custom-extract": 0.009523809523809466}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17316017316017315, "exact_match_stderr,custom-extract": 0.012454716571952209}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1462925851703407, "exact_match_stderr,custom-extract": 0.015836201263905444}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15473441108545036, "exact_match_stderr,custom-extract": 0.010038127358043922}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2669172932330827, "exact_match_stderr,custom-extract": 0.015668798035500312}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.268, "acc_stderr,none": 0.019827714859587568, "acc_norm,none": 0.392, "acc_norm_stderr,none": 0.021854684955611263}, "piqa": {"alias": "piqa", "acc,none": 0.7312295973884657, "acc_stderr,none": 0.010343392940090011, "acc_norm,none": 0.720348204570185, "acc_norm_stderr,none": 0.01047189953030656}, "race": {"alias": "race", "acc,none": 0.3770334928229665, "acc_stderr,none": 0.014999337089843356}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4524053224155578, "acc_stderr,none": 0.011262695440459564}, "winogrande": {"alias": "winogrande", "acc,none": 0.6582478295185478, "acc_stderr,none": 0.013330103018622856}} +{"created_at": "2025-05-03T02:21:07.841364", "global_step": 488000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4061433447098976, "acc_stderr,none": 0.014351656690097863, "acc_norm,none": 0.43430034129692835, "acc_norm_stderr,none": 0.01448470304885736}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7297979797979798, "acc_stderr,none": 0.00911200222911985, "acc_norm,none": 0.7209595959595959, "acc_norm_stderr,none": 0.009203588704032633}, "boolq": {"alias": "boolq", "acc,none": 0.763914373088685, "acc_stderr,none": 0.007427619611412126}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.28746928746928746, "acc_stderr,none": 0.012957392226225588}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909284}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47450707030472017, "acc_stderr,none": 0.0049832915782890425, "acc_norm,none": 0.6409081856203943, "acc_norm_stderr,none": 0.004787537385153012}, "mmlu": {"acc,none": 0.341760432986754, "acc_stderr,none": 0.003966290188369764, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.33156216790648246, "acc_stderr,none": 0.0067828713916655075, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.03619604524124249}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 
0.42424242424242425, "acc_stderr,none": 0.03859268142070262}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.47058823529411764, "acc_stderr,none": 0.03503235296367992}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.510548523206751, "acc_stderr,none": 0.032539983791662855}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4628099173553719, "acc_stderr,none": 0.04551711196104218}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.04643454608906274}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.34355828220858897, "acc_stderr,none": 0.03731133519673891}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.025305258131879716}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24022346368715083, "acc_stderr,none": 0.0142883438039253}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3762057877813505, "acc_stderr,none": 0.02751392568354943}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3765432098765432, "acc_stderr,none": 0.026959344518747784}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3044328552803129, "acc_stderr,none": 0.011752877592597579}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3567251461988304, "acc_stderr,none": 0.03674013002860954}, "mmlu_other": {"acc,none": 0.37721274541358224, "acc_stderr,none": 0.0086755519187428, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.38113207547169814, "acc_stderr,none": 0.02989060968628663}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.37572254335260113, "acc_stderr,none": 0.03692820767264867}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4304932735426009, "acc_stderr,none": 0.033231973029429394}, "mmlu_management": {"alias": " - management", "acc,none": 0.39805825242718446, "acc_stderr,none": 0.048467482539772386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.43162393162393164, "acc_stderr,none": 0.0324483553531149}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4112388250319285, "acc_stderr,none": 0.01759597190805657}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.02736359328468494}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34191176470588236, "acc_stderr,none": 0.028814722422254177}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.36529086772830677, "acc_stderr,none": 0.00863478536388612, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537316}, "mmlu_high_school_geography": {"alias": " - high_school_geography", 
"acc,none": 0.3383838383838384, "acc_stderr,none": 0.03371124142626302}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.30256410256410254, "acc_stderr,none": 0.023290888053772715}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.02995382389188703}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3908256880733945, "acc_stderr,none": 0.020920058346111065}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.041184385658062976}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.37745098039215685, "acc_stderr,none": 0.019610851474880276}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48756218905472637, "acc_stderr,none": 0.0353443984853958}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_stem": {"acc,none": 0.2990802410402791, "acc_stderr,none": 0.008091408185819379, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.26973684210526316, "acc_stderr,none": 0.036117805602848975}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3194444444444444, "acc_stderr,none": 0.038990736873573344}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.0379328118530781}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33617021276595743, "acc_stderr,none": 0.030881618520676942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4129032258064516, "acc_stderr,none": 0.028009138125400387}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, 
"mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.0443280405529152}, "mmlu_pro": {"exact_match,custom-extract": 0.18342752659574468, "exact_match_stderr,custom-extract": 0.0034929873225827486, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.32496513249651326, "exact_match_stderr,custom-extract": 0.017503503047556074}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.17363751584283904, "exact_match_stderr,custom-extract": 0.013494101406164668}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1068904593639576, "exact_match_stderr,custom-extract": 0.009187355756744646}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.21707317073170732, "exact_match_stderr,custom-extract": 0.020384591313839226}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.245260663507109, "exact_match_stderr,custom-extract": 0.014818309281701568}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.17234262125902994, "exact_match_stderr,custom-extract": 0.012139029421075573}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22616136919315402, "exact_match_stderr,custom-extract": 0.014636033244302024}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.17585301837270342, "exact_match_stderr,custom-extract": 0.019529244892152534}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13169845594913715, "exact_match_stderr,custom-extract": 0.010195987296692636}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.16432272390821615, "exact_match_stderr,custom-extract": 0.010085588042335562}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1590909090909091, "exact_match_stderr,custom-extract": 0.012039164679107935}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15030060120240482, "exact_match_stderr,custom-extract": 0.01601394538357726}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15242494226327943, "exact_match_stderr,custom-extract": 0.009976535615945838}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2681704260651629, "exact_match_stderr,custom-extract": 0.015692106905487214}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 0.0197328855859221, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7306855277475517, "acc_stderr,none": 0.01035000407058876, "acc_norm,none": 0.7247007616974973, "acc_norm_stderr,none": 0.01042142927736953}, "race": {"alias": "race", "acc,none": 0.3827751196172249, "acc_stderr,none": 0.015043306814111515}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4498464687819857, "acc_stderr,none": 0.0112570083604857}, "winogrande": {"alias": "winogrande", "acc,none": 0.6637726913970008, "acc_stderr,none": 
0.013277286593993442}} +{"created_at": "2025-05-03T04:09:05.390677", "global_step": 490000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4035836177474403, "acc_stderr,none": 0.014337158914268457, "acc_norm,none": 0.4325938566552901, "acc_norm_stderr,none": 0.014478005694182531}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7314814814814815, "acc_stderr,none": 0.009094042554994856, "acc_norm,none": 0.7209595959595959, "acc_norm_stderr,none": 0.009203588704032633}, "boolq": {"alias": "boolq", "acc,none": 0.7642201834862385, "acc_stderr,none": 0.00742429301951045}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2891072891072891, "acc_stderr,none": 0.012979310916953698}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4751045608444533, "acc_stderr,none": 0.004983592410934164, "acc_norm,none": 0.6406094403505278, "acc_norm_stderr,none": 0.004788412062375702}, "mmlu": {"acc,none": 0.33819968665432276, "acc_stderr,none": 0.003959929814293297, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.32624867162592985, "acc_stderr,none": 0.006768024364199171, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276863}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.43636363636363634, "acc_stderr,none": 0.03872592983524754}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46078431372549017, "acc_stderr,none": 0.03498501649369527}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.47257383966244726, "acc_stderr,none": 0.03249822718301303}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4628099173553719, "acc_stderr,none": 0.04551711196104218}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37962962962962965, "acc_stderr,none": 0.04691521224077742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.36809815950920244, "acc_stderr,none": 0.03789213935838396}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.025009313790069716}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098423}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3504823151125402, "acc_stderr,none": 0.027098652621301747}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.026725868809100786}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3044328552803129, "acc_stderr,none": 0.011752877592597577}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.37141937560347604, "acc_stderr,none": 0.008652684909473572, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3660377358490566, "acc_stderr,none": 0.029647813539365242}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.36416184971098264, "acc_stderr,none": 0.03669072477416907}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": 
{"alias": " - human_aging", "acc,none": 0.4125560538116592, "acc_stderr,none": 0.03304062175449296}, "mmlu_management": {"alias": " - management", "acc,none": 0.36893203883495146, "acc_stderr,none": 0.0477761518115674}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.43162393162393164, "acc_stderr,none": 0.0324483553531149}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4061302681992337, "acc_stderr,none": 0.017562037406478916}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684955}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3272058823529412, "acc_stderr,none": 0.028501452860396563}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3373493975903614, "acc_stderr,none": 0.0368078369072758}, "mmlu_social_sciences": {"acc,none": 0.3574910627234319, "acc_stderr,none": 0.008591601703531852, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.41450777202072536, "acc_stderr,none": 0.03555300319557672}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28974358974358977, "acc_stderr,none": 0.023000628243687978}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.02995382389188703}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3743119266055046, "acc_stderr,none": 0.020748959408988306}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677698}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.38235294117647056, "acc_stderr,none": 0.019659922493623343}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425465}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2938775510204082, "acc_stderr,none": 0.029162738410249772}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.47761194029850745, "acc_stderr,none": 0.03531987930208731}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_stem": {"acc,none": 0.3044719314938154, "acc_stderr,none": 0.008124896244499831, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3925925925925926, "acc_stderr,none": 0.04218506215368879}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998905}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3402777777777778, "acc_stderr,none": 0.039621355734862175}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, 
"acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.51, "acc_stderr,none": 0.05024183937956913}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33617021276595743, "acc_stderr,none": 0.030881618520676942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491842}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4129032258064516, "acc_stderr,none": 0.02800913812540039}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03255086769970103}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712166}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987053}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863448}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285713}, "mmlu_pro": {"exact_match,custom-extract": 0.18384308510638298, "exact_match_stderr,custom-extract": 0.0034963470869140215, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3291492329149233, "exact_match_stderr,custom-extract": 0.017561146780265928}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16856780735107732, "exact_match_stderr,custom-extract": 0.013336369763619692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11219081272084806, "exact_match_stderr,custom-extract": 0.009384414071431937}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2, "exact_match_stderr,custom-extract": 0.019778727057365927}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24170616113744076, "exact_match_stderr,custom-extract": 0.014745137840103598}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15479876160990713, "exact_match_stderr,custom-extract": 0.011625887729987489}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22860635696821516, "exact_match_stderr,custom-extract": 0.014691669532004212}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827787}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13169845594913715, "exact_match_stderr,custom-extract": 0.010195987296692636}, "mmlu_pro_math": {"alias": " - math", 
"exact_match,custom-extract": 0.153960029607698, "exact_match_stderr,custom-extract": 0.00982273775259319}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17316017316017315, "exact_match_stderr,custom-extract": 0.012454716571952212}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16633266533066132, "exact_match_stderr,custom-extract": 0.016686701398526124}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16782140107775212, "exact_match_stderr,custom-extract": 0.010372766376157203}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2719298245614035, "exact_match_stderr,custom-extract": 0.015761076648135943}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655486, "acc_norm,none": 0.394, "acc_norm_stderr,none": 0.021874299301689253}, "piqa": {"alias": "piqa", "acc,none": 0.7268770402611534, "acc_stderr,none": 0.010395730264453267, "acc_norm,none": 0.720348204570185, "acc_norm_stderr,none": 0.01047189953030656}, "race": {"alias": "race", "acc,none": 0.37894736842105264, "acc_stderr,none": 0.015014241655133452}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4498464687819857, "acc_stderr,none": 0.0112570083604857}, "winogrande": {"alias": "winogrande", "acc,none": 0.6535122336227308, "acc_stderr,none": 0.013373773411685653}} +{"created_at": "2025-05-03T06:08:32.025138", "global_step": 492000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4069965870307167, "acc_stderr,none": 0.014356399418009137, "acc_norm,none": 0.4402730375426621, "acc_norm_stderr,none": 0.014506769524804243}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7327441077441077, "acc_stderr,none": 0.00908046324601747, "acc_norm,none": 0.7196969696969697, "acc_norm_stderr,none": 0.009216306864088034}, "boolq": {"alias": "boolq", "acc,none": 0.772782874617737, "acc_stderr,none": 0.007328950945979886}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.28746928746928746, "acc_stderr,none": 0.012957392226225588}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47560246962756425, "acc_stderr,none": 0.0049838376415028965, "acc_norm,none": 0.641804421429994, "acc_norm_stderr,none": 0.00478490124855871}, "mmlu": {"acc,none": 0.34354080615296967, "acc_stderr,none": 0.003974731025609914, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.33156216790648246, "acc_stderr,none": 0.006794278058752767, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276863}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.44242424242424244, "acc_stderr,none": 0.038783721137112745}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46568627450980393, "acc_stderr,none": 0.03501038327635897}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.48523206751054854, "acc_stderr,none": 0.032533028078777386}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4462809917355372, "acc_stderr,none": 0.0453793517794788}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.047500773411999854}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3619631901840491, "acc_stderr,none": 
0.037757007291414416}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.025009313790069713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225615}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.34726688102893893, "acc_stderr,none": 0.027040745502307336}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3734567901234568, "acc_stderr,none": 0.026915003011380147}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.31421121251629724, "acc_stderr,none": 0.011855911587048228}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.3778564531702607, "acc_stderr,none": 0.008676211645032454, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37358490566037733, "acc_stderr,none": 0.02977308271331987}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.036812296333943194}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4304932735426009, "acc_stderr,none": 0.033231973029429394}, "mmlu_management": {"alias": " - management", "acc,none": 0.39805825242718446, "acc_stderr,none": 0.048467482539772386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44017094017094016, "acc_stderr,none": 0.032520741720630506}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.049888765156985884}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.41379310344827586, "acc_stderr,none": 0.01761220408466377}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3464052287581699, "acc_stderr,none": 0.027245613047215355}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.28368794326241137, "acc_stderr,none": 0.02689170942834396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.33088235294117646, "acc_stderr,none": 0.02858270975389844}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3433734939759036, "acc_stderr,none": 0.036965843170106004}, "mmlu_social_sciences": {"acc,none": 0.36626584335391615, "acc_stderr,none": 0.008652052188694258, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.044045561573747685}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.31313131313131315, "acc_stderr,none": 0.033042050878136525}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.41450777202072536, "acc_stderr,none": 0.03555300319557672}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3153846153846154, "acc_stderr,none": 0.023559646983189932}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3277310924369748, "acc_stderr,none": 0.03048991141767323}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3926605504587156, "acc_stderr,none": 0.020937505161201093}, "mmlu_human_sexuality": 
{"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.041423137719966634}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.37254901960784315, "acc_stderr,none": 0.019559646809215934}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.30612244897959184, "acc_stderr,none": 0.02950489645459595}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48756218905472637, "acc_stderr,none": 0.03534439848539579}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_stem": {"acc,none": 0.3054234062797336, "acc_stderr,none": 0.008133938815204834, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.03981240543717861}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534443}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.43548387096774194, "acc_stderr,none": 0.028206225591502737}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712166}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987053}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.028963702570791016}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.18218085106382978, "exact_match_stderr,custom-extract": 0.003484810620377296, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - 
biology", "exact_match,custom-extract": 0.3179916317991632, "exact_match_stderr,custom-extract": 0.017403884250878417}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.17110266159695817, "exact_match_stderr,custom-extract": 0.013415771307906003}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10424028268551237, "exact_match_stderr,custom-extract": 0.009086199159417191}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2121951219512195, "exact_match_stderr,custom-extract": 0.020216937884754132}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24052132701421802, "exact_match_stderr,custom-extract": 0.014720440282752337}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1568627450980392, "exact_match_stderr,custom-extract": 0.0116888387242459}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22127139364303178, "exact_match_stderr,custom-extract": 0.014522609899199774}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827787}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1344232515894641, "exact_match_stderr,custom-extract": 0.010284747799160813}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.15247964470762398, "exact_match_stderr,custom-extract": 0.00978394764898708}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17207792207792208, "exact_match_stderr,custom-extract": 0.012423857401451376}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712511}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1739799846035412, "exact_match_stderr,custom-extract": 0.010522224976679326}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2719298245614035, "exact_match_stderr,custom-extract": 0.01576107664813595}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 0.019732885585922098, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7306855277475517, "acc_stderr,none": 0.01035000407058876, "acc_norm,none": 0.7247007616974973, "acc_norm_stderr,none": 0.01042142927736953}, "race": {"alias": "race", "acc,none": 0.3799043062200957, "acc_stderr,none": 0.01502160080493565}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4508700102354145, "acc_stderr,none": 0.011259319269273938}, "winogrande": {"alias": "winogrande", "acc,none": 0.6566692975532754, "acc_stderr,none": 0.01334482318535801}} +{"created_at": "2025-05-03T07:42:11.506429", "global_step": 494000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4035836177474403, "acc_stderr,none": 0.014337158914268457, "acc_norm,none": 0.43856655290102387, "acc_norm_stderr,none": 0.014500682618212864}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7331649831649831, "acc_stderr,none": 0.009075915859267258, "acc_norm,none": 0.7171717171717171, "acc_norm_stderr,none": 0.009241472775328228}, "boolq": {"alias": "boolq", "acc,none": 0.7691131498470948, "acc_stderr,none": 0.007370335500493339}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2915642915642916, "acc_stderr,none": 0.013011802821401595}, "copa": {"alias": "copa", "acc,none": 0.77, 
"acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47520414260107546, "acc_stderr,none": 0.004983641854351153, "acc_norm,none": 0.6419040031866162, "acc_norm_stderr,none": 0.004784607222774649}, "mmlu": {"acc,none": 0.3453923942458339, "acc_stderr,none": 0.003980840239935939, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3292242295430393, "acc_stderr,none": 0.0067810483739687176, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.03619604524124249}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.42424242424242425, "acc_stderr,none": 0.03859268142070262}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46078431372549017, "acc_stderr,none": 0.03498501649369527}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4978902953586498, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4462809917355372, "acc_stderr,none": 0.04537935177947879}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37962962962962965, "acc_stderr,none": 0.04691521224077742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3619631901840491, "acc_stderr,none": 0.037757007291414416}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3208092485549133, "acc_stderr,none": 0.025131000233647897}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225617}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3504823151125402, "acc_stderr,none": 0.027098652621301744}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3765432098765432, "acc_stderr,none": 0.026959344518747787}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30378096479791394, "acc_stderr,none": 0.011745787720472465}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.34502923976608185, "acc_stderr,none": 0.036459813773888065}, "mmlu_other": {"acc,none": 0.3817186997103315, "acc_stderr,none": 0.008701055208963249, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.02971142188010792}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3815028901734104, "acc_stderr,none": 0.03703851193099521}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4349775784753363, "acc_stderr,none": 0.033272833702713445}, "mmlu_management": {"alias": " - management", "acc,none": 0.39805825242718446, "acc_stderr,none": 0.048467482539772386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44871794871794873, "acc_stderr,none": 0.0325833464938688}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4086845466155811, "acc_stderr,none": 0.017579250148153397}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684937}, "mmlu_professional_accounting": {"alias": " - 
professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.02746470844202213}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.35661764705882354, "acc_stderr,none": 0.02909720956841195}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683227}, "mmlu_social_sciences": {"acc,none": 0.3721156971075723, "acc_stderr,none": 0.008688508540966224, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.044045561573747685}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.033586181457325226}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.33076923076923076, "acc_stderr,none": 0.023854795680971114}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3277310924369748, "acc_stderr,none": 0.03048991141767323}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3963302752293578, "acc_stderr,none": 0.02097146994790053}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3435114503816794, "acc_stderr,none": 0.041649760719448786}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.380718954248366, "acc_stderr,none": 0.01964380155792481}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.04582004841505416}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.31020408163265306, "acc_stderr,none": 0.029613459872484378}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.47761194029850745, "acc_stderr,none": 0.03531987930208731}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_stem": {"acc,none": 0.3076435141135427, "acc_stderr,none": 0.008155043722905417, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.035834961763610625}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.03981240543717861}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3574468085106383, "acc_stderr,none": 0.03132941789476425}, "mmlu_electrical_engineering": {"alias": " - 
electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.037528339580033376}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.42258064516129035, "acc_stderr,none": 0.02810096472427264}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145633}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371217}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.18267952127659576, "exact_match_stderr,custom-extract": 0.003492826921774581, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.30264993026499304, "exact_match_stderr,custom-extract": 0.017168770774579085}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16730038022813687, "exact_match_stderr,custom-extract": 0.01329626121085858}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11749116607773852, "exact_match_stderr,custom-extract": 0.009574824784172204}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.22195121951219512, "exact_match_stderr,custom-extract": 0.02054804589006829}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23933649289099526, "exact_match_stderr,custom-extract": 0.014695587900810762}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15583075335397317, "exact_match_stderr,custom-extract": 0.011657452909176065}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2310513447432763, "exact_match_stderr,custom-extract": 0.014746599750625127}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.17060367454068243, "exact_match_stderr,custom-extract": 0.019296717799305904}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1307901907356948, "exact_match_stderr,custom-extract": 0.010166080711813392}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.15247964470762398, "exact_match_stderr,custom-extract": 0.009783947648987079}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17532467532467533, "exact_match_stderr,custom-extract": 0.01251590249750937}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15831663326653306, "exact_match_stderr,custom-extract": 0.01635772767882581}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16320246343341033, "exact_match_stderr,custom-extract": 0.010257374338618739}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2694235588972431, "exact_match_stderr,custom-extract": 0.015715255828559514}, 
"openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 0.019732885585922098, "acc_norm,none": 0.398, "acc_norm_stderr,none": 0.021912377885779974}, "piqa": {"alias": "piqa", "acc,none": 0.7301414581066377, "acc_stderr,none": 0.010356595421852209, "acc_norm,none": 0.7225244831338411, "acc_norm_stderr,none": 0.010446818281039947}, "race": {"alias": "race", "acc,none": 0.3827751196172249, "acc_stderr,none": 0.015043306814111515}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45138178096212894, "acc_stderr,none": 0.01126045668162444}, "winogrande": {"alias": "winogrande", "acc,none": 0.6621941594317285, "acc_stderr,none": 0.013292583502910885}} +{"created_at": "2025-05-03T10:14:24.328785", "global_step": 496000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4052901023890785, "acc_stderr,none": 0.014346869060229337, "acc_norm,none": 0.44283276450511944, "acc_norm_stderr,none": 0.014515573873348904}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7306397306397306, "acc_stderr,none": 0.009103043207756983, "acc_norm,none": 0.718013468013468, "acc_norm_stderr,none": 0.009233124071053646}, "boolq": {"alias": "boolq", "acc,none": 0.7697247706422018, "acc_stderr,none": 0.007363493078403616}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.29074529074529076, "acc_stderr,none": 0.013001023498635364}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47540330611431986, "acc_stderr,none": 0.004983740145218612, "acc_norm,none": 0.6417048396733719, "acc_norm_stderr,none": 0.0047851950498891595}, "mmlu": {"acc,none": 0.34425295541945594, "acc_stderr,none": 0.0039768596251749, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3294367693942614, "acc_stderr,none": 0.00678167572474211, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.43636363636363634, "acc_stderr,none": 0.03872592983524754}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46078431372549017, "acc_stderr,none": 0.03498501649369527}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5021097046413502, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.045291468044357915}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.047128212574267705}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3558282208588957, "acc_stderr,none": 0.03761521380046734}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.02500931379006971}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3440514469453376, "acc_stderr,none": 0.026981478043648022}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.026869490744815247}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.31029986962190353, "acc_stderr,none": 0.011815439293469825}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, 
"acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.37914386868361766, "acc_stderr,none": 0.008687635435215911, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.02971142188010792}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.036812296333943194}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4304932735426009, "acc_stderr,none": 0.033231973029429394}, "mmlu_management": {"alias": " - management", "acc,none": 0.42718446601941745, "acc_stderr,none": 0.04897957737781168}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44017094017094016, "acc_stderr,none": 0.032520741720630506}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.43, "acc_stderr,none": 0.04975698519562428}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4086845466155811, "acc_stderr,none": 0.017579250148153397}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3562091503267974, "acc_stderr,none": 0.02742047766262923}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.0271871270115038}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3382352941176471, "acc_stderr,none": 0.028739328513983576}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.36821579460513487, "acc_stderr,none": 0.008664461483418769, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.044045561573747685}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.42487046632124353, "acc_stderr,none": 0.035674713352125395}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3282051282051282, "acc_stderr,none": 0.023807633198657262}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.030684737115135363}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3963302752293578, "acc_stderr,none": 0.02097146994790053}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.041184385658062976}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3660130718954248, "acc_stderr,none": 0.01948802574552967}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425464}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48258706467661694, "acc_stderr,none": 0.03533389234739245}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 
0.050211673156867795}, "mmlu_stem": {"acc,none": 0.30859498889946085, "acc_stderr,none": 0.008153618277361684, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3925925925925926, "acc_stderr,none": 0.04218506215368879}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.035834961763610625}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3541666666666667, "acc_stderr,none": 0.039994111357535424}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.34893617021276596, "acc_stderr,none": 0.031158522131357773}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.432258064516129, "acc_stderr,none": 0.02818173972001941}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233484}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.02696242432507383}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987053}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.028963702570791016}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.045218299028335865}, "mmlu_pro": {"exact_match,custom-extract": 0.18201462765957446, "exact_match_stderr,custom-extract": 0.003484247706736926, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.31520223152022314, "exact_match_stderr,custom-extract": 0.01736278145457227}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16476552598225602, "exact_match_stderr,custom-extract": 0.013215216167850041}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10865724381625441, "exact_match_stderr,custom-extract": 0.009253806404080385}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.21951219512195122, "exact_match_stderr,custom-extract": 0.020466837110489664}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23933649289099526, 
"exact_match_stderr,custom-extract": 0.014695587900810766}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15995872033023736, "exact_match_stderr,custom-extract": 0.011781934278001626}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22616136919315402, "exact_match_stderr,custom-extract": 0.014636033244302024}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.17060367454068243, "exact_match_stderr,custom-extract": 0.019296717799305904}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1307901907356948, "exact_match_stderr,custom-extract": 0.010166080711813392}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.16136195410806808, "exact_match_stderr,custom-extract": 0.010012002939971329}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1699134199134199, "exact_match_stderr,custom-extract": 0.01236159999946829}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14829659318637275, "exact_match_stderr,custom-extract": 0.0159255744939775}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15704387990762125, "exact_match_stderr,custom-extract": 0.010098936605569635}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2719298245614035, "exact_match_stderr,custom-extract": 0.015761076648135947}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655486, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7279651795429815, "acc_stderr,none": 0.01038276378624738, "acc_norm,none": 0.7181719260065288, "acc_norm_stderr,none": 0.010496675231258164}, "race": {"alias": "race", "acc,none": 0.38086124401913873, "acc_stderr,none": 0.015028897988042758}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45138178096212894, "acc_stderr,none": 0.011260456681624438}, "winogrande": {"alias": "winogrande", "acc,none": 0.665351223362273, "acc_stderr,none": 0.013261823629558366}} +{"created_at": "2025-05-03T12:12:27.468408", "global_step": 498000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4052901023890785, "acc_stderr,none": 0.014346869060229337, "acc_norm,none": 0.44368600682593856, "acc_norm_stderr,none": 0.014518421825670444}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7302188552188552, "acc_stderr,none": 0.009107527914671064, "acc_norm,none": 0.7171717171717171, "acc_norm_stderr,none": 0.009241472775328228}, "boolq": {"alias": "boolq", "acc,none": 0.7691131498470948, "acc_stderr,none": 0.007370335500493343}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2891072891072891, "acc_stderr,none": 0.0129793109169537}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4751045608444533, "acc_stderr,none": 0.004983592410934164, "acc_norm,none": 0.6428002389962159, "acc_norm_stderr,none": 0.004781950883460504}, "mmlu": {"acc,none": 0.34297108673978066, "acc_stderr,none": 0.003972238006364669, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3294367693942614, "acc_stderr,none": 0.006776167208504884, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4484848484848485, 
"acc_stderr,none": 0.038835659779569286}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46568627450980393, "acc_stderr,none": 0.03501038327635897}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5021097046413502, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4462809917355372, "acc_stderr,none": 0.0453793517794788}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.047128212574267705}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3558282208588957, "acc_stderr,none": 0.03761521380046734}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3092485549132948, "acc_stderr,none": 0.024883140570071755}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.014310999547961445}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3440514469453376, "acc_stderr,none": 0.026981478043648022}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3734567901234568, "acc_stderr,none": 0.026915003011380147}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30834419817470665, "acc_stderr,none": 0.011794833789715329}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3391812865497076, "acc_stderr,none": 0.03631053496488905}, "mmlu_other": {"acc,none": 0.3794657225619569, "acc_stderr,none": 0.008690981024699845, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.02971142188010792}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.036812296333943194}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.42152466367713004, "acc_stderr,none": 0.033141902221106564}, "mmlu_management": {"alias": " - management", "acc,none": 0.4077669902912621, "acc_stderr,none": 0.04865777570410768}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.452991452991453, "acc_stderr,none": 0.0326109987309862}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4086845466155811, "acc_stderr,none": 0.017579250148153397}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.027475969910660952}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2978723404255319, "acc_stderr,none": 0.027281608344469414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34558823529411764, "acc_stderr,none": 0.02888819310398864}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.365940851478713, "acc_stderr,none": 0.0086458358023838, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.04372748290278008}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 
0.3333333333333333, "acc_stderr,none": 0.033586181457325226}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.31025641025641026, "acc_stderr,none": 0.023454674889404288}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31932773109243695, "acc_stderr,none": 0.0302839955258844}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3981651376146789, "acc_stderr,none": 0.020987989422654264}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834278}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.36437908496732024, "acc_stderr,none": 0.0194695182215737}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.39090909090909093, "acc_stderr,none": 0.04673752333670238}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48756218905472637, "acc_stderr,none": 0.0353443984853958}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_stem": {"acc,none": 0.3047890897557881, "acc_stderr,none": 0.00813445737237051, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4, "acc_stderr,none": 0.042320736951515885}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.03981240543717861}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.040925639582376556}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33617021276595743, "acc_stderr,none": 0.030881618520676942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.03780019230438014}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4290322580645161, "acc_stderr,none": 0.028156036538233217}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.031785297106427496}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_high_school_mathematics": {"alias": " - 
high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073835}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291519}, "mmlu_pro": {"exact_match,custom-extract": 0.18367686170212766, "exact_match_stderr,custom-extract": 0.003498964616069176, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3179916317991632, "exact_match_stderr,custom-extract": 0.017403884250878413}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16856780735107732, "exact_match_stderr,custom-extract": 0.013336369763619692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1157243816254417, "exact_match_stderr,custom-extract": 0.009512068239624288}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.21707317073170732, "exact_match_stderr,custom-extract": 0.020384591313839226}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24170616113744076, "exact_match_stderr,custom-extract": 0.014745137840103598}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16615067079463364, "exact_match_stderr,custom-extract": 0.011963469940000524}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2237163814180929, "exact_match_stderr,custom-extract": 0.014579682804410434}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.01917797923756724}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.14168937329700274, "exact_match_stderr,custom-extract": 0.010514643243492047}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.153960029607698, "exact_match_stderr,custom-extract": 0.00982273775259319}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1764069264069264, "exact_match_stderr,custom-extract": 0.01254623184906948}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712511}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15935334872979215, "exact_match_stderr,custom-extract": 0.010158977410017719}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2669172932330827, "exact_match_stderr,custom-extract": 0.015668798035500312}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655486, "acc_norm,none": 0.394, "acc_norm_stderr,none": 0.021874299301689253}, "piqa": {"alias": "piqa", "acc,none": 0.7295973884657236, "acc_stderr,none": 0.010363167031620796, "acc_norm,none": 0.7236126224156693, "acc_norm_stderr,none": 0.010434162388275613}, "race": {"alias": "race", "acc,none": 0.38086124401913873, "acc_stderr,none": 0.015028897988042758}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45291709314227224, "acc_stderr,none": 0.01126379679411243}, "winogrande": {"alias": "winogrande", "acc,none": 0.6566692975532754, "acc_stderr,none": 0.013344823185358009}} +{"created_at": 
"2025-05-03T13:12:02.749816", "global_step": 500000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.40187713310580203, "acc_stderr,none": 0.014327268614578281, "acc_norm,none": 0.43856655290102387, "acc_norm_stderr,none": 0.014500682618212864}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7314814814814815, "acc_stderr,none": 0.009094042554994856, "acc_norm,none": 0.7192760942760943, "acc_norm_stderr,none": 0.009220526174711358}, "boolq": {"alias": "boolq", "acc,none": 0.7700305810397553, "acc_stderr,none": 0.007360063651505803}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2841932841932842, "acc_stderr,none": 0.012912932309514279}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4751045608444533, "acc_stderr,none": 0.0049835924109341645, "acc_norm,none": 0.6420035849432384, "acc_norm_stderr,none": 0.004784312972495384}, "mmlu": {"acc,none": 0.34261501210653755, "acc_stderr,none": 0.003972341794408205, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.33049946865037194, "acc_stderr,none": 0.0067835256855849485, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.03512207412302052}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4303030303030303, "acc_stderr,none": 0.03866225962879077}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46568627450980393, "acc_stderr,none": 0.03501038327635897}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5021097046413502, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.045291468044357915}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.047128212574267705}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.36809815950920244, "acc_stderr,none": 0.03789213935838396}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.02500931379006971}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767865}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3633440514469453, "acc_stderr,none": 0.02731684767419271}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.026869490744815244}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3089960886571056, "acc_stderr,none": 0.011801729777239254}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.036155076303109344}, "mmlu_other": {"acc,none": 0.3759253299002253, "acc_stderr,none": 0.008671719910651066, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145632}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3660377358490566, "acc_stderr,none": 0.02964781353936524}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.37572254335260113, "acc_stderr,none": 0.03692820767264867}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", 
"acc,none": 0.4304932735426009, "acc_stderr,none": 0.033231973029429394}, "mmlu_management": {"alias": " - management", "acc,none": 0.39805825242718446, "acc_stderr,none": 0.048467482539772386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44017094017094016, "acc_stderr,none": 0.032520741720630506}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.40357598978288634, "acc_stderr,none": 0.017544332237926417}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684934}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.33088235294117646, "acc_stderr,none": 0.02858270975389844}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683227}, "mmlu_social_sciences": {"acc,none": 0.3639909002274943, "acc_stderr,none": 0.008636500524938126, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.04372748290278008}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43523316062176165, "acc_stderr,none": 0.035780381650085846}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.023400928918310485}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3319327731092437, "acc_stderr,none": 0.030588697013783663}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3834862385321101, "acc_stderr,none": 0.020847156641915984}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834278}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.019506291693954843}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425464}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48756218905472637, "acc_stderr,none": 0.03534439848539579}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_stem": {"acc,none": 0.3070091975895972, "acc_stderr,none": 0.008151207994643158, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.03981240543717861}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, 
"acc_stderr,none": 0.0446196043338474}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3446808510638298, "acc_stderr,none": 0.03106898596312215}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.42258064516129035, "acc_stderr,none": 0.02810096472427264}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959912}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.03511807571804725}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422252}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285714}, "mmlu_pro": {"exact_match,custom-extract": 0.18367686170212766, "exact_match_stderr,custom-extract": 0.003498581572180752, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3138075313807531, "exact_match_stderr,custom-extract": 0.017341958540453725}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16603295310519645, "exact_match_stderr,custom-extract": 0.013255877519716387}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11130742049469965, "exact_match_stderr,custom-extract": 0.009352043831026096}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.22682926829268293, "exact_match_stderr,custom-extract": 0.020707401045044642}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24052132701421802, "exact_match_stderr,custom-extract": 0.014720440282752337}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16718266253869968, "exact_match_stderr,custom-extract": 0.011993137667893426}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22493887530562348, "exact_match_stderr,custom-extract": 0.014607947807460348}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.019177979237567245}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1335149863760218, "exact_match_stderr,custom-extract": 0.01025531945289569}, "mmlu_pro_math": {"alias": " - math", 
"exact_match,custom-extract": 0.15544041450777202, "exact_match_stderr,custom-extract": 0.00986121065535071}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17316017316017315, "exact_match_stderr,custom-extract": 0.012454716571952212}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15030060120240482, "exact_match_stderr,custom-extract": 0.01601394538357726}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16628175519630484, "exact_match_stderr,custom-extract": 0.01033462224083037}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2681704260651629, "exact_match_stderr,custom-extract": 0.01569210690548721}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 0.019732885585922098, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7306855277475517, "acc_stderr,none": 0.01035000407058876, "acc_norm,none": 0.7225244831338411, "acc_norm_stderr,none": 0.010446818281039945}, "race": {"alias": "race", "acc,none": 0.3837320574162679, "acc_stderr,none": 0.015050418634703647}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45291709314227224, "acc_stderr,none": 0.01126379679411243}, "winogrande": {"alias": "winogrande", "acc,none": 0.6614048934490924, "acc_stderr,none": 0.013300169865842416}}