qggpddlZddlZddlmZddlmZdZeddZdZ d Z d Z dS) N)HfApi) lru_cachect}|d}g}|D]}|jdr|jdddd}d|vr4|dd\}}||d|||t|S) Nzopen-llm-leaderboard)author-details/__)r list_datasetsidendswithsplitreplaceappendsorted)apidatasetsmodelsdataset model_partprovidermodels K/mnt/lustre/work/bethge/jstrueber72/projects/lm_sim_test/src/dataloading.pyget_leaderboard_modelsrs ''C  (> ??H F** :  z * * * ))#..r2:::rJJJz!!","2"24";";% 33E334444 j))) &>>r )maxsizectS)N)rrrget_leaderboard_models_cachedr!s ! # ##rc|gdSd|D}i}|D]4}tj|}d|D}t|||<5|r tj|}t |S)N)'bbh_boolean_expressionsbbh_causal_judgementbbh_date_understandingbbh_disambiguation_qabbh_formal_fallaciesbbh_geometric_shapesbbh_hyperbaton"bbh_logical_deduction_five_objects#bbh_logical_deduction_seven_objects#bbh_logical_deduction_three_objectsbbh_movie_recommendation bbh_navigatebbh_object_countingbbh_penguins_in_a_table#bbh_reasoning_about_colored_objectsbbh_ruin_names'bbh_salient_translation_error_detection bbh_snarksbbh_sports_understandingbbh_temporal_sequences*bbh_tracking_shuffled_objects_five_objects+bbh_tracking_shuffled_objects_seven_objects+bbh_tracking_shuffled_objects_three_objectsbbh_web_of_lies gpqa_diamond gpqa_extended gpqa_mainifevalmath_algebra_hardmath_counting_and_prob_hardmath_geometry_hardmath_intermediate_algebra_hardmath_num_theory_hardmath_prealgebra_hardmath_precalculus_hardmmlu_promusr_murder_mysteriesmusr_object_placementsmusr_team_allocationcBg|]}d|dddS)open-llm-leaderboard/rr r)r).0model_ids r z,get_leaderboard_datasets..'s5uuu_gZX5E5Ec45P5PZZZuuurcDg|]}|ddS)__leaderboard_r )r)rLnames rrNz,get_leaderboard_datasets...s*SSSd$455b9SSSr)rget_dataset_config_namesset intersectionvaluesr) model_idsleaderboard_model_idsmodel_datasetsrM config_names dataset_namescommon_datasetss rget_leaderboard_datasetsr\"sUUU UvuktuuuN)668BB SSlSSS #&}#5#5x  E*N,A,A,C,CD / " ""rc2g}d|dvr!|D]}||dnW|D]T}|ddkr|d$|ddkr|dFtd|S)N answer_indexranswerFalseTruer z Invalid label)keysr ValueError)doclabelsds r filter_labelsrg8s FQ&& - -A MM!N+ , , , , - 2 2A{g%% a    8&& a     111 Mrc |dd}tjd|zdz|dz|zd}|d}|}g}|d D]5}t jd |D}||6t|d }n*#t$r}t|g}g}Yd}~nd}~wwxYw||fS) Nrr rKrrPlatest)rQrdoc_idfiltered_respsc8g|]}t|dS)r)float)rLoptions rrNz!load_run_data..Ts$ E E Efvay!1!1 E E Errd) rr load_datasetsortto_dictnparrayrrg Exceptionprint) model_name dataset_namedata log_probsresplog_probrees r load_run_datar}Gs"''T22 $%rs!!!!!!* 1$$$###,   r