jbnayahu committed
Commit f77b30e · verified · 2 Parent(s): d91bf03 9ee85e2

Merge branch 'jbnayahu/bluebench' into 'ibm-research/bluebench'

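The two JSON files added below share the same layout: an "environment_info" block (command-line invocation, parsed arguments, installed package versions) followed by a "results" tree that holds one aggregate entry per subset plus an overall "subsets_mean" score. As a rough illustration only (not part of this commit), a result file of this shape can be summarized with a few lines of Python; the path used here is one of the files added in this commit, and the field names are taken from the diff below.

```python
import json

# Minimal sketch: load one of the added result files and print its
# per-subset aggregate scores. Path and field names are assumptions
# based on the files added in this commit.
path = "results/bluebench/2025-08-03T08-32-43_evaluation_results.json"

with open(path) as f:
    data = json.load(f)

results = data["results"]

# Overall benchmark score (reported as "subsets_mean" in these files).
print(f"overall {results['score_name']}: {results['score']:.3f} "
      f"({results['num_of_instances']} instances)")

# Each subset (bias, knowledge, legal, ...) is a dict with its own
# aggregate "score"; the scalar summary fields at this level are skipped.
for name, subset in results.items():
    if isinstance(subset, dict):
        print(f"{name:24s} {subset['score']:.3f}  n={subset['num_of_instances']}")
```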
results/bluebench/2025-08-03T08-32-43_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T12:32:38.916038Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/gpt-4.1-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/gpt-4.1-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.8888888888888888,
260
+ "accuracy_ci_low": 0.5310928992288233,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 0.8888888888888888,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 0.5310928992288233,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.8888888888888888,
270
+ "accuracy_ci_low": 0.5555555555555556,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 0.8888888888888888,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 0.5555555555555556,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9797979797979798,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9636363636363636,
296
+ "score": 0.9636363636363636,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.9636363636363636,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.8085106382978724,
307
+ "f1_Organization": 0.6857142857142857,
308
+ "f1_Location": 0.6956521739130435,
309
+ "f1_macro": 0.7299590326417338,
310
+ "recall_macro": 0.7832988267770876,
311
+ "precision_macro": 0.6967893217893218,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7239263803680981,
314
+ "recall_micro": 0.7866666666666666,
315
+ "precision_micro": 0.6704545454545454,
316
+ "score": 0.7239263803680981,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.6420029997196985,
319
+ "score_ci_high": 0.782066798498023,
320
+ "f1_micro_ci_low": 0.6420029997196985,
321
+ "f1_micro_ci_high": 0.782066798498023
322
+ },
323
+ "score": 0.7239263803680981,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.42857142857142855,
340
+ "accuracy_ci_low": 0.14285714285714285,
341
+ "accuracy_ci_high": 0.8571428571428571,
342
+ "score_name": "accuracy",
343
+ "score": 0.42857142857142855,
344
+ "score_ci_high": 0.8571428571428571,
345
+ "score_ci_low": 0.14285714285714285,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2857142857142857,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.7142857142857143,
352
+ "score_name": "accuracy",
353
+ "score": 0.2857142857142857,
354
+ "score_ci_high": 0.7142857142857143,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 1.0,
360
+ "accuracy_ci_low": 1.0,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 1.0,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 1.0,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.8571428571428571,
370
+ "accuracy_ci_low": 0.42857142857142855,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.8571428571428571,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.42857142857142855,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.42857142857142855,
380
+ "accuracy_ci_low": 0.14285714285714285,
381
+ "accuracy_ci_high": 0.8571428571428571,
382
+ "score_name": "accuracy",
383
+ "score": 0.42857142857142855,
384
+ "score_ci_high": 0.8571428571428571,
385
+ "score_ci_low": 0.14285714285714285,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
+ "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.5714285714285714,
420
+ "accuracy_ci_low": 0.14285714285714285,
421
+ "accuracy_ci_high": 0.8571428571428571,
422
+ "score_name": "accuracy",
423
+ "score": 0.5714285714285714,
424
+ "score_ci_high": 0.8571428571428571,
425
+ "score_ci_low": 0.14285714285714285,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.5714285714285714,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
+ "score_name": "accuracy",
433
+ "score": 0.5714285714285714,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.8571428571428571,
440
+ "accuracy_ci_low": 0.31927964061584246,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.8571428571428571,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.31927964061584246,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.14285714285714285,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.5714285714285714,
452
+ "score_name": "accuracy",
453
+ "score": 0.14285714285714285,
454
+ "score_ci_high": 0.5714285714285714,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.5816326530612245,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.503030303030303,
475
+ "f1_suggestive": 0.18181818181818182,
476
+ "f1_generic": 0.8,
477
+ "f1_fanciful": 0.4,
478
+ "f1_descriptive": 0.3333333333333333,
479
+ "f1_arbitrary": 0.8,
480
+ "f1_macro_ci_low": 0.3144226612853847,
481
+ "f1_macro_ci_high": 0.76,
482
+ "score_name": "f1_micro",
483
+ "score": 0.4375,
484
+ "score_ci_high": 0.6666666666666666,
485
+ "score_ci_low": 0.21080178633741004,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.35,
488
+ "accuracy_ci_low": 0.15,
489
+ "accuracy_ci_high": 0.6,
490
+ "f1_micro": 0.4375,
491
+ "f1_micro_ci_low": 0.21080178633741004,
492
+ "f1_micro_ci_high": 0.6666666666666666
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.6360153256704981,
496
+ "f1_no": 0.8275862068965517,
497
+ "f1_yes": 0.4444444444444444,
498
+ "f1_macro_ci_low": 0.4117647058823529,
499
+ "f1_macro_ci_high": 0.8932752204410275,
500
+ "score_name": "f1_micro",
501
+ "score": 0.7368421052631579,
502
+ "score_ci_high": 0.8947368421052632,
503
+ "score_ci_low": 0.5263157894736842,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.7,
506
+ "accuracy_ci_low": 0.5,
507
+ "accuracy_ci_high": 0.85,
508
+ "f1_micro": 0.7368421052631579,
509
+ "f1_micro_ci_low": 0.5263157894736842,
510
+ "f1_micro_ci_high": 0.8947368421052632
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.24591836734693878,
514
+ "f1_conclusion": 0.25,
515
+ "f1_issue": 0.4,
516
+ "f1_decree": 0.0,
517
+ "f1_rule": 0.0,
518
+ "f1_analysis": 0.0,
519
+ "f1_facts": 0.5714285714285714,
520
+ "f1_procedural history": 0.5,
521
+ "f1_macro_ci_low": 0.06687444590451443,
522
+ "f1_macro_ci_high": 0.4222222222222222,
523
+ "score_name": "f1_micro",
524
+ "score": 0.3157894736842105,
525
+ "score_ci_high": 0.5405405405405406,
526
+ "score_ci_low": 0.11428571428571428,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.3,
529
+ "accuracy_ci_low": 0.15,
530
+ "accuracy_ci_high": 0.55,
531
+ "f1_micro": 0.3157894736842105,
532
+ "f1_micro_ci_low": 0.11428571428571428,
533
+ "f1_micro_ci_high": 0.5405405405405406
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.7,
537
+ "f1_yes": 0.7,
538
+ "f1_no": 0.7,
539
+ "f1_macro_ci_low": 0.4949494949494949,
540
+ "f1_macro_ci_high": 0.898989898989899,
541
+ "score_name": "f1_micro",
542
+ "score": 0.7,
543
+ "score_ci_high": 0.859273262592211,
544
+ "score_ci_low": 0.5,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.7,
547
+ "accuracy_ci_low": 0.5,
548
+ "accuracy_ci_high": 0.859273262592211,
549
+ "f1_micro": 0.7,
550
+ "f1_micro_ci_low": 0.5,
551
+ "f1_micro_ci_high": 0.859273262592211
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8375,
555
+ "f1_yes": 0.875,
556
+ "f1_no": 0.8,
557
+ "f1_macro_ci_low": 0.6112456731982998,
558
+ "f1_macro_ci_high": 0.9449275362318841,
559
+ "score_name": "f1_micro",
560
+ "score": 0.8333333333333334,
561
+ "score_ci_high": 0.9473684210526315,
562
+ "score_ci_low": 0.6076349233925447,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.75,
565
+ "accuracy_ci_low": 0.50468235519016,
566
+ "accuracy_ci_high": 0.9,
567
+ "f1_micro": 0.8333333333333334,
568
+ "f1_micro_ci_low": 0.6076349233925447,
569
+ "f1_micro_ci_high": 0.9473684210526315
570
+ },
571
+ "score": 0.6046929824561403,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6248042235542235,
578
+ "f1_cars": 0.7272727272727273,
579
+ "f1_windows x": 0.5714285714285714,
580
+ "f1_computer graphics": 0.625,
581
+ "f1_atheism": 0.5,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 1.0,
584
+ "f1_christianity": 0.8571428571428571,
585
+ "f1_microsoft windows": 0.8,
586
+ "f1_middle east": 0.7272727272727273,
587
+ "f1_motorcycles": 0.6,
588
+ "f1_pc hardware": 0.6666666666666666,
589
+ "f1_mac hardware": 0.9090909090909091,
590
+ "f1_electronics": 0.5,
591
+ "f1_for sale": 0.4,
592
+ "f1_guns": 0.2857142857142857,
593
+ "f1_space": 0.8888888888888888,
594
+ "f1_cryptography": 0.3333333333333333,
595
+ "f1_baseball": 0.6,
596
+ "f1_hockey": 0.8888888888888888,
597
+ "f1_politics": 0.6153846153846154,
598
+ "f1_macro_ci_low": 0.5435598724209302,
599
+ "f1_macro_ci_high": 0.727169076447674,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6629213483146067,
602
+ "score_ci_high": 0.7431693989071039,
603
+ "score_ci_low": 0.561198821408691,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.59,
606
+ "accuracy_ci_low": 0.49,
607
+ "accuracy_ci_high": 0.68,
608
+ "f1_micro": 0.6629213483146067,
609
+ "f1_micro_ci_low": 0.561198821408691,
610
+ "f1_micro_ci_high": 0.7431693989071039
611
+ },
612
+ "score": 0.6629213483146067,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7315018315018316,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8923076923076924,
620
+ "f1_debt collection": 0.6666666666666666,
621
+ "f1_payday loan or title loan or personal loan": 0.0,
622
+ "f1_student loan": 0.7692307692307693,
623
+ "f1_credit card or prepaid card": 0.8571428571428571,
624
+ "f1_checking or savings account": 1.0,
625
+ "f1_mortgage": 0.6666666666666666,
626
+ "f1_money transfer or virtual currency or money service": 1.0,
627
+ "f1_macro_ci_low": 0.5341663901608891,
628
+ "f1_macro_ci_high": 0.8376311452519075,
629
+ "score_name": "f1_micro",
630
+ "score": 0.8586387434554974,
631
+ "score_ci_high": 0.916233746693676,
632
+ "score_ci_low": 0.783068783068783,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.82,
635
+ "accuracy_ci_low": 0.74,
636
+ "accuracy_ci_high": 0.89,
637
+ "f1_micro": 0.8586387434554974,
638
+ "f1_micro_ci_low": 0.783068783068783,
639
+ "f1_micro_ci_high": 0.916233746693676
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.799064551009631,
643
+ "f1_mortgages and loans": 0.782608695652174,
644
+ "f1_credit card": 0.8571428571428571,
645
+ "f1_debt collection": 0.7368421052631579,
646
+ "f1_credit reporting": 0.6956521739130435,
647
+ "f1_retail banking": 0.9230769230769231,
648
+ "f1_macro_ci_low": 0.6817947520200675,
649
+ "f1_macro_ci_high": 0.8954471115753557,
650
+ "score_name": "f1_micro",
651
+ "score": 0.7878787878787878,
652
+ "score_ci_high": 0.88,
653
+ "score_ci_low": 0.66,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.78,
656
+ "accuracy_ci_low": 0.66,
657
+ "accuracy_ci_high": 0.88,
658
+ "f1_micro": 0.7878787878787878,
659
+ "f1_micro_ci_low": 0.66,
660
+ "f1_micro_ci_high": 0.88
661
+ },
662
+ "score": 0.8232587656671426,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.32,
670
+ "program_accuracy": 0.33,
671
+ "score": 0.33,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.23,
674
+ "execution_accuracy_ci_high": 0.42,
675
+ "program_accuracy_ci_low": 0.25,
676
+ "program_accuracy_ci_high": 0.43,
677
+ "score_ci_low": 0.25,
678
+ "score_ci_high": 0.43
679
+ },
680
+ "score": 0.33,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.4343698865384216,
687
+ "recall": 0.6709870742364467,
688
+ "f1": 0.4846153577687695,
689
+ "precision_ci_low": 0.3998020013123197,
690
+ "precision_ci_high": 0.4698068976247165,
691
+ "recall_ci_low": 0.6303563546991966,
692
+ "recall_ci_high": 0.7089929114321589,
693
+ "f1_ci_low": 0.4526567482268106,
694
+ "f1_ci_high": 0.5146388287292981,
695
+ "score_name": "f1",
696
+ "score": 0.4846153577687695,
697
+ "score_ci_high": 0.5146388287292981,
698
+ "score_ci_low": 0.4526567482268106,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6816095349192619,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.729021328985691,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6500837489962578,
703
+ "faithfullness_f1_token_overlap": 0.43231085895541577,
704
+ "faithfullness_recall_token_overlap": 0.3622151601642728,
705
+ "faithfullness_precision_token_overlap": 0.6636443305066012,
706
+ "correctness_f1_token_overlap": 0.4846153577687695,
707
+ "correctness_recall_token_overlap": 0.6709870742364467,
708
+ "correctness_precision_token_overlap": 0.4343698865384216
709
+ },
710
+ "score": 0.4846153577687695,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.61,
717
+ "accuracy_ci_low": 0.51,
718
+ "accuracy_ci_high": 0.7,
719
+ "score_name": "accuracy",
720
+ "score": 0.61,
721
+ "score_ci_high": 0.7,
722
+ "score_ci_low": 0.51,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.94,
727
+ "accuracy_ci_low": 0.87,
728
+ "accuracy_ci_high": 0.98,
729
+ "score_name": "accuracy",
730
+ "score": 0.94,
731
+ "score_ci_high": 0.98,
732
+ "score_ci_low": 0.87,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.7749999999999999,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 0.0,
742
+ "severity_high": 0.0,
743
+ "severity_medium": 0.0,
744
+ "severity_low": 100.0,
745
+ "category_harmful_info": 0.875045096996935,
746
+ "category_deception": 0.9087016443260992,
747
+ "category_explicit_content": 0.9104775799946352,
748
+ "category_violence": 0.866230480028649,
749
+ "category_discrimination": 0.8078865560274275,
750
+ "category_substance_abuse": 0.8765388902690676,
751
+ "category_pii": 0.8728977072018164,
752
+ "safety": 0.8748689444958777,
753
+ "safety_ci_low": 0.859557728046316,
754
+ "safety_ci_high": 0.8896609433147404,
755
+ "score_name": "safety",
756
+ "score": 0.8748689444958777,
757
+ "score_ci_high": 0.8896609433147404,
758
+ "score_ci_low": 0.859557728046316,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8748689444958777,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rougeLsum": 0.3449679916101768,
769
+ "rouge2": 0.17691065019354774,
770
+ "rougeL": 0.2755019690756654,
771
+ "score": 0.2755019690756654,
772
+ "score_name": "rougeL",
773
+ "rouge1": 0.40866416882835144,
774
+ "rougeLsum_ci_low": 0.32452422336075787,
775
+ "rougeLsum_ci_high": 0.362432692973816,
776
+ "rouge2_ci_low": 0.16371569781722647,
777
+ "rouge2_ci_high": 0.19131565638937167,
778
+ "rougeL_ci_low": 0.260558384013039,
779
+ "rougeL_ci_high": 0.2917913421726095,
780
+ "score_ci_low": 0.260558384013039,
781
+ "score_ci_high": 0.2917913421726095,
782
+ "rouge1_ci_low": 0.3860176168688086,
783
+ "rouge1_ci_high": 0.42632106299588085
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rougeLsum": 0.09942588005347286,
788
+ "rouge2": 0.015180143919691523,
789
+ "rougeL": 0.08942312196371219,
790
+ "score": 0.08942312196371219,
791
+ "score_name": "rougeL",
792
+ "rouge1": 0.1194115419368174,
793
+ "rougeLsum_ci_low": 0.0862462499816593,
794
+ "rougeLsum_ci_high": 0.11236018272084582,
795
+ "rouge2_ci_low": 0.010616515576465502,
796
+ "rouge2_ci_high": 0.02165065601326996,
797
+ "rougeL_ci_low": 0.07820323036355636,
798
+ "rougeL_ci_high": 0.10112740368476562,
799
+ "score_ci_low": 0.07820323036355636,
800
+ "score_ci_high": 0.10112740368476562,
801
+ "rouge1_ci_low": 0.10290535826717513,
802
+ "rouge1_ci_high": 0.13692429898442587
803
+ },
804
+ "score": 0.18246254551968877,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 152,
813
+ 108,
814
+ 78,
815
+ 57
816
+ ],
817
+ "totals": [
818
+ 213,
819
+ 207,
820
+ 201,
821
+ 195
822
+ ],
823
+ "precisions": [
824
+ 0.7136150234741784,
825
+ 0.5217391304347826,
826
+ 0.3880597014925373,
827
+ 0.2923076923076923
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 213,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.4533295675744374,
833
+ "score": 0.4533295675744374,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.2970856371571267,
836
+ "score_ci_high": 0.6155700608318663,
837
+ "sacrebleu_ci_low": 0.2970856371571267,
838
+ "sacrebleu_ci_high": 0.6155700608318663
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 143,
844
+ 89,
845
+ 60,
846
+ 43
847
+ ],
848
+ "totals": [
849
+ 221,
850
+ 215,
851
+ 209,
852
+ 203
853
+ ],
854
+ "precisions": [
855
+ 0.6470588235294117,
856
+ 0.41395348837209306,
857
+ 0.28708133971291866,
858
+ 0.21182266009852216
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 221,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.35724665668654765,
864
+ "score": 0.35724665668654765,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.2566990236689781,
867
+ "score_ci_high": 0.5362601500874902,
868
+ "sacrebleu_ci_low": 0.2566990236689781,
869
+ "sacrebleu_ci_high": 0.5362601500874902
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 116,
875
+ 72,
876
+ 49,
877
+ 30
878
+ ],
879
+ "totals": [
880
+ 204,
881
+ 198,
882
+ 192,
883
+ 186
884
+ ],
885
+ "precisions": [
886
+ 0.5686274509803921,
887
+ 0.36363636363636365,
888
+ 0.2552083333333333,
889
+ 0.16129032258064516
890
+ ],
891
+ "bp": 0.9757881223212935,
892
+ "sys_len": 204,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.2963842353604502,
895
+ "score": 0.2963842353604502,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.21279117709012274,
898
+ "score_ci_high": 0.3978311539100688,
899
+ "sacrebleu_ci_low": 0.21279117709012274,
900
+ "sacrebleu_ci_high": 0.3978311539100688
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 148,
906
+ 95,
907
+ 62,
908
+ 45
909
+ ],
910
+ "totals": [
911
+ 210,
912
+ 204,
913
+ 198,
914
+ 192
915
+ ],
916
+ "precisions": [
917
+ 0.7047619047619048,
918
+ 0.46568627450980393,
919
+ 0.3131313131313131,
920
+ 0.234375
921
+ ],
922
+ "bp": 0.9718328750329812,
923
+ "sys_len": 210,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.382855593891157,
926
+ "score": 0.382855593891157,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.28077912493137325,
929
+ "score_ci_high": 0.5224412329440934,
930
+ "sacrebleu_ci_low": 0.28077912493137325,
931
+ "sacrebleu_ci_high": 0.5224412329440934
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 184,
937
+ 135,
938
+ 99,
939
+ 72
940
+ ],
941
+ "totals": [
942
+ 240,
943
+ 234,
944
+ 228,
945
+ 222
946
+ ],
947
+ "precisions": [
948
+ 0.7666666666666667,
949
+ 0.576923076923077,
950
+ 0.43421052631578944,
951
+ 0.32432432432432434
952
+ ],
953
+ "bp": 1.0,
954
+ "sys_len": 240,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.4995754525815319,
957
+ "score": 0.4995754525815319,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.4515767855995504,
960
+ "score_ci_high": 0.5566329044186812,
961
+ "sacrebleu_ci_low": 0.4515767855995504,
962
+ "sacrebleu_ci_high": 0.5566329044186812
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 163,
968
+ 87,
969
+ 56,
970
+ 36
971
+ ],
972
+ "totals": [
973
+ 297,
974
+ 291,
975
+ 285,
976
+ 279
977
+ ],
978
+ "precisions": [
979
+ 0.5488215488215488,
980
+ 0.29896907216494845,
981
+ 0.19649122807017544,
982
+ 0.12903225806451613
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 297,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.25396549824957954,
988
+ "score": 0.25396549824957954,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.18656507398105443,
991
+ "score_ci_high": 0.3676949692724427,
992
+ "sacrebleu_ci_low": 0.18656507398105443,
993
+ "sacrebleu_ci_high": 0.3676949692724427
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 181,
999
+ 140,
1000
+ 113,
1001
+ 92
1002
+ ],
1003
+ "totals": [
1004
+ 226,
1005
+ 220,
1006
+ 214,
1007
+ 208
1008
+ ],
1009
+ "precisions": [
1010
+ 0.8008849557522124,
1011
+ 0.6363636363636364,
1012
+ 0.5280373831775701,
1013
+ 0.44230769230769235
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 226,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.587375953828071,
1019
+ "score": 0.587375953828071,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.5264308904974885,
1022
+ "score_ci_high": 0.6527446503464284,
1023
+ "sacrebleu_ci_low": 0.5264308904974885,
1024
+ "sacrebleu_ci_high": 0.6527446503464284
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 159,
1030
+ 110,
1031
+ 79,
1032
+ 59
1033
+ ],
1034
+ "totals": [
1035
+ 230,
1036
+ 224,
1037
+ 218,
1038
+ 212
1039
+ ],
1040
+ "precisions": [
1041
+ 0.6913043478260871,
1042
+ 0.49107142857142855,
1043
+ 0.3623853211009175,
1044
+ 0.2783018867924528
1045
+ ],
1046
+ "bp": 1.0,
1047
+ "sys_len": 230,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.4301551985882873,
1050
+ "score": 0.4301551985882873,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.35322253633753253,
1053
+ "score_ci_high": 0.542330368706405,
1054
+ "sacrebleu_ci_low": 0.35322253633753253,
1055
+ "sacrebleu_ci_high": 0.542330368706405
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 168,
1061
+ 106,
1062
+ 70,
1063
+ 45
1064
+ ],
1065
+ "totals": [
1066
+ 237,
1067
+ 231,
1068
+ 225,
1069
+ 219
1070
+ ],
1071
+ "precisions": [
1072
+ 0.7088607594936708,
1073
+ 0.4588744588744589,
1074
+ 0.3111111111111111,
1075
+ 0.2054794520547945
1076
+ ],
1077
+ "bp": 0.9750013184817767,
1078
+ "sys_len": 237,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.37024558499956833,
1081
+ "score": 0.37024558499956833,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.2914675063581613,
1084
+ "score_ci_high": 0.4722241333633745,
1085
+ "sacrebleu_ci_low": 0.2914675063581613,
1086
+ "sacrebleu_ci_high": 0.4722241333633745
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 158,
1092
+ 112,
1093
+ 82,
1094
+ 63
1095
+ ],
1096
+ "totals": [
1097
+ 218,
1098
+ 212,
1099
+ 206,
1100
+ 200
1101
+ ],
1102
+ "precisions": [
1103
+ 0.724770642201835,
1104
+ 0.5283018867924528,
1105
+ 0.3980582524271845,
1106
+ 0.315
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 218,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.4680960595371609,
1112
+ "score": 0.4680960595371609,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.26555389781536615,
1115
+ "score_ci_high": 0.6882400443755067,
1116
+ "sacrebleu_ci_low": 0.26555389781536615,
1117
+ "sacrebleu_ci_high": 0.6882400443755067
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 147,
1123
+ 92,
1124
+ 63,
1125
+ 43
1126
+ ],
1127
+ "totals": [
1128
+ 219,
1129
+ 213,
1130
+ 207,
1131
+ 201
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6712328767123288,
1135
+ 0.431924882629108,
1136
+ 0.30434782608695654,
1137
+ 0.21393034825870647
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 219,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.3706645149919594,
1143
+ "score": 0.3706645149919594,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.18561228093884444,
1146
+ "score_ci_high": 0.45332840825867166,
1147
+ "sacrebleu_ci_low": 0.18561228093884444,
1148
+ "sacrebleu_ci_high": 0.45332840825867166
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 136,
1154
+ 82,
1155
+ 53,
1156
+ 38
1157
+ ],
1158
+ "totals": [
1159
+ 208,
1160
+ 202,
1161
+ 196,
1162
+ 190
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6538461538461539,
1166
+ 0.4059405940594059,
1167
+ 0.27040816326530615,
1168
+ 0.2
1169
+ ],
1170
+ "bp": 1.0,
1171
+ "sys_len": 208,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.346136152997744,
1174
+ "score": 0.346136152997744,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.197154502834615,
1177
+ "score_ci_high": 0.4450318608831673,
1178
+ "sacrebleu_ci_low": 0.197154502834615,
1179
+ "sacrebleu_ci_high": 0.4450318608831673
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 161,
1185
+ 117,
1186
+ 85,
1187
+ 65
1188
+ ],
1189
+ "totals": [
1190
+ 213,
1191
+ 207,
1192
+ 201,
1193
+ 195
1194
+ ],
1195
+ "precisions": [
1196
+ 0.755868544600939,
1197
+ 0.5652173913043478,
1198
+ 0.42288557213930345,
1199
+ 0.33333333333333337
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 213,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.4953827168207276,
1205
+ "score": 0.4953827168207276,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.28814389311162725,
1208
+ "score_ci_high": 0.617325493054366,
1209
+ "sacrebleu_ci_low": 0.28814389311162725,
1210
+ "sacrebleu_ci_high": 0.617325493054366
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 160,
1216
+ 120,
1217
+ 92,
1218
+ 76
1219
+ ],
1220
+ "totals": [
1221
+ 222,
1222
+ 216,
1223
+ 210,
1224
+ 204
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7207207207207208,
1228
+ 0.5555555555555556,
1229
+ 0.4380952380952381,
1230
+ 0.37254901960784315
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 222,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.505605296924794,
1236
+ "score": 0.505605296924794,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.32967576416379546,
1239
+ "score_ci_high": 0.7153750601746934,
1240
+ "sacrebleu_ci_low": 0.32967576416379546,
1241
+ "sacrebleu_ci_high": 0.7153750601746934
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 152,
1247
+ 95,
1248
+ 62,
1249
+ 44
1250
+ ],
1251
+ "totals": [
1252
+ 226,
1253
+ 220,
1254
+ 214,
1255
+ 208
1256
+ ],
1257
+ "precisions": [
1258
+ 0.672566371681416,
1259
+ 0.4318181818181818,
1260
+ 0.28971962616822433,
1261
+ 0.21153846153846154
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 226,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.3652589217481651,
1267
+ "score": 0.3652589217481651,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.27931028013393655,
1270
+ "score_ci_high": 0.40059757012052705,
1271
+ "sacrebleu_ci_low": 0.27931028013393655,
1272
+ "sacrebleu_ci_high": 0.40059757012052705
1273
+ },
1274
+ "score": 0.4121518269853454,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.646074242159326,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
results/bluebench/2025-08-03T08-51-48_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T12:51:43.070081Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/gpt-4.1-mini-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/gpt-4.1-mini-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.7777777777777778,
260
+ "accuracy_ci_low": 0.4444444444444444,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 0.7777777777777778,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 0.4444444444444444,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.6666666666666666,
270
+ "accuracy_ci_low": 0.3333333333333333,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 0.6666666666666666,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 0.3333333333333333,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9494949494949495,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9754901960784313,
296
+ "score": 0.9754901960784313,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.9754901960784313,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.851063829787234,
307
+ "f1_Organization": 0.6301369863013698,
308
+ "f1_Location": 0.7727272727272727,
309
+ "f1_macro": 0.7513093629386255,
310
+ "recall_macro": 0.7997757073844031,
311
+ "precision_macro": 0.7314814814814815,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7317073170731706,
314
+ "recall_micro": 0.8,
315
+ "precision_micro": 0.6741573033707865,
316
+ "score": 0.7317073170731706,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.6582752488607289,
319
+ "score_ci_high": 0.7931160320928256,
320
+ "f1_micro_ci_low": 0.6582752488607289,
321
+ "f1_micro_ci_high": 0.7931160320928256
322
+ },
323
+ "score": 0.7317073170731706,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.2857142857142857,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.7142857142857143,
342
+ "score_name": "accuracy",
343
+ "score": 0.2857142857142857,
344
+ "score_ci_high": 0.7142857142857143,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.0,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.0,
352
+ "score_name": "accuracy",
353
+ "score": 0.0,
354
+ "score_ci_high": 0.0,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.8571428571428571,
360
+ "accuracy_ci_low": 0.42857142857142855,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.8571428571428571,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.42857142857142855,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.0,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.0,
382
+ "score_name": "accuracy",
383
+ "score": 0.0,
384
+ "score_ci_high": 0.0,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.2857142857142857,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.7142857142857143,
392
+ "score_name": "accuracy",
393
+ "score": 0.2857142857142857,
394
+ "score_ci_high": 0.7142857142857143,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
+ "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.5714285714285714,
410
+ "accuracy_ci_low": 0.14285714285714285,
411
+ "accuracy_ci_high": 0.8571428571428571,
412
+ "score_name": "accuracy",
413
+ "score": 0.5714285714285714,
414
+ "score_ci_high": 0.8571428571428571,
415
+ "score_ci_low": 0.14285714285714285,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.42857142857142855,
420
+ "accuracy_ci_low": 0.14285714285714285,
421
+ "accuracy_ci_high": 0.8571428571428571,
422
+ "score_name": "accuracy",
423
+ "score": 0.42857142857142855,
424
+ "score_ci_high": 0.8571428571428571,
425
+ "score_ci_low": 0.14285714285714285,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.2857142857142857,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.7142857142857143,
432
+ "score_name": "accuracy",
433
+ "score": 0.2857142857142857,
434
+ "score_ci_high": 0.7142857142857143,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.7142857142857143,
440
+ "accuracy_ci_low": 0.2857142857142857,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.7142857142857143,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2857142857142857,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.0,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.0,
452
+ "score_name": "accuracy",
453
+ "score": 0.0,
454
+ "score_ci_high": 0.0,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.8571428571428571,
460
+ "accuracy_ci_low": 0.42857142857142855,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.8571428571428571,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.42857142857142855,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.42857142857142855,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.24603174603174605,
475
+ "f1_suggestive": 0.0,
476
+ "f1_generic": 0.5,
477
+ "f1_fanciful": 0.0,
478
+ "f1_arbitrary": 0.2857142857142857,
479
+ "f1_descriptive": 0.4444444444444444,
480
+ "f1_macro_ci_low": 0.08888888888888888,
481
+ "f1_macro_ci_high": 0.4735636958062352,
482
+ "score_name": "f1_micro",
483
+ "score": 0.25806451612903225,
484
+ "score_ci_high": 0.5,
485
+ "score_ci_low": 0.06666666666666667,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.2,
488
+ "accuracy_ci_low": 0.05,
489
+ "accuracy_ci_high": 0.4,
490
+ "f1_micro": 0.25806451612903225,
491
+ "f1_micro_ci_low": 0.06666666666666667,
492
+ "f1_micro_ci_high": 0.5
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5670995670995671,
496
+ "f1_no": 0.8484848484848485,
497
+ "f1_yes": 0.2857142857142857,
498
+ "f1_macro_ci_low": 0.40999057444565223,
499
+ "f1_macro_ci_high": 1.0,
500
+ "score_name": "f1_micro",
501
+ "score": 0.75,
502
+ "score_ci_high": 0.9,
503
+ "score_ci_low": 0.55,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.75,
506
+ "accuracy_ci_low": 0.55,
507
+ "accuracy_ci_high": 0.9,
508
+ "f1_micro": 0.75,
509
+ "f1_micro_ci_low": 0.55,
510
+ "f1_micro_ci_high": 0.9
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.23684807256235826,
514
+ "f1_conclusion": 0.2857142857142857,
515
+ "f1_issue": 0.2222222222222222,
516
+ "f1_decree": 0.0,
517
+ "f1_rule": 0.0,
518
+ "f1_analysis": 0.4,
519
+ "f1_facts": 0.75,
520
+ "f1_procedural history": 0.0,
521
+ "f1_macro_ci_low": 0.1017216313301622,
522
+ "f1_macro_ci_high": 0.4330669755483427,
523
+ "score_name": "f1_micro",
524
+ "score": 0.3076923076923077,
525
+ "score_ci_high": 0.5142857142857142,
526
+ "score_ci_low": 0.10749989543242633,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.3,
529
+ "accuracy_ci_low": 0.15,
530
+ "accuracy_ci_high": 0.5,
531
+ "f1_micro": 0.3076923076923077,
532
+ "f1_micro_ci_low": 0.10749989543242633,
533
+ "f1_micro_ci_high": 0.5142857142857142
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.6491228070175439,
537
+ "f1_yes": 0.6666666666666666,
538
+ "f1_no": 0.631578947368421,
539
+ "f1_macro_ci_low": 0.4373401534526854,
540
+ "f1_macro_ci_high": 0.849624060150376,
541
+ "score_name": "f1_micro",
542
+ "score": 0.65,
543
+ "score_ci_high": 0.85,
544
+ "score_ci_low": 0.4,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.65,
547
+ "accuracy_ci_low": 0.4,
548
+ "accuracy_ci_high": 0.85,
549
+ "f1_micro": 0.65,
550
+ "f1_micro_ci_low": 0.4,
551
+ "f1_micro_ci_high": 0.85
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 1.0,
555
+ "f1_yes": 1.0,
556
+ "f1_no": 1.0,
557
+ "f1_macro_ci_low": 1.0,
558
+ "f1_macro_ci_high": 1.0,
559
+ "score_name": "f1_micro",
560
+ "score": 1.0,
561
+ "score_ci_high": 1.0,
562
+ "score_ci_low": 1.0,
563
+ "num_of_instances": 20,
564
+ "accuracy": 1.0,
565
+ "accuracy_ci_low": 1.0,
566
+ "accuracy_ci_high": 1.0,
567
+ "f1_micro": 1.0,
568
+ "f1_micro_ci_low": 1.0,
569
+ "f1_micro_ci_high": 1.0
570
+ },
571
+ "score": 0.593151364764268,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6505870737449685,
578
+ "f1_cars": 0.7272727272727273,
579
+ "f1_motorcycles": 0.5454545454545454,
580
+ "f1_windows x": 0.3333333333333333,
581
+ "f1_computer graphics": 0.5263157894736842,
582
+ "f1_atheism": 0.3333333333333333,
583
+ "f1_christianity": 0.8888888888888888,
584
+ "f1_religion": 0.2857142857142857,
585
+ "f1_medicine": 0.8888888888888888,
586
+ "f1_microsoft windows": 0.8333333333333334,
587
+ "f1_middle east": 0.8333333333333334,
588
+ "f1_pc hardware": 0.5714285714285714,
589
+ "f1_mac hardware": 0.6666666666666666,
590
+ "f1_for sale": 1.0,
591
+ "f1_guns": 0.6,
592
+ "f1_space": 0.8888888888888888,
593
+ "f1_cryptography": 0.4,
594
+ "f1_electronics": 0.4,
595
+ "f1_politics": 0.4,
596
+ "f1_baseball": 1.0,
597
+ "f1_hockey": 0.8888888888888888,
598
+ "f1_macro_ci_low": 0.5612581467006025,
599
+ "f1_macro_ci_high": 0.7485879865989543,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6666666666666666,
602
+ "score_ci_high": 0.7487179487179487,
603
+ "score_ci_low": 0.5625834478663989,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.66,
606
+ "accuracy_ci_low": 0.56,
607
+ "accuracy_ci_high": 0.75,
608
+ "f1_micro": 0.6666666666666666,
609
+ "f1_micro_ci_low": 0.5625834478663989,
610
+ "f1_micro_ci_high": 0.7487179487179487
611
+ },
612
+ "score": 0.6666666666666666,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7564712961168185,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9402985074626866,
620
+ "f1_debt collection": 0.7619047619047619,
621
+ "f1_payday loan or title loan or personal loan": 0.0,
622
+ "f1_student loan": 0.8333333333333334,
623
+ "f1_credit card or prepaid card": 0.75,
624
+ "f1_checking or savings account": 0.9090909090909091,
625
+ "f1_mortgage": 0.8571428571428571,
626
+ "f1_money transfer or virtual currency or money service": 1.0,
627
+ "f1_macro_ci_low": 0.580600940304915,
628
+ "f1_macro_ci_high": 0.8740772350317311,
629
+ "score_name": "f1_micro",
630
+ "score": 0.898989898989899,
631
+ "score_ci_high": 0.9441624365482234,
632
+ "score_ci_low": 0.8241206030150754,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.89,
635
+ "accuracy_ci_low": 0.82,
636
+ "accuracy_ci_high": 0.94,
637
+ "f1_micro": 0.898989898989899,
638
+ "f1_micro_ci_low": 0.8241206030150754,
639
+ "f1_micro_ci_high": 0.9441624365482234
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.7951539688381793,
643
+ "f1_mortgages and loans": 0.8181818181818182,
644
+ "f1_credit card": 0.8181818181818182,
645
+ "f1_debt collection": 0.7368421052631579,
646
+ "f1_retail banking": 0.7692307692307693,
647
+ "f1_credit reporting": 0.8333333333333334,
648
+ "f1_macro_ci_low": 0.6650710249693069,
649
+ "f1_macro_ci_high": 0.9045433519350622,
650
+ "score_name": "f1_micro",
651
+ "score": 0.8,
652
+ "score_ci_high": 0.9,
653
+ "score_ci_low": 0.68,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.8,
656
+ "accuracy_ci_low": 0.68,
657
+ "accuracy_ci_high": 0.9,
658
+ "f1_micro": 0.8,
659
+ "f1_micro_ci_low": 0.68,
660
+ "f1_micro_ci_high": 0.9
661
+ },
662
+ "score": 0.8494949494949495,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.27,
670
+ "program_accuracy": 0.28,
671
+ "score": 0.28,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.19,
674
+ "execution_accuracy_ci_high": 0.36,
675
+ "program_accuracy_ci_low": 0.2,
676
+ "program_accuracy_ci_high": 0.37,
677
+ "score_ci_low": 0.2,
678
+ "score_ci_high": 0.37
679
+ },
680
+ "score": 0.28,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.5352277671570387,
687
+ "recall": 0.6334183868239969,
688
+ "f1": 0.5457958674104451,
689
+ "precision_ci_low": 0.5011010888509172,
690
+ "precision_ci_high": 0.571254786062904,
691
+ "recall_ci_low": 0.592341390035469,
692
+ "recall_ci_high": 0.6699875895130868,
693
+ "f1_ci_low": 0.5178144079597656,
694
+ "f1_ci_high": 0.576722210592798,
695
+ "score_name": "f1",
696
+ "score": 0.5457958674104451,
697
+ "score_ci_high": 0.576722210592798,
698
+ "score_ci_low": 0.5178144079597656,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.7262957572937012,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7427353870868683,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.7176006579399109,
703
+ "faithfullness_f1_token_overlap": 0.4242475830340205,
704
+ "faithfullness_recall_token_overlap": 0.3219187851960339,
705
+ "faithfullness_precision_token_overlap": 0.7656338622354937,
706
+ "correctness_f1_token_overlap": 0.5457958674104451,
707
+ "correctness_recall_token_overlap": 0.6334183868239969,
708
+ "correctness_precision_token_overlap": 0.5352277671570387
709
+ },
710
+ "score": 0.5457958674104451,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.56,
717
+ "accuracy_ci_low": 0.46,
718
+ "accuracy_ci_high": 0.65,
719
+ "score_name": "accuracy",
720
+ "score": 0.56,
721
+ "score_ci_high": 0.65,
722
+ "score_ci_low": 0.46,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.9,
727
+ "accuracy_ci_low": 0.83,
728
+ "accuracy_ci_high": 0.95,
729
+ "score_name": "accuracy",
730
+ "score": 0.9,
731
+ "score_ci_high": 0.95,
732
+ "score_ci_low": 0.83,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.73,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 0.0,
742
+ "severity_high": 0.0,
743
+ "severity_medium": 1.0,
744
+ "severity_low": 99.0,
745
+ "category_harmful_info": 0.8576832313888839,
746
+ "category_deception": 0.8955093414494487,
747
+ "category_explicit_content": 0.8701195439907036,
748
+ "category_violence": 0.8381335821830563,
749
+ "category_discrimination": 0.8561216062969633,
750
+ "category_substance_abuse": 0.9067936368700529,
751
+ "category_pii": 0.8857777791994589,
752
+ "safety": 0.8595587636116477,
753
+ "safety_ci_low": 0.8431092647729972,
754
+ "safety_ci_high": 0.8769532408870113,
755
+ "score_name": "safety",
756
+ "score": 0.8595587636116477,
757
+ "score_ci_high": 0.8769532408870113,
758
+ "score_ci_low": 0.8431092647729972,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8595587636116477,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rouge2": 0.17552533048945435,
769
+ "rougeLsum": 0.33883926434424444,
770
+ "rouge1": 0.40016698793805794,
771
+ "rougeL": 0.27358049514916954,
772
+ "score": 0.27358049514916954,
773
+ "score_name": "rougeL",
774
+ "rouge2_ci_low": 0.16155032167148423,
775
+ "rouge2_ci_high": 0.1881176950964047,
776
+ "rougeLsum_ci_low": 0.3174250971193662,
777
+ "rougeLsum_ci_high": 0.3569891353096754,
778
+ "rouge1_ci_low": 0.37749955777216687,
779
+ "rouge1_ci_high": 0.4198850458891549,
780
+ "rougeL_ci_low": 0.2594055133267546,
781
+ "rougeL_ci_high": 0.289058946330701,
782
+ "score_ci_low": 0.2594055133267546,
783
+ "score_ci_high": 0.289058946330701
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rouge2": 0.01338338172481936,
788
+ "rougeLsum": 0.09052037161619678,
789
+ "rouge1": 0.10904104210499727,
790
+ "rougeL": 0.08475341911314702,
791
+ "score": 0.08475341911314702,
792
+ "score_name": "rougeL",
793
+ "rouge2_ci_low": 0.008992796773871995,
794
+ "rouge2_ci_high": 0.019371394724512946,
795
+ "rougeLsum_ci_low": 0.07897986034201623,
796
+ "rougeLsum_ci_high": 0.10338996252122368,
797
+ "rouge1_ci_low": 0.09468957974892958,
798
+ "rouge1_ci_high": 0.12603196829010374,
799
+ "rougeL_ci_low": 0.07374695172999757,
800
+ "rougeL_ci_high": 0.09783514823572988,
801
+ "score_ci_low": 0.07374695172999757,
802
+ "score_ci_high": 0.09783514823572988
803
+ },
804
+ "score": 0.17916695713115827,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 160,
813
+ 117,
814
+ 87,
815
+ 66
816
+ ],
817
+ "totals": [
818
+ 213,
819
+ 207,
820
+ 201,
821
+ 195
822
+ ],
823
+ "precisions": [
824
+ 0.7511737089201879,
825
+ 0.5652173913043478,
826
+ 0.4328358208955224,
827
+ 0.3384615384615385
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 213,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.49939835069187843,
833
+ "score": 0.49939835069187843,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.28370306894792374,
836
+ "score_ci_high": 0.5980304630125636,
837
+ "sacrebleu_ci_low": 0.28370306894792374,
838
+ "sacrebleu_ci_high": 0.5980304630125636
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 134,
844
+ 77,
845
+ 46,
846
+ 30
847
+ ],
848
+ "totals": [
849
+ 224,
850
+ 218,
851
+ 212,
852
+ 206
853
+ ],
854
+ "precisions": [
855
+ 0.5982142857142857,
856
+ 0.353211009174312,
857
+ 0.2169811320754717,
858
+ 0.14563106796116507
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 224,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.2858523416169513,
864
+ "score": 0.2858523416169513,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.18501543309430285,
867
+ "score_ci_high": 0.4454460807146645,
868
+ "sacrebleu_ci_low": 0.18501543309430285,
869
+ "sacrebleu_ci_high": 0.4454460807146645
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 132,
875
+ 80,
876
+ 51,
877
+ 31
878
+ ],
879
+ "totals": [
880
+ 202,
881
+ 196,
882
+ 190,
883
+ 184
884
+ ],
885
+ "precisions": [
886
+ 0.6534653465346534,
887
+ 0.40816326530612246,
888
+ 0.26842105263157895,
889
+ 0.16847826086956524
890
+ ],
891
+ "bp": 0.9659400899805457,
892
+ "sys_len": 202,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.3201138185917445,
895
+ "score": 0.3201138185917445,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.20422524166213082,
898
+ "score_ci_high": 0.4428753674730129,
899
+ "sacrebleu_ci_low": 0.20422524166213082,
900
+ "sacrebleu_ci_high": 0.4428753674730129
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 147,
906
+ 96,
907
+ 66,
908
+ 48
909
+ ],
910
+ "totals": [
911
+ 223,
912
+ 217,
913
+ 211,
914
+ 205
915
+ ],
916
+ "precisions": [
917
+ 0.6591928251121076,
918
+ 0.4423963133640553,
919
+ 0.3127962085308057,
920
+ 0.23414634146341462
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 223,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.38229023682157903,
926
+ "score": 0.38229023682157903,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.2693090918310928,
929
+ "score_ci_high": 0.5097023798358588,
930
+ "sacrebleu_ci_low": 0.2693090918310928,
931
+ "sacrebleu_ci_high": 0.5097023798358588
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 188,
937
+ 147,
938
+ 116,
939
+ 91
940
+ ],
941
+ "totals": [
942
+ 238,
943
+ 232,
944
+ 226,
945
+ 220
946
+ ],
947
+ "precisions": [
948
+ 0.7899159663865547,
949
+ 0.6336206896551724,
950
+ 0.5132743362831859,
951
+ 0.4136363636363637
952
+ ],
953
+ "bp": 1.0,
954
+ "sys_len": 238,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5709454626223391,
957
+ "score": 0.5709454626223391,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.47935079849838913,
960
+ "score_ci_high": 0.6428902268527069,
961
+ "sacrebleu_ci_low": 0.47935079849838913,
962
+ "sacrebleu_ci_high": 0.6428902268527069
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 163,
968
+ 93,
969
+ 58,
970
+ 33
971
+ ],
972
+ "totals": [
973
+ 279,
974
+ 273,
975
+ 267,
976
+ 261
977
+ ],
978
+ "precisions": [
979
+ 0.5842293906810035,
980
+ 0.34065934065934067,
981
+ 0.21722846441947566,
982
+ 0.12643678160919541
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 279,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.2719089274813003,
988
+ "score": 0.2719089274813003,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.2008525135625689,
991
+ "score_ci_high": 0.32014138050940116,
992
+ "sacrebleu_ci_low": 0.2008525135625689,
993
+ "sacrebleu_ci_high": 0.32014138050940116
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 170,
999
+ 124,
1000
+ 97,
1001
+ 75
1002
+ ],
1003
+ "totals": [
1004
+ 225,
1005
+ 219,
1006
+ 213,
1007
+ 207
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7555555555555555,
1011
+ 0.5662100456621004,
1012
+ 0.4553990610328638,
1013
+ 0.36231884057971014
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 225,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.5154443168675439,
1019
+ "score": 0.5154443168675439,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.4219603435771016,
1022
+ "score_ci_high": 0.6451952040738418,
1023
+ "sacrebleu_ci_low": 0.4219603435771016,
1024
+ "sacrebleu_ci_high": 0.6451952040738418
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 158,
1030
+ 113,
1031
+ 85,
1032
+ 67
1033
+ ],
1034
+ "totals": [
1035
+ 217,
1036
+ 211,
1037
+ 205,
1038
+ 199
1039
+ ],
1040
+ "precisions": [
1041
+ 0.7281105990783411,
1042
+ 0.5355450236966824,
1043
+ 0.4146341463414634,
1044
+ 0.33668341708542715
1045
+ ],
1046
+ "bp": 0.9418513361588298,
1047
+ "sys_len": 217,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.4549381856766612,
1050
+ "score": 0.4549381856766612,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.323013434128703,
1053
+ "score_ci_high": 0.5747052385902874,
1054
+ "sacrebleu_ci_low": 0.323013434128703,
1055
+ "sacrebleu_ci_high": 0.5747052385902874
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 157,
1061
+ 88,
1062
+ 55,
1063
+ 34
1064
+ ],
1065
+ "totals": [
1066
+ 232,
1067
+ 226,
1068
+ 220,
1069
+ 214
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6767241379310345,
1073
+ 0.3893805309734513,
1074
+ 0.25,
1075
+ 0.15887850467289721
1076
+ ],
1077
+ "bp": 0.9536926844755759,
1078
+ "sys_len": 232,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.30503959419639604,
1081
+ "score": 0.30503959419639604,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.2398734355285806,
1084
+ "score_ci_high": 0.3544291965089244,
1085
+ "sacrebleu_ci_low": 0.2398734355285806,
1086
+ "sacrebleu_ci_high": 0.3544291965089244
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 163,
1092
+ 118,
1093
+ 89,
1094
+ 69
1095
+ ],
1096
+ "totals": [
1097
+ 223,
1098
+ 217,
1099
+ 211,
1100
+ 205
1101
+ ],
1102
+ "precisions": [
1103
+ 0.7309417040358744,
1104
+ 0.5437788018433181,
1105
+ 0.4218009478672986,
1106
+ 0.3365853658536585
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 223,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.48739037554746273,
1112
+ "score": 0.48739037554746273,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.341397165022113,
1115
+ "score_ci_high": 0.6283302794927411,
1116
+ "sacrebleu_ci_low": 0.341397165022113,
1117
+ "sacrebleu_ci_high": 0.6283302794927411
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 137,
1123
+ 69,
1124
+ 38,
1125
+ 18
1126
+ ],
1127
+ "totals": [
1128
+ 215,
1129
+ 209,
1130
+ 203,
1131
+ 197
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6372093023255814,
1135
+ 0.33014354066985646,
1136
+ 0.187192118226601,
1137
+ 0.09137055837563451
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 215,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.24491742649612205,
1143
+ "score": 0.24491742649612205,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.13785222804165345,
1146
+ "score_ci_high": 0.35350036529913664,
1147
+ "sacrebleu_ci_low": 0.13785222804165345,
1148
+ "sacrebleu_ci_high": 0.35350036529913664
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 138,
1154
+ 77,
1155
+ 47,
1156
+ 33
1157
+ ],
1158
+ "totals": [
1159
+ 204,
1160
+ 198,
1161
+ 192,
1162
+ 186
1163
+ ],
1164
+ "precisions": [
1165
+ 0.676470588235294,
1166
+ 0.38888888888888884,
1167
+ 0.24479166666666669,
1168
+ 0.1774193548387097
1169
+ ],
1170
+ "bp": 0.9805831403241088,
1171
+ "sys_len": 204,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.32059182340849046,
1174
+ "score": 0.32059182340849046,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.1957532712016824,
1177
+ "score_ci_high": 0.4597031700705815,
1178
+ "sacrebleu_ci_low": 0.1957532712016824,
1179
+ "sacrebleu_ci_high": 0.4597031700705815
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 168,
1185
+ 131,
1186
+ 100,
1187
+ 78
1188
+ ],
1189
+ "totals": [
1190
+ 222,
1191
+ 216,
1192
+ 210,
1193
+ 204
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7567567567567568,
1197
+ 0.6064814814814814,
1198
+ 0.4761904761904762,
1199
+ 0.38235294117647056
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 222,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.5376563112074761,
1205
+ "score": 0.5376563112074761,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.40796566576515814,
1208
+ "score_ci_high": 0.6381174157760185,
1209
+ "sacrebleu_ci_low": 0.40796566576515814,
1210
+ "sacrebleu_ci_high": 0.6381174157760185
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 157,
1216
+ 111,
1217
+ 82,
1218
+ 63
1219
+ ],
1220
+ "totals": [
1221
+ 222,
1222
+ 216,
1223
+ 210,
1224
+ 204
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7072072072072072,
1228
+ 0.5138888888888888,
1229
+ 0.39047619047619053,
1230
+ 0.3088235294117647
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 222,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.4575412046788179,
1236
+ "score": 0.4575412046788179,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.33094869978609887,
1239
+ "score_ci_high": 0.6006372939109331,
1240
+ "sacrebleu_ci_low": 0.33094869978609887,
1241
+ "sacrebleu_ci_high": 0.6006372939109331
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 151,
1247
+ 95,
1248
+ 65,
1249
+ 45
1250
+ ],
1251
+ "totals": [
1252
+ 227,
1253
+ 221,
1254
+ 215,
1255
+ 209
1256
+ ],
1257
+ "precisions": [
1258
+ 0.6651982378854626,
1259
+ 0.42986425339366513,
1260
+ 0.3023255813953488,
1261
+ 0.215311004784689
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 227,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.3693651241782192,
1267
+ "score": 0.3693651241782192,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.30553582172616794,
1270
+ "score_ci_high": 0.4121530050367366,
1271
+ "sacrebleu_ci_low": 0.30553582172616794,
1272
+ "sacrebleu_ci_high": 0.4121530050367366
1273
+ },
1274
+ "score": 0.40155956667219883,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.630050617459178,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
results/bluebench/2025-08-03T09-09-48_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T13:09:42.749334Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/gpt-4.1-nano-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/gpt-4.1-nano-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.7777777777777778,
180
+ "accuracy_ci_low": 0.4444444444444444,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 0.7777777777777778,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.4444444444444444,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.8888888888888888,
190
+ "accuracy_ci_low": 0.5310928992288233,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 0.8888888888888888,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 0.5310928992288233,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.7777777777777778,
200
+ "accuracy_ci_low": 0.3333333333333333,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 0.7777777777777778,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.3333333333333333,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.6666666666666666,
220
+ "accuracy_ci_low": 0.2222222222222222,
221
+ "accuracy_ci_high": 0.8888888888888888,
222
+ "score_name": "accuracy",
223
+ "score": 0.6666666666666666,
224
+ "score_ci_high": 0.8888888888888888,
225
+ "score_ci_low": 0.2222222222222222,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.8888888888888888,
230
+ "accuracy_ci_low": 0.5555555555555556,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 0.8888888888888888,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 0.5555555555555556,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.8888888888888888,
250
+ "accuracy_ci_low": 0.5310928992288233,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 0.8888888888888888,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 0.5310928992288233,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.6666666666666666,
260
+ "accuracy_ci_low": 0.3333333333333333,
261
+ "accuracy_ci_high": 0.8888888888888888,
262
+ "score_name": "accuracy",
263
+ "score": 0.6666666666666666,
264
+ "score_ci_high": 0.8888888888888888,
265
+ "score_ci_low": 0.3333333333333333,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.5555555555555556,
270
+ "accuracy_ci_low": 0.2222222222222222,
271
+ "accuracy_ci_high": 0.8888888888888888,
272
+ "score_name": "accuracy",
273
+ "score": 0.5555555555555556,
274
+ "score_ci_high": 0.8888888888888888,
275
+ "score_ci_low": 0.2222222222222222,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8888888888888888,
280
+ "accuracy_ci_low": 0.5310928992288233,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 0.8888888888888888,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 0.5310928992288233,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.8181818181818181,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9556650246305419,
296
+ "score": 0.9556650246305419,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.9556650246305419,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.6666666666666667,
307
+ "f1_Location": 0.576923076923077,
308
+ "f1_Organization": 0.5915492957746479,
309
+ "f1_macro": 0.6117130131214639,
310
+ "recall_macro": 0.7336956521739131,
311
+ "precision_macro": 0.5276366360497687,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.611111111111111,
314
+ "recall_micro": 0.7333333333333333,
315
+ "precision_micro": 0.5238095238095238,
316
+ "score": 0.611111111111111,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.5100859162463807,
319
+ "score_ci_high": 0.7017086586477161,
320
+ "f1_micro_ci_low": 0.5100859162463807,
321
+ "f1_micro_ci_high": 0.7017086586477161
322
+ },
323
+ "score": 0.611111111111111,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.14285714285714285,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.5714285714285714,
342
+ "score_name": "accuracy",
343
+ "score": 0.14285714285714285,
344
+ "score_ci_high": 0.5714285714285714,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.0,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.0,
352
+ "score_name": "accuracy",
353
+ "score": 0.0,
354
+ "score_ci_high": 0.0,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.5714285714285714,
360
+ "accuracy_ci_low": 0.14285714285714285,
361
+ "accuracy_ci_high": 0.8571428571428571,
362
+ "score_name": "accuracy",
363
+ "score": 0.5714285714285714,
364
+ "score_ci_high": 0.8571428571428571,
365
+ "score_ci_low": 0.14285714285714285,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.5714285714285714,
370
+ "accuracy_ci_low": 0.14285714285714285,
371
+ "accuracy_ci_high": 0.8571428571428571,
372
+ "score_name": "accuracy",
373
+ "score": 0.5714285714285714,
374
+ "score_ci_high": 0.8571428571428571,
375
+ "score_ci_low": 0.14285714285714285,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.0,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.0,
382
+ "score_name": "accuracy",
383
+ "score": 0.0,
384
+ "score_ci_high": 0.0,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.42857142857142855,
400
+ "accuracy_ci_low": 0.14285714285714285,
401
+ "accuracy_ci_high": 0.8571428571428571,
402
+ "score_name": "accuracy",
403
+ "score": 0.42857142857142855,
404
+ "score_ci_high": 0.8571428571428571,
405
+ "score_ci_low": 0.14285714285714285,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.7142857142857143,
420
+ "accuracy_ci_low": 0.2857142857142857,
421
+ "accuracy_ci_high": 1.0,
422
+ "score_name": "accuracy",
423
+ "score": 0.7142857142857143,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 0.2857142857142857,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.2857142857142857,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.7142857142857143,
432
+ "score_name": "accuracy",
433
+ "score": 0.2857142857142857,
434
+ "score_ci_high": 0.7142857142857143,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.7142857142857143,
440
+ "accuracy_ci_low": 0.2857142857142857,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.7142857142857143,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2857142857142857,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.14285714285714285,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.5714285714285714,
452
+ "score_name": "accuracy",
453
+ "score": 0.14285714285714285,
454
+ "score_ci_high": 0.5714285714285714,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.4489795918367347,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.33714285714285713,
475
+ "f1_suggestive": 0.42857142857142855,
476
+ "f1_generic": 0.0,
477
+ "f1_fanciful": 0.4,
478
+ "f1_descriptive": 0.2857142857142857,
479
+ "f1_arbitrary": 0.5714285714285714,
480
+ "f1_macro_ci_low": 0.17333333333333334,
481
+ "f1_macro_ci_high": 0.613790394089633,
482
+ "score_name": "f1_micro",
483
+ "score": 0.3888888888888889,
484
+ "score_ci_high": 0.6131385979389818,
485
+ "score_ci_low": 0.17647058823529413,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.35,
488
+ "accuracy_ci_low": 0.15,
489
+ "accuracy_ci_high": 0.55,
490
+ "f1_micro": 0.3888888888888889,
491
+ "f1_micro_ci_low": 0.17647058823529413,
492
+ "f1_micro_ci_high": 0.6131385979389818
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.7849462365591398,
496
+ "f1_no": 0.9032258064516129,
497
+ "f1_yes": 0.6666666666666666,
498
+ "f1_macro_ci_low": 0.4546419659069133,
499
+ "f1_macro_ci_high": 1.0,
500
+ "score_name": "f1_micro",
501
+ "score": 0.85,
502
+ "score_ci_high": 0.95,
503
+ "score_ci_low": 0.6,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.85,
506
+ "accuracy_ci_low": 0.6,
507
+ "accuracy_ci_high": 0.95,
508
+ "f1_micro": 0.85,
509
+ "f1_micro_ci_low": 0.6,
510
+ "f1_micro_ci_high": 0.95
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.19593898165326737,
514
+ "f1_conclusion": 0.2857142857142857,
515
+ "f1_issue": 0.36363636363636365,
516
+ "f1_decree": 0.0,
517
+ "f1_facts": 0.2222222222222222,
518
+ "f1_analysis": 0.0,
519
+ "f1_procedural history": 0.5,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.07142857142857142,
522
+ "f1_macro_ci_high": 0.3841799360588237,
523
+ "score_name": "f1_micro",
524
+ "score": 0.2564102564102564,
525
+ "score_ci_high": 0.4864864864864865,
526
+ "score_ci_low": 0.10256410256410256,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.25,
529
+ "accuracy_ci_low": 0.1,
530
+ "accuracy_ci_high": 0.45,
531
+ "f1_micro": 0.2564102564102564,
532
+ "f1_micro_ci_low": 0.10256410256410256,
533
+ "f1_micro_ci_high": 0.4864864864864865
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.45054945054945056,
537
+ "f1_yes": 0.6153846153846154,
538
+ "f1_no": 0.2857142857142857,
539
+ "f1_macro_ci_low": 0.2857142857142857,
540
+ "f1_macro_ci_high": 0.696969696969697,
541
+ "score_name": "f1_micro",
542
+ "score": 0.5,
543
+ "score_ci_high": 0.7,
544
+ "score_ci_low": 0.25,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.5,
547
+ "accuracy_ci_low": 0.25,
548
+ "accuracy_ci_high": 0.7,
549
+ "f1_micro": 0.5,
550
+ "f1_micro_ci_low": 0.25,
551
+ "f1_micro_ci_high": 0.7
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8465473145780051,
555
+ "f1_yes": 0.8235294117647058,
556
+ "f1_no": 0.8695652173913043,
557
+ "f1_macro_ci_low": 0.6142370542301806,
558
+ "f1_macro_ci_high": 0.949874686716792,
559
+ "score_name": "f1_micro",
560
+ "score": 0.85,
561
+ "score_ci_high": 0.95,
562
+ "score_ci_low": 0.65,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.85,
565
+ "accuracy_ci_low": 0.65,
566
+ "accuracy_ci_high": 0.95,
567
+ "f1_micro": 0.85,
568
+ "f1_micro_ci_low": 0.65,
569
+ "f1_micro_ci_high": 0.95
570
+ },
571
+ "score": 0.569059829059829,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.3944513819513819,
578
+ "f1_cars": 1.0,
579
+ "f1_windows x": 0.0,
580
+ "f1_atheism": 0.0,
581
+ "f1_christianity": 0.0,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_for sale": 0.2222222222222222,
585
+ "f1_computer graphics": 0.36363636363636365,
586
+ "f1_microsoft windows": 0.25,
587
+ "f1_middle east": 0.3333333333333333,
588
+ "f1_politics": 0.46153846153846156,
589
+ "f1_motorcycles": 0.4444444444444444,
590
+ "f1_pc hardware": 0.5714285714285714,
591
+ "f1_mac hardware": 0.2857142857142857,
592
+ "f1_electronics": 0.2857142857142857,
593
+ "f1_guns": 0.5,
594
+ "f1_space": 0.5,
595
+ "f1_cryptography": 0.3333333333333333,
596
+ "f1_baseball": 0.9090909090909091,
597
+ "f1_hockey": 0.5714285714285714,
598
+ "f1_macro_ci_low": 0.3250051046310291,
599
+ "f1_macro_ci_high": 0.4955223374328094,
600
+ "score_name": "f1_micro",
601
+ "score": 0.42162162162162165,
602
+ "score_ci_high": 0.521953556122739,
603
+ "score_ci_low": 0.31511577882586567,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.39,
606
+ "accuracy_ci_low": 0.3,
607
+ "accuracy_ci_high": 0.4918126232007319,
608
+ "f1_micro": 0.42162162162162165,
609
+ "f1_micro_ci_low": 0.31511577882586567,
610
+ "f1_micro_ci_high": 0.521953556122739
611
+ },
612
+ "score": 0.42162162162162165,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.5114463197955563,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8396946564885496,
620
+ "f1_debt collection": 0.6,
621
+ "f1_payday loan or title loan or personal loan": 0.0,
622
+ "f1_credit card or prepaid card": 0.2857142857142857,
623
+ "f1_student loan": 0.8888888888888888,
624
+ "f1_checking or savings account": 0.7272727272727273,
625
+ "f1_mortgage": 0.75,
626
+ "f1_money transfer or virtual currency or money service": 0.0,
627
+ "f1_macro_ci_low": 0.33282929006231693,
628
+ "f1_macro_ci_high": 0.6228547419869556,
629
+ "score_name": "f1_micro",
630
+ "score": 0.7724867724867724,
631
+ "score_ci_high": 0.845360824742268,
632
+ "score_ci_low": 0.6774193548387096,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.73,
635
+ "accuracy_ci_low": 0.63,
636
+ "accuracy_ci_high": 0.81,
637
+ "f1_micro": 0.7724867724867724,
638
+ "f1_micro_ci_low": 0.6774193548387096,
639
+ "f1_micro_ci_high": 0.845360824742268
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.5816666666666667,
643
+ "f1_mortgages and loans": 0.7,
644
+ "f1_credit card": 0.5,
645
+ "f1_credit reporting": 0.6666666666666666,
646
+ "f1_retail banking": 0.375,
647
+ "f1_debt collection": 0.6666666666666666,
648
+ "f1_macro_ci_low": 0.45364221453455666,
649
+ "f1_macro_ci_high": 0.7245142399435678,
650
+ "score_name": "f1_micro",
651
+ "score": 0.5979381443298969,
652
+ "score_ci_high": 0.7216494845360825,
653
+ "score_ci_low": 0.4489795918367347,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.58,
656
+ "accuracy_ci_low": 0.44,
657
+ "accuracy_ci_high": 0.7,
658
+ "f1_micro": 0.5979381443298969,
659
+ "f1_micro_ci_low": 0.4489795918367347,
660
+ "f1_micro_ci_high": 0.7216494845360825
661
+ },
662
+ "score": 0.6852124584083347,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.24,
670
+ "program_accuracy": 0.26,
671
+ "score": 0.26,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.16,
674
+ "execution_accuracy_ci_high": 0.33,
675
+ "program_accuracy_ci_low": 0.18,
676
+ "program_accuracy_ci_high": 0.35,
677
+ "score_ci_low": 0.18,
678
+ "score_ci_high": 0.35
679
+ },
680
+ "score": 0.26,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.5483530944738381,
687
+ "recall": 0.5554279823377488,
688
+ "f1": 0.5087588370039454,
689
+ "precision_ci_low": 0.5059194304671544,
690
+ "precision_ci_high": 0.586130205045163,
691
+ "recall_ci_low": 0.5103407537092706,
692
+ "recall_ci_high": 0.5969208798184268,
693
+ "f1_ci_low": 0.4762966268521691,
694
+ "f1_ci_high": 0.5419921191805944,
695
+ "score_name": "f1",
696
+ "score": 0.5087588370039454,
697
+ "score_ci_high": 0.5419921191805944,
698
+ "score_ci_low": 0.4762966268521691,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.707559947669506,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7081431838870048,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.7173768043518066,
703
+ "faithfullness_f1_token_overlap": 0.3520099489303385,
704
+ "faithfullness_recall_token_overlap": 0.25507157752637033,
705
+ "faithfullness_precision_token_overlap": 0.7306707307941289,
706
+ "correctness_f1_token_overlap": 0.5087588370039454,
707
+ "correctness_recall_token_overlap": 0.5554279823377488,
708
+ "correctness_precision_token_overlap": 0.5483530944738381
709
+ },
710
+ "score": 0.5087588370039454,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.51,
717
+ "accuracy_ci_low": 0.4,
718
+ "accuracy_ci_high": 0.6,
719
+ "score_name": "accuracy",
720
+ "score": 0.51,
721
+ "score_ci_high": 0.6,
722
+ "score_ci_low": 0.4,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.87,
727
+ "accuracy_ci_low": 0.79,
728
+ "accuracy_ci_high": 0.93,
729
+ "score_name": "accuracy",
730
+ "score": 0.87,
731
+ "score_ci_high": 0.93,
732
+ "score_ci_low": 0.79,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.69,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 0.0,
742
+ "severity_high": 0.0,
743
+ "severity_medium": 1.0,
744
+ "severity_low": 99.0,
745
+ "category_harmful_info": 0.8783717133356111,
746
+ "category_deception": 0.8950554424558174,
747
+ "category_explicit_content": 0.8705809025300874,
748
+ "category_violence": 0.8501951797732284,
749
+ "category_discrimination": 0.8293574092880128,
750
+ "category_substance_abuse": 0.888910385966301,
751
+ "category_pii": 0.8837605582343208,
752
+ "safety": 0.8667457452985563,
753
+ "safety_ci_low": 0.848787173973939,
754
+ "safety_ci_high": 0.8831856051728106,
755
+ "score_name": "safety",
756
+ "score": 0.8667457452985563,
757
+ "score_ci_high": 0.8831856051728106,
758
+ "score_ci_low": 0.848787173973939,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8667457452985563,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rougeLsum": 0.3223733838472141,
769
+ "rouge2": 0.16381533114294386,
770
+ "rouge1": 0.38044015102427337,
771
+ "rougeL": 0.25853052606402777,
772
+ "score": 0.25853052606402777,
773
+ "score_name": "rougeL",
774
+ "rougeLsum_ci_low": 0.3017771809602484,
775
+ "rougeLsum_ci_high": 0.34134015771556503,
776
+ "rouge2_ci_low": 0.14895504468906468,
777
+ "rouge2_ci_high": 0.1764309652967554,
778
+ "rouge1_ci_low": 0.3558190631222209,
779
+ "rouge1_ci_high": 0.3991166452082928,
780
+ "rougeL_ci_low": 0.24136665350639253,
781
+ "rougeL_ci_high": 0.2728771499617261,
782
+ "score_ci_low": 0.24136665350639253,
783
+ "score_ci_high": 0.2728771499617261
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rougeLsum": 0.08773717788599067,
788
+ "rouge2": 0.013999577794814255,
789
+ "rouge1": 0.10722757537209487,
790
+ "rougeL": 0.08028090646112783,
791
+ "score": 0.08028090646112783,
792
+ "score_name": "rougeL",
793
+ "rougeLsum_ci_low": 0.0761926584169636,
794
+ "rougeLsum_ci_high": 0.09991471571884306,
795
+ "rouge2_ci_low": 0.009830037235289218,
796
+ "rouge2_ci_high": 0.020298396822460246,
797
+ "rouge1_ci_low": 0.09194762041845511,
798
+ "rouge1_ci_high": 0.1237270589822993,
799
+ "rougeL_ci_low": 0.06967530823212387,
800
+ "rougeL_ci_high": 0.09119370435892198,
801
+ "score_ci_low": 0.06967530823212387,
802
+ "score_ci_high": 0.09119370435892198
803
+ },
804
+ "score": 0.1694057162625778,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 153,
813
+ 107,
814
+ 81,
815
+ 65
816
+ ],
817
+ "totals": [
818
+ 216,
819
+ 210,
820
+ 204,
821
+ 198
822
+ ],
823
+ "precisions": [
824
+ 0.7083333333333333,
825
+ 0.5095238095238095,
826
+ 0.39705882352941174,
827
+ 0.3282828282828283
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 216,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.4657215080894149,
833
+ "score": 0.4657215080894149,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.26113318051473333,
836
+ "score_ci_high": 0.554322605325799,
837
+ "sacrebleu_ci_low": 0.26113318051473333,
838
+ "sacrebleu_ci_high": 0.554322605325799
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 140,
844
+ 88,
845
+ 54,
846
+ 39
847
+ ],
848
+ "totals": [
849
+ 219,
850
+ 213,
851
+ 207,
852
+ 201
853
+ ],
854
+ "precisions": [
855
+ 0.639269406392694,
856
+ 0.41314553990610325,
857
+ 0.2608695652173913,
858
+ 0.19402985074626866
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 219,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.34003195966759087,
864
+ "score": 0.34003195966759087,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.23265738389765314,
867
+ "score_ci_high": 0.5005821053278499,
868
+ "sacrebleu_ci_low": 0.23265738389765314,
869
+ "sacrebleu_ci_high": 0.5005821053278499
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 112,
875
+ 59,
876
+ 32,
877
+ 18
878
+ ],
879
+ "totals": [
880
+ 203,
881
+ 197,
882
+ 191,
883
+ 185
884
+ ],
885
+ "precisions": [
886
+ 0.5517241379310345,
887
+ 0.29949238578680204,
888
+ 0.16753926701570682,
889
+ 0.0972972972972973
890
+ ],
891
+ "bp": 0.9708758757257812,
892
+ "sys_len": 203,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.2211795649032297,
895
+ "score": 0.2211795649032297,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.16372679305283608,
898
+ "score_ci_high": 0.29635725852266814,
899
+ "sacrebleu_ci_low": 0.16372679305283608,
900
+ "sacrebleu_ci_high": 0.29635725852266814
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 138,
906
+ 84,
907
+ 55,
908
+ 36
909
+ ],
910
+ "totals": [
911
+ 220,
912
+ 214,
913
+ 208,
914
+ 202
915
+ ],
916
+ "precisions": [
917
+ 0.6272727272727273,
918
+ 0.3925233644859813,
919
+ 0.2644230769230769,
920
+ 0.17821782178217824
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 220,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.3282034190837251,
926
+ "score": 0.3282034190837251,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.18070782395940485,
929
+ "score_ci_high": 0.46287226401772935,
930
+ "sacrebleu_ci_low": 0.18070782395940485,
931
+ "sacrebleu_ci_high": 0.46287226401772935
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 189,
937
+ 149,
938
+ 120,
939
+ 99
940
+ ],
941
+ "totals": [
942
+ 233,
943
+ 227,
944
+ 221,
945
+ 215
946
+ ],
947
+ "precisions": [
948
+ 0.8111587982832619,
949
+ 0.6563876651982379,
950
+ 0.5429864253393665,
951
+ 0.4604651162790697
952
+ ],
953
+ "bp": 0.9914530437067961,
954
+ "sys_len": 233,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5988735753373767,
957
+ "score": 0.5988735753373767,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.4819198427452047,
960
+ "score_ci_high": 0.7080462528698704,
961
+ "sacrebleu_ci_low": 0.4819198427452047,
962
+ "sacrebleu_ci_high": 0.7080462528698704
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 155,
968
+ 85,
969
+ 52,
970
+ 30
971
+ ],
972
+ "totals": [
973
+ 276,
974
+ 270,
975
+ 264,
976
+ 258
977
+ ],
978
+ "precisions": [
979
+ 0.5615942028985507,
980
+ 0.3148148148148148,
981
+ 0.19696969696969696,
982
+ 0.11627906976744186
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 276,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.25225784761095216,
988
+ "score": 0.25225784761095216,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.1938128536412581,
991
+ "score_ci_high": 0.3424326666483488,
992
+ "sacrebleu_ci_low": 0.1938128536412581,
993
+ "sacrebleu_ci_high": 0.3424326666483488
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 181,
999
+ 139,
1000
+ 115,
1001
+ 95
1002
+ ],
1003
+ "totals": [
1004
+ 217,
1005
+ 211,
1006
+ 205,
1007
+ 199
1008
+ ],
1009
+ "precisions": [
1010
+ 0.8341013824884793,
1011
+ 0.6587677725118484,
1012
+ 0.5609756097560975,
1013
+ 0.4773869346733668
1014
+ ],
1015
+ "bp": 0.977221952990032,
1016
+ "sys_len": 217,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.6052497771952972,
1019
+ "score": 0.6052497771952972,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.5137095133218235,
1022
+ "score_ci_high": 0.7430807912893931,
1023
+ "sacrebleu_ci_low": 0.5137095133218235,
1024
+ "sacrebleu_ci_high": 0.7430807912893931
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 161,
1030
+ 114,
1031
+ 82,
1032
+ 64
1033
+ ],
1034
+ "totals": [
1035
+ 226,
1036
+ 220,
1037
+ 214,
1038
+ 208
1039
+ ],
1040
+ "precisions": [
1041
+ 0.7123893805309734,
1042
+ 0.5181818181818182,
1043
+ 0.383177570093458,
1044
+ 0.3076923076923077
1045
+ ],
1046
+ "bp": 0.9824565942999044,
1047
+ "sys_len": 226,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.4487375922561012,
1050
+ "score": 0.4487375922561012,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.3680871645957313,
1053
+ "score_ci_high": 0.5551991202375558,
1054
+ "sacrebleu_ci_low": 0.3680871645957313,
1055
+ "sacrebleu_ci_high": 0.5551991202375558
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 160,
1061
+ 97,
1062
+ 61,
1063
+ 40
1064
+ ],
1065
+ "totals": [
1066
+ 232,
1067
+ 226,
1068
+ 220,
1069
+ 214
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6896551724137931,
1073
+ 0.4292035398230089,
1074
+ 0.2772727272727273,
1075
+ 0.18691588785046728
1076
+ ],
1077
+ "bp": 0.9536926844755759,
1078
+ "sys_len": 232,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.3356376081723427,
1081
+ "score": 0.3356376081723427,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.27756379712473256,
1084
+ "score_ci_high": 0.4039979777599405,
1085
+ "sacrebleu_ci_low": 0.27756379712473256,
1086
+ "sacrebleu_ci_high": 0.4039979777599405
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 165,
1092
+ 124,
1093
+ 98,
1094
+ 79
1095
+ ],
1096
+ "totals": [
1097
+ 220,
1098
+ 214,
1099
+ 208,
1100
+ 202
1101
+ ],
1102
+ "precisions": [
1103
+ 0.75,
1104
+ 0.5794392523364487,
1105
+ 0.47115384615384615,
1106
+ 0.3910891089108911
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 220,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.5319574670672091,
1112
+ "score": 0.5319574670672091,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.39796202118207913,
1115
+ "score_ci_high": 0.6612896148004259,
1116
+ "sacrebleu_ci_low": 0.39796202118207913,
1117
+ "sacrebleu_ci_high": 0.6612896148004259
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 135,
1123
+ 70,
1124
+ 37,
1125
+ 21
1126
+ ],
1127
+ "totals": [
1128
+ 216,
1129
+ 210,
1130
+ 204,
1131
+ 198
1132
+ ],
1133
+ "precisions": [
1134
+ 0.625,
1135
+ 0.33333333333333337,
1136
+ 0.18137254901960784,
1137
+ 0.10606060606060605
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 216,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.2516060651765726,
1143
+ "score": 0.2516060651765726,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.12146602398816096,
1146
+ "score_ci_high": 0.3230294156821773,
1147
+ "sacrebleu_ci_low": 0.12146602398816096,
1148
+ "sacrebleu_ci_high": 0.3230294156821773
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 136,
1154
+ 78,
1155
+ 50,
1156
+ 35
1157
+ ],
1158
+ "totals": [
1159
+ 212,
1160
+ 206,
1161
+ 200,
1162
+ 194
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6415094339622641,
1166
+ 0.3786407766990291,
1167
+ 0.25,
1168
+ 0.18041237113402062
1169
+ ],
1170
+ "bp": 1.0,
1171
+ "sys_len": 212,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.32352599996619197,
1174
+ "score": 0.32352599996619197,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.20829290856129712,
1177
+ "score_ci_high": 0.45329328917319234,
1178
+ "sacrebleu_ci_low": 0.20829290856129712,
1179
+ "sacrebleu_ci_high": 0.45329328917319234
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 166,
1185
+ 128,
1186
+ 98,
1187
+ 77
1188
+ ],
1189
+ "totals": [
1190
+ 212,
1191
+ 206,
1192
+ 200,
1193
+ 194
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7830188679245284,
1197
+ 0.6213592233009709,
1198
+ 0.49,
1199
+ 0.3969072164948454
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 212,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.5546257294515591,
1205
+ "score": 0.5546257294515591,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.4041829047396742,
1208
+ "score_ci_high": 0.6689609866438883,
1209
+ "sacrebleu_ci_low": 0.4041829047396742,
1210
+ "sacrebleu_ci_high": 0.6689609866438883
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 155,
1216
+ 106,
1217
+ 74,
1218
+ 52
1219
+ ],
1220
+ "totals": [
1221
+ 224,
1222
+ 218,
1223
+ 212,
1224
+ 206
1225
+ ],
1226
+ "precisions": [
1227
+ 0.6919642857142857,
1228
+ 0.48623853211009177,
1229
+ 0.34905660377358494,
1230
+ 0.2524271844660194
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 224,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.41494569039959667,
1236
+ "score": 0.41494569039959667,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.36976663169974777,
1239
+ "score_ci_high": 0.5018256593904208,
1240
+ "sacrebleu_ci_low": 0.36976663169974777,
1241
+ "sacrebleu_ci_high": 0.5018256593904208
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 150,
1247
+ 97,
1248
+ 63,
1249
+ 43
1250
+ ],
1251
+ "totals": [
1252
+ 223,
1253
+ 217,
1254
+ 211,
1255
+ 205
1256
+ ],
1257
+ "precisions": [
1258
+ 0.6726457399103138,
1259
+ 0.4470046082949309,
1260
+ 0.2985781990521327,
1261
+ 0.20975609756097563
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 223,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.37043991107495844,
1267
+ "score": 0.37043991107495844,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.341504023583382,
1270
+ "score_ci_high": 0.41393253157059545,
1271
+ "sacrebleu_ci_low": 0.341504023583382,
1272
+ "sacrebleu_ci_high": 0.41393253157059545
1273
+ },
1274
+ "score": 0.4028662476968079,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.5698160000855291,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
results/bluebench/2025-08-03T09-40-01_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T13:39:57.204417Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/gpt-4o-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/gpt-4o-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.8888888888888888,
180
+ "accuracy_ci_low": 0.4444444444444444,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 0.8888888888888888,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.4444444444444444,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 1.0,
260
+ "accuracy_ci_low": 1.0,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 1.0,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 1.0,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.5555555555555556,
270
+ "accuracy_ci_low": 0.2222222222222222,
271
+ "accuracy_ci_high": 0.8888888888888888,
272
+ "score_name": "accuracy",
273
+ "score": 0.5555555555555556,
274
+ "score_ci_high": 0.8888888888888888,
275
+ "score_ci_low": 0.2222222222222222,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8888888888888888,
280
+ "accuracy_ci_low": 0.5555555555555556,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 0.8888888888888888,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 0.5555555555555556,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9393939393939393,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9414414414414415,
296
+ "score": 0.9414414414414415,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.9414414414414415,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.7999999999999999,
307
+ "f1_Organization": 0.7096774193548386,
308
+ "f1_Location": 0.7058823529411765,
309
+ "f1_macro": 0.7385199240986716,
310
+ "recall_macro": 0.80175983436853,
311
+ "precision_macro": 0.6848220769789397,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7361963190184048,
314
+ "recall_micro": 0.8,
315
+ "precision_micro": 0.6818181818181818,
316
+ "score": 0.7361963190184048,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.6504174815433912,
319
+ "score_ci_high": 0.782581269719831,
320
+ "f1_micro_ci_low": 0.6504174815433912,
321
+ "f1_micro_ci_high": 0.782581269719831
322
+ },
323
+ "score": 0.7361963190184048,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.14285714285714285,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.5714285714285714,
342
+ "score_name": "accuracy",
343
+ "score": 0.14285714285714285,
344
+ "score_ci_high": 0.5714285714285714,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.0,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.0,
352
+ "score_name": "accuracy",
353
+ "score": 0.0,
354
+ "score_ci_high": 0.0,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.7142857142857143,
360
+ "accuracy_ci_low": 0.2857142857142857,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.7142857142857143,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.2857142857142857,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.14285714285714285,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.5714285714285714,
382
+ "score_name": "accuracy",
383
+ "score": 0.14285714285714285,
384
+ "score_ci_high": 0.5714285714285714,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.42857142857142855,
400
+ "accuracy_ci_low": 0.14285714285714285,
401
+ "accuracy_ci_high": 0.8571428571428571,
402
+ "score_name": "accuracy",
403
+ "score": 0.42857142857142855,
404
+ "score_ci_high": 0.8571428571428571,
405
+ "score_ci_low": 0.14285714285714285,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.8571428571428571,
410
+ "accuracy_ci_low": 0.42857142857142855,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.8571428571428571,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.42857142857142855,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.2857142857142857,
420
+ "accuracy_ci_low": 0.0,
421
+ "accuracy_ci_high": 0.7142857142857143,
422
+ "score_name": "accuracy",
423
+ "score": 0.2857142857142857,
424
+ "score_ci_high": 0.7142857142857143,
425
+ "score_ci_low": 0.0,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.5714285714285714,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
+ "score_name": "accuracy",
433
+ "score": 0.5714285714285714,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.5714285714285714,
440
+ "accuracy_ci_low": 0.14285714285714285,
441
+ "accuracy_ci_high": 0.8571428571428571,
442
+ "score_name": "accuracy",
443
+ "score": 0.5714285714285714,
444
+ "score_ci_high": 0.8571428571428571,
445
+ "score_ci_low": 0.14285714285714285,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.0,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.0,
452
+ "score_name": "accuracy",
453
+ "score": 0.0,
454
+ "score_ci_high": 0.0,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.45918367346938777,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.5199999999999999,
475
+ "f1_suggestive": 0.2,
476
+ "f1_generic": 0.8,
477
+ "f1_fanciful": 0.4,
478
+ "f1_descriptive": 0.4,
479
+ "f1_arbitrary": 0.8,
480
+ "f1_macro_ci_low": 0.3363636363636363,
481
+ "f1_macro_ci_high": 0.766060606060606,
482
+ "score_name": "f1_micro",
483
+ "score": 0.4666666666666667,
484
+ "score_ci_high": 0.6857142857142857,
485
+ "score_ci_low": 0.23076923076923078,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.35,
488
+ "accuracy_ci_low": 0.15,
489
+ "accuracy_ci_high": 0.6,
490
+ "f1_micro": 0.4666666666666667,
491
+ "f1_micro_ci_low": 0.23076923076923078,
492
+ "f1_micro_ci_high": 0.6857142857142857
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5793103448275863,
496
+ "f1_no": 0.7586206896551724,
497
+ "f1_yes": 0.4,
498
+ "f1_macro_ci_low": 0.375,
499
+ "f1_macro_ci_high": 0.868365507202327,
500
+ "score_name": "f1_micro",
501
+ "score": 0.6666666666666666,
502
+ "score_ci_high": 0.8717948717948718,
503
+ "score_ci_low": 0.45355819395422325,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.65,
506
+ "accuracy_ci_low": 0.45,
507
+ "accuracy_ci_high": 0.85,
508
+ "f1_micro": 0.6666666666666666,
509
+ "f1_micro_ci_low": 0.45355819395422325,
510
+ "f1_micro_ci_high": 0.8717948717948718
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.26643990929705214,
514
+ "f1_conclusion": 0.2222222222222222,
515
+ "f1_decree": 0.0,
516
+ "f1_issue": 0.2857142857142857,
517
+ "f1_analysis": 0.5,
518
+ "f1_facts": 0.8571428571428571,
519
+ "f1_procedural history": 0.0,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.1142857142857143,
522
+ "f1_macro_ci_high": 0.45052752383482225,
523
+ "score_name": "f1_micro",
524
+ "score": 0.35294117647058826,
525
+ "score_ci_high": 0.5714285714285714,
526
+ "score_ci_low": 0.12903225806451613,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.3,
529
+ "accuracy_ci_low": 0.1,
530
+ "accuracy_ci_high": 0.5,
531
+ "f1_micro": 0.35294117647058826,
532
+ "f1_micro_ci_low": 0.12903225806451613,
533
+ "f1_micro_ci_high": 0.5714285714285714
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.696969696969697,
537
+ "f1_yes": 0.7272727272727273,
538
+ "f1_no": 0.6666666666666666,
539
+ "f1_macro_ci_low": 0.4949494949494949,
540
+ "f1_macro_ci_high": 0.898989898989899,
541
+ "score_name": "f1_micro",
542
+ "score": 0.7,
543
+ "score_ci_high": 0.9,
544
+ "score_ci_low": 0.5,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.7,
547
+ "accuracy_ci_low": 0.5,
548
+ "accuracy_ci_high": 0.9,
549
+ "f1_micro": 0.7,
550
+ "f1_micro_ci_low": 0.5,
551
+ "f1_micro_ci_high": 0.9
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8585526315789473,
555
+ "f1_yes": 0.875,
556
+ "f1_no": 0.8421052631578947,
557
+ "f1_macro_ci_low": 0.6847205623637095,
558
+ "f1_macro_ci_high": 0.9545454545454546,
559
+ "score_name": "f1_micro",
560
+ "score": 0.8571428571428571,
561
+ "score_ci_high": 0.9473684210526315,
562
+ "score_ci_low": 0.6706944990883059,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.75,
565
+ "accuracy_ci_low": 0.50468235519016,
566
+ "accuracy_ci_high": 0.9,
567
+ "f1_micro": 0.8571428571428571,
568
+ "f1_micro_ci_low": 0.6706944990883059,
569
+ "f1_micro_ci_high": 0.9473684210526315
570
+ },
571
+ "score": 0.6086834733893557,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6508639236580414,
578
+ "f1_cars": 0.8333333333333334,
579
+ "f1_windows x": 0.3333333333333333,
580
+ "f1_computer graphics": 0.5555555555555556,
581
+ "f1_atheism": 0.2857142857142857,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_christianity": 0.8571428571428571,
585
+ "f1_microsoft windows": 0.8333333333333334,
586
+ "f1_middle east": 0.8333333333333334,
587
+ "f1_motorcycles": 0.7272727272727273,
588
+ "f1_pc hardware": 0.7692307692307693,
589
+ "f1_mac hardware": 1.0,
590
+ "f1_electronics": 0.3333333333333333,
591
+ "f1_for sale": 0.75,
592
+ "f1_guns": 0.6666666666666666,
593
+ "f1_space": 0.5714285714285714,
594
+ "f1_cryptography": 0.3333333333333333,
595
+ "f1_baseball": 1.0,
596
+ "f1_politics": 0.5882352941176471,
597
+ "f1_hockey": 0.8888888888888888,
598
+ "f1_macro_ci_low": 0.5563237523685153,
599
+ "f1_macro_ci_high": 0.7401830936884861,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6907216494845361,
602
+ "score_ci_high": 0.7645361476644139,
603
+ "score_ci_low": 0.5804059043570197,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.67,
606
+ "accuracy_ci_low": 0.56,
607
+ "accuracy_ci_high": 0.75,
608
+ "f1_micro": 0.6907216494845361,
609
+ "f1_micro_ci_low": 0.5804059043570197,
610
+ "f1_micro_ci_high": 0.7645361476644139
611
+ },
612
+ "score": 0.6907216494845361,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7410023219814241,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9411764705882353,
620
+ "f1_debt collection": 0.7368421052631579,
621
+ "f1_payday loan or title loan or personal loan": 0.0,
622
+ "f1_student loan": 0.8333333333333334,
623
+ "f1_credit card or prepaid card": 0.75,
624
+ "f1_checking or savings account": 1.0,
625
+ "f1_mortgage": 0.6666666666666666,
626
+ "f1_money transfer or virtual currency or money service": 1.0,
627
+ "f1_macro_ci_low": 0.548026288422304,
628
+ "f1_macro_ci_high": 0.8446388968731338,
629
+ "score_name": "f1_micro",
630
+ "score": 0.898989898989899,
631
+ "score_ci_high": 0.9447236180904522,
632
+ "score_ci_low": 0.8203180754561684,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.89,
635
+ "accuracy_ci_low": 0.81,
636
+ "accuracy_ci_high": 0.94,
637
+ "f1_micro": 0.898989898989899,
638
+ "f1_micro_ci_low": 0.8203180754561684,
639
+ "f1_micro_ci_high": 0.9447236180904522
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.8260549519130755,
643
+ "f1_mortgages and loans": 0.8695652173913043,
644
+ "f1_credit card": 0.8181818181818182,
645
+ "f1_debt collection": 0.7368421052631579,
646
+ "f1_credit reporting": 0.782608695652174,
647
+ "f1_retail banking": 0.9230769230769231,
648
+ "f1_macro_ci_low": 0.7025385602172309,
649
+ "f1_macro_ci_high": 0.9116966675085018,
650
+ "score_name": "f1_micro",
651
+ "score": 0.82,
652
+ "score_ci_high": 0.9,
653
+ "score_ci_low": 0.7,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.82,
656
+ "accuracy_ci_low": 0.7,
657
+ "accuracy_ci_high": 0.9,
658
+ "f1_micro": 0.82,
659
+ "f1_micro_ci_low": 0.7,
660
+ "f1_micro_ci_high": 0.9
661
+ },
662
+ "score": 0.8594949494949495,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.36,
670
+ "program_accuracy": 0.37,
671
+ "score": 0.37,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.27,
674
+ "execution_accuracy_ci_high": 0.47,
675
+ "program_accuracy_ci_low": 0.2811354521803329,
676
+ "program_accuracy_ci_high": 0.47,
677
+ "score_ci_low": 0.2811354521803329,
678
+ "score_ci_high": 0.47
679
+ },
680
+ "score": 0.37,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.4468480048160072,
687
+ "recall": 0.6454220314570971,
688
+ "f1": 0.4890230179217559,
689
+ "precision_ci_low": 0.4151579792947102,
690
+ "precision_ci_high": 0.4845603023826521,
691
+ "recall_ci_low": 0.6061330597384731,
692
+ "recall_ci_high": 0.6765859073642304,
693
+ "f1_ci_low": 0.46005008735451697,
694
+ "f1_ci_high": 0.5174377148355835,
695
+ "score_name": "f1",
696
+ "score": 0.4890230179217559,
697
+ "score_ci_high": 0.5174377148355835,
698
+ "score_ci_low": 0.46005008735451697,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.691885244846344,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7364352762699127,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6613857835531235,
703
+ "faithfullness_f1_token_overlap": 0.43039423317750836,
704
+ "faithfullness_recall_token_overlap": 0.35370116631248527,
705
+ "faithfullness_precision_token_overlap": 0.6790419566943843,
706
+ "correctness_f1_token_overlap": 0.4890230179217559,
707
+ "correctness_recall_token_overlap": 0.6454220314570971,
708
+ "correctness_precision_token_overlap": 0.4468480048160072
709
+ },
710
+ "score": 0.4890230179217559,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.62,
717
+ "accuracy_ci_low": 0.51,
718
+ "accuracy_ci_high": 0.7,
719
+ "score_name": "accuracy",
720
+ "score": 0.62,
721
+ "score_ci_high": 0.7,
722
+ "score_ci_low": 0.51,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.94,
727
+ "accuracy_ci_low": 0.89,
728
+ "accuracy_ci_high": 0.98,
729
+ "score_name": "accuracy",
730
+ "score": 0.94,
731
+ "score_ci_high": 0.98,
732
+ "score_ci_low": 0.89,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.78,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 0.0,
742
+ "severity_high": 0.0,
743
+ "severity_medium": 0.0,
744
+ "severity_low": 100.0,
745
+ "category_harmful_info": 0.8862442294401782,
746
+ "category_deception": 0.9089467666215367,
747
+ "category_explicit_content": 0.8660169005243467,
748
+ "category_violence": 0.8813284475474604,
749
+ "category_discrimination": 0.7921433370737802,
750
+ "category_substance_abuse": 0.892341727639238,
751
+ "category_pii": 0.8837605582343208,
752
+ "safety": 0.8784392809950642,
753
+ "safety_ci_low": 0.8622394069744699,
754
+ "safety_ci_high": 0.8922010585173956,
755
+ "score_name": "safety",
756
+ "score": 0.8784392809950642,
757
+ "score_ci_high": 0.8922010585173956,
758
+ "score_ci_low": 0.8622394069744699,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8784392809950642,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rougeLsum": 0.3334800194661833,
769
+ "rouge2": 0.1746055052600959,
770
+ "rouge1": 0.3914063192107438,
771
+ "rougeL": 0.2607874086538398,
772
+ "score": 0.2607874086538398,
773
+ "score_name": "rougeL",
774
+ "rougeLsum_ci_low": 0.31372911043552615,
775
+ "rougeLsum_ci_high": 0.3521588230191023,
776
+ "rouge2_ci_low": 0.16154343447204916,
777
+ "rouge2_ci_high": 0.18686015868835737,
778
+ "rouge1_ci_low": 0.36979472683613324,
779
+ "rouge1_ci_high": 0.41112417139895086,
780
+ "rougeL_ci_low": 0.24640816277583857,
781
+ "rougeL_ci_high": 0.2769615579591105,
782
+ "score_ci_low": 0.24640816277583857,
783
+ "score_ci_high": 0.2769615579591105
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rougeLsum": 0.08980268920854961,
788
+ "rouge2": 0.012765072464340065,
789
+ "rouge1": 0.10813972613064696,
790
+ "rougeL": 0.08042515106945874,
791
+ "score": 0.08042515106945874,
792
+ "score_name": "rougeL",
793
+ "rougeLsum_ci_low": 0.07759889049057624,
794
+ "rougeLsum_ci_high": 0.1032183831189881,
795
+ "rouge2_ci_low": 0.008976164023617238,
796
+ "rouge2_ci_high": 0.018103937102878734,
797
+ "rouge1_ci_low": 0.09248155246218394,
798
+ "rouge1_ci_high": 0.1250184504642972,
799
+ "rougeL_ci_low": 0.06959906992080947,
800
+ "rougeL_ci_high": 0.09223760457164616,
801
+ "score_ci_low": 0.06959906992080947,
802
+ "score_ci_high": 0.09223760457164616
803
+ },
804
+ "score": 0.17060627986164928,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 156,
813
+ 105,
814
+ 71,
815
+ 50
816
+ ],
817
+ "totals": [
818
+ 226,
819
+ 220,
820
+ 214,
821
+ 208
822
+ ],
823
+ "precisions": [
824
+ 0.6902654867256638,
825
+ 0.4772727272727273,
826
+ 0.3317757009345794,
827
+ 0.2403846153846154
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 226,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.4026090246453075,
833
+ "score": 0.4026090246453075,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.24025628806486451,
836
+ "score_ci_high": 0.5204212192812463,
837
+ "sacrebleu_ci_low": 0.24025628806486451,
838
+ "sacrebleu_ci_high": 0.5204212192812463
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 143,
844
+ 93,
845
+ 63,
846
+ 45
847
+ ],
848
+ "totals": [
849
+ 219,
850
+ 213,
851
+ 207,
852
+ 201
853
+ ],
854
+ "precisions": [
855
+ 0.6529680365296804,
856
+ 0.43661971830985913,
857
+ 0.30434782608695654,
858
+ 0.22388059701492538
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 219,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.3733322279107499,
864
+ "score": 0.3733322279107499,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.26773105555308707,
867
+ "score_ci_high": 0.5628718725153604,
868
+ "sacrebleu_ci_low": 0.26773105555308707,
869
+ "sacrebleu_ci_high": 0.5628718725153604
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 118,
875
+ 70,
876
+ 45,
877
+ 27
878
+ ],
879
+ "totals": [
880
+ 211,
881
+ 205,
882
+ 199,
883
+ 193
884
+ ],
885
+ "precisions": [
886
+ 0.5592417061611374,
887
+ 0.34146341463414637,
888
+ 0.22613065326633167,
889
+ 0.13989637305699482
890
+ ],
891
+ "bp": 1.0,
892
+ "sys_len": 211,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.27879013737554453,
895
+ "score": 0.27879013737554453,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.242285190601197,
898
+ "score_ci_high": 0.3370610364230129,
899
+ "sacrebleu_ci_low": 0.242285190601197,
900
+ "sacrebleu_ci_high": 0.3370610364230129
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 148,
906
+ 92,
907
+ 62,
908
+ 46
909
+ ],
910
+ "totals": [
911
+ 217,
912
+ 211,
913
+ 205,
914
+ 199
915
+ ],
916
+ "precisions": [
917
+ 0.6820276497695852,
918
+ 0.43601895734597157,
919
+ 0.3024390243902439,
920
+ 0.23115577889447236
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 217,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.3797191362653853,
926
+ "score": 0.3797191362653853,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.24991245830969483,
929
+ "score_ci_high": 0.537890541133251,
930
+ "sacrebleu_ci_low": 0.24991245830969483,
931
+ "sacrebleu_ci_high": 0.537890541133251
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 190,
937
+ 150,
938
+ 119,
939
+ 95
940
+ ],
941
+ "totals": [
942
+ 244,
943
+ 238,
944
+ 232,
945
+ 226
946
+ ],
947
+ "precisions": [
948
+ 0.7786885245901639,
949
+ 0.6302521008403361,
950
+ 0.5129310344827587,
951
+ 0.42035398230088494
952
+ ],
953
+ "bp": 1.0,
954
+ "sys_len": 244,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5703455465960385,
957
+ "score": 0.5703455465960385,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.4931990672262396,
960
+ "score_ci_high": 0.6753511173840523,
961
+ "sacrebleu_ci_low": 0.4931990672262396,
962
+ "sacrebleu_ci_high": 0.6753511173840523
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 162,
968
+ 92,
969
+ 63,
970
+ 43
971
+ ],
972
+ "totals": [
973
+ 273,
974
+ 267,
975
+ 261,
976
+ 255
977
+ ],
978
+ "precisions": [
979
+ 0.5934065934065934,
980
+ 0.3445692883895131,
981
+ 0.24137931034482757,
982
+ 0.16862745098039217
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 273,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.3020398964371346,
988
+ "score": 0.3020398964371346,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.23699249066795575,
991
+ "score_ci_high": 0.36104670705975184,
992
+ "sacrebleu_ci_low": 0.23699249066795575,
993
+ "sacrebleu_ci_high": 0.36104670705975184
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 181,
999
+ 136,
1000
+ 110,
1001
+ 89
1002
+ ],
1003
+ "totals": [
1004
+ 228,
1005
+ 222,
1006
+ 216,
1007
+ 210
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7938596491228069,
1011
+ 0.6126126126126126,
1012
+ 0.5092592592592592,
1013
+ 0.4238095238095238
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 228,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.5691933623646763,
1019
+ "score": 0.5691933623646763,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.5074329158262365,
1022
+ "score_ci_high": 0.6529930317459868,
1023
+ "sacrebleu_ci_low": 0.5074329158262365,
1024
+ "sacrebleu_ci_high": 0.6529930317459868
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 165,
1030
+ 125,
1031
+ 97,
1032
+ 82
1033
+ ],
1034
+ "totals": [
1035
+ 230,
1036
+ 224,
1037
+ 218,
1038
+ 212
1039
+ ],
1040
+ "precisions": [
1041
+ 0.717391304347826,
1042
+ 0.5580357142857143,
1043
+ 0.444954128440367,
1044
+ 0.38679245283018865
1045
+ ],
1046
+ "bp": 1.0,
1047
+ "sys_len": 230,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.5123335940057543,
1050
+ "score": 0.5123335940057543,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.3840244184926614,
1053
+ "score_ci_high": 0.650779271851941,
1054
+ "sacrebleu_ci_low": 0.3840244184926614,
1055
+ "sacrebleu_ci_high": 0.650779271851941
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 170,
1061
+ 109,
1062
+ 73,
1063
+ 47
1064
+ ],
1065
+ "totals": [
1066
+ 236,
1067
+ 230,
1068
+ 224,
1069
+ 218
1070
+ ],
1071
+ "precisions": [
1072
+ 0.7203389830508474,
1073
+ 0.47391304347826085,
1074
+ 0.32589285714285715,
1075
+ 0.21559633027522934
1076
+ ],
1077
+ "bp": 0.9707745538991623,
1078
+ "sys_len": 236,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.3820377957320921,
1081
+ "score": 0.3820377957320921,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.3085199315474183,
1084
+ "score_ci_high": 0.48773289703817563,
1085
+ "sacrebleu_ci_low": 0.3085199315474183,
1086
+ "sacrebleu_ci_high": 0.48773289703817563
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 158,
1092
+ 111,
1093
+ 79,
1094
+ 59
1095
+ ],
1096
+ "totals": [
1097
+ 225,
1098
+ 219,
1099
+ 213,
1100
+ 207
1101
+ ],
1102
+ "precisions": [
1103
+ 0.7022222222222223,
1104
+ 0.5068493150684932,
1105
+ 0.37089201877934275,
1106
+ 0.28502415458937197
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 225,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.44042366511701625,
1112
+ "score": 0.44042366511701625,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.341980662697259,
1115
+ "score_ci_high": 0.5496900259922087,
1116
+ "sacrebleu_ci_low": 0.341980662697259,
1117
+ "sacrebleu_ci_high": 0.5496900259922087
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 150,
1123
+ 90,
1124
+ 58,
1125
+ 39
1126
+ ],
1127
+ "totals": [
1128
+ 236,
1129
+ 230,
1130
+ 224,
1131
+ 218
1132
+ ],
1133
+ "precisions": [
1134
+ 0.635593220338983,
1135
+ 0.391304347826087,
1136
+ 0.2589285714285714,
1137
+ 0.17889908256880735
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 236,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.32762007434781915,
1143
+ "score": 0.32762007434781915,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.20337120135607298,
1146
+ "score_ci_high": 0.4231259198402102,
1147
+ "sacrebleu_ci_low": 0.20337120135607298,
1148
+ "sacrebleu_ci_high": 0.4231259198402102
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 135,
1154
+ 76,
1155
+ 45,
1156
+ 29
1157
+ ],
1158
+ "totals": [
1159
+ 226,
1160
+ 220,
1161
+ 214,
1162
+ 208
1163
+ ],
1164
+ "precisions": [
1165
+ 0.5973451327433628,
1166
+ 0.34545454545454546,
1167
+ 0.2102803738317757,
1168
+ 0.13942307692307693
1169
+ ],
1170
+ "bp": 1.0,
1171
+ "sys_len": 226,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.2788928697008729,
1174
+ "score": 0.2788928697008729,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.18358949678749556,
1177
+ "score_ci_high": 0.4387198667737409,
1178
+ "sacrebleu_ci_low": 0.18358949678749556,
1179
+ "sacrebleu_ci_high": 0.4387198667737409
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 168,
1185
+ 124,
1186
+ 93,
1187
+ 73
1188
+ ],
1189
+ "totals": [
1190
+ 221,
1191
+ 215,
1192
+ 209,
1193
+ 203
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7601809954751131,
1197
+ 0.5767441860465117,
1198
+ 0.4449760765550239,
1199
+ 0.35960591133004927
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 221,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.5146546836832124,
1205
+ "score": 0.5146546836832124,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.378423475348725,
1208
+ "score_ci_high": 0.6374865872050556,
1209
+ "sacrebleu_ci_low": 0.378423475348725,
1210
+ "sacrebleu_ci_high": 0.6374865872050556
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 166,
1216
+ 121,
1217
+ 91,
1218
+ 72
1219
+ ],
1220
+ "totals": [
1221
+ 227,
1222
+ 221,
1223
+ 215,
1224
+ 209
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7312775330396475,
1228
+ 0.5475113122171945,
1229
+ 0.4232558139534883,
1230
+ 0.34449760765550236
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 227,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.49154820843517344,
1236
+ "score": 0.49154820843517344,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.4117068924616905,
1239
+ "score_ci_high": 0.6461238952977133,
1240
+ "sacrebleu_ci_low": 0.4117068924616905,
1241
+ "sacrebleu_ci_high": 0.6461238952977133
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 150,
1247
+ 96,
1248
+ 65,
1249
+ 48
1250
+ ],
1251
+ "totals": [
1252
+ 219,
1253
+ 213,
1254
+ 207,
1255
+ 201
1256
+ ],
1257
+ "precisions": [
1258
+ 0.684931506849315,
1259
+ 0.45070422535211263,
1260
+ 0.3140096618357488,
1261
+ 0.23880597014925375
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 219,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.3900602917326207,
1267
+ "score": 0.3900602917326207,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.3529954080391477,
1270
+ "score_ci_high": 0.44601468942834244,
1271
+ "sacrebleu_ci_low": 0.3529954080391477,
1272
+ "sacrebleu_ci_high": 0.44601468942834244
1273
+ },
1274
+ "score": 0.41424003428995987,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.6413403122123419,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
results/bluebench/2025-08-03T12-21-28_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T16:21:24.530955Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/o3-mini-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/o3-mini-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.8888888888888888,
180
+ "accuracy_ci_low": 0.5310928992288233,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 0.8888888888888888,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.5310928992288233,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8888888888888888,
200
+ "accuracy_ci_low": 0.46041936253217447,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 0.8888888888888888,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.46041936253217447,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 1.0,
260
+ "accuracy_ci_low": 1.0,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 1.0,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 1.0,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 1.0,
270
+ "accuracy_ci_low": 1.0,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 1.0,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 1.0,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9797979797979798,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.3711340206185567,
296
+ "score": 0.3711340206185567,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.3711340206185567,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.8181818181818182,
307
+ "f1_Organization": 0.6428571428571429,
308
+ "f1_Location": 0.75,
309
+ "f1_macro": 0.737012987012987,
310
+ "recall_macro": 0.683488612836439,
311
+ "precision_macro": 0.8125,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7285714285714285,
314
+ "recall_micro": 0.68,
315
+ "precision_micro": 0.7846153846153846,
316
+ "score": 0.7285714285714285,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.5888115359844259,
319
+ "score_ci_high": 0.8223830090806191,
320
+ "f1_micro_ci_low": 0.5888115359844259,
321
+ "f1_micro_ci_high": 0.8223830090806191
322
+ },
323
+ "score": 0.7285714285714285,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.2857142857142857,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.7142857142857143,
342
+ "score_name": "accuracy",
343
+ "score": 0.2857142857142857,
344
+ "score_ci_high": 0.7142857142857143,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.0,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.0,
352
+ "score_name": "accuracy",
353
+ "score": 0.0,
354
+ "score_ci_high": 0.0,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 1.0,
360
+ "accuracy_ci_low": 1.0,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 1.0,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 1.0,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.42857142857142855,
380
+ "accuracy_ci_low": 0.14285714285714285,
381
+ "accuracy_ci_high": 0.8571428571428571,
382
+ "score_name": "accuracy",
383
+ "score": 0.42857142857142855,
384
+ "score_ci_high": 0.8571428571428571,
385
+ "score_ci_low": 0.14285714285714285,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.2857142857142857,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.7142857142857143,
392
+ "score_name": "accuracy",
393
+ "score": 0.2857142857142857,
394
+ "score_ci_high": 0.7142857142857143,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.14285714285714285,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.6807203593841678,
402
+ "score_name": "accuracy",
403
+ "score": 0.14285714285714285,
404
+ "score_ci_high": 0.6807203593841678,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 1.0,
420
+ "accuracy_ci_low": 1.0,
421
+ "accuracy_ci_high": 1.0,
422
+ "score_name": "accuracy",
423
+ "score": 1.0,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 1.0,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.14285714285714285,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.5714285714285714,
432
+ "score_name": "accuracy",
433
+ "score": 0.14285714285714285,
434
+ "score_ci_high": 0.5714285714285714,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.5714285714285714,
440
+ "accuracy_ci_low": 0.14285714285714285,
441
+ "accuracy_ci_high": 0.8571428571428571,
442
+ "score_name": "accuracy",
443
+ "score": 0.5714285714285714,
444
+ "score_ci_high": 0.8571428571428571,
445
+ "score_ci_low": 0.14285714285714285,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.2857142857142857,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.7142857142857143,
452
+ "score_name": "accuracy",
453
+ "score": 0.2857142857142857,
454
+ "score_ci_high": 0.7142857142857143,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.5,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.0,
475
+ "f1_suggestive": 0.0,
476
+ "f1_generic": 0.0,
477
+ "f1_fanciful": 0.0,
478
+ "f1_descriptive": 0.0,
479
+ "f1_arbitrary": 0.0,
480
+ "f1_macro_ci_low": 0.0,
481
+ "f1_macro_ci_high": 0.0,
482
+ "score_name": "f1_micro",
483
+ "score": 0.0,
484
+ "score_ci_high": 0.0,
485
+ "score_ci_low": 0.0,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.0,
488
+ "accuracy_ci_low": 0.0,
489
+ "accuracy_ci_high": 0.0,
490
+ "f1_micro": 0.0,
491
+ "f1_micro_ci_low": 0.0,
492
+ "f1_micro_ci_high": 0.0
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.47619047619047616,
496
+ "f1_no": 0.6666666666666666,
497
+ "f1_yes": 0.2857142857142857,
498
+ "f1_macro_ci_low": 0.2857142857142857,
499
+ "f1_macro_ci_high": 0.8813336459688916,
500
+ "score_name": "f1_micro",
501
+ "score": 0.5806451612903226,
502
+ "score_ci_high": 0.7878787878787878,
503
+ "score_ci_low": 0.3333333333333333,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.45,
506
+ "accuracy_ci_low": 0.25,
507
+ "accuracy_ci_high": 0.7,
508
+ "f1_micro": 0.5806451612903226,
509
+ "f1_micro_ci_low": 0.3333333333333333,
510
+ "f1_micro_ci_high": 0.7878787878787878
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.20272108843537415,
514
+ "f1_conclusion": 0.0,
515
+ "f1_decree": 0.0,
516
+ "f1_issue": 0.2857142857142857,
517
+ "f1_analysis": 0.8,
518
+ "f1_facts": 0.3333333333333333,
519
+ "f1_procedural history": 0.0,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.05,
522
+ "f1_macro_ci_high": 0.3627437138685071,
523
+ "score_name": "f1_micro",
524
+ "score": 0.27586206896551724,
525
+ "score_ci_high": 0.5223537291196929,
526
+ "score_ci_low": 0.07650831228694685,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.2,
529
+ "accuracy_ci_low": 0.05,
530
+ "accuracy_ci_high": 0.40138012181413957,
531
+ "f1_micro": 0.27586206896551724,
532
+ "f1_micro_ci_low": 0.07650831228694685,
533
+ "f1_micro_ci_high": 0.5223537291196929
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.3666666666666667,
537
+ "f1_yes": 0.3333333333333333,
538
+ "f1_no": 0.4,
539
+ "f1_macro_ci_low": 0.16690480249191558,
540
+ "f1_macro_ci_high": 0.6427398020907653,
541
+ "score_name": "f1_micro",
542
+ "score": 0.37037037037037035,
543
+ "score_ci_high": 0.625,
544
+ "score_ci_low": 0.16,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.25,
547
+ "accuracy_ci_low": 0.1,
548
+ "accuracy_ci_high": 0.5,
549
+ "f1_micro": 0.37037037037037035,
550
+ "f1_micro_ci_low": 0.16,
551
+ "f1_micro_ci_high": 0.625
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.660633484162896,
555
+ "f1_yes": 0.6153846153846154,
556
+ "f1_no": 0.7058823529411765,
557
+ "f1_macro_ci_low": 0.40467435105346683,
558
+ "f1_macro_ci_high": 0.8261376660890378,
559
+ "score_name": "f1_micro",
560
+ "score": 0.6666666666666666,
561
+ "score_ci_high": 0.8235294117647058,
562
+ "score_ci_low": 0.46153846153846156,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.5,
565
+ "accuracy_ci_low": 0.3,
566
+ "accuracy_ci_high": 0.7,
567
+ "f1_micro": 0.6666666666666666,
568
+ "f1_micro_ci_low": 0.46153846153846156,
569
+ "f1_micro_ci_high": 0.8235294117647058
570
+ },
571
+ "score": 0.37870885345857536,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.38195526695526694,
578
+ "f1_cars": 0.5714285714285714,
579
+ "f1_windows x": 0.5714285714285714,
580
+ "f1_computer graphics": 0.5454545454545454,
581
+ "f1_atheism": 0.0,
582
+ "f1_christianity": 0.2857142857142857,
583
+ "f1_religion": 0.0,
584
+ "f1_medicine": 0.6666666666666666,
585
+ "f1_microsoft windows": 0.5,
586
+ "f1_middle east": 0.2857142857142857,
587
+ "f1_motorcycles": 0.25,
588
+ "f1_for sale": 0.5714285714285714,
589
+ "f1_pc hardware": 0.2222222222222222,
590
+ "f1_mac hardware": 0.8,
591
+ "f1_guns": 0.2857142857142857,
592
+ "f1_politics": 0.5,
593
+ "f1_space": 0.75,
594
+ "f1_cryptography": 0.0,
595
+ "f1_baseball": 0.5,
596
+ "f1_hockey": 0.3333333333333333,
597
+ "f1_electronics": 0.0,
598
+ "f1_macro_ci_low": 0.3045532013795943,
599
+ "f1_macro_ci_high": 0.48572662880615863,
600
+ "score_name": "f1_micro",
601
+ "score": 0.4305555555555556,
602
+ "score_ci_high": 0.5281645512463683,
603
+ "score_ci_low": 0.31327005378302275,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.31,
606
+ "accuracy_ci_low": 0.22,
607
+ "accuracy_ci_high": 0.4,
608
+ "f1_micro": 0.4305555555555556,
609
+ "f1_micro_ci_low": 0.31327005378302275,
610
+ "f1_micro_ci_high": 0.5281645512463683
611
+ },
612
+ "score": 0.4305555555555556,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.8944327731092437,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9411764705882353,
620
+ "f1_debt collection": 0.6666666666666666,
621
+ "f1_payday loan or title loan or personal loan": 1.0,
622
+ "f1_credit card or prepaid card": 0.8571428571428571,
623
+ "f1_student loan": 0.8333333333333334,
624
+ "f1_checking or savings account": 1.0,
625
+ "f1_mortgage": 0.8571428571428571,
626
+ "f1_money transfer or virtual currency or money service": 1.0,
627
+ "f1_macro_ci_low": 0.806614365464373,
628
+ "f1_macro_ci_high": 0.9657999668768317,
629
+ "score_name": "f1_micro",
630
+ "score": 0.9090909090909091,
631
+ "score_ci_high": 0.9547738693467337,
632
+ "score_ci_low": 0.8379092938797416,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.9,
635
+ "accuracy_ci_low": 0.83,
636
+ "accuracy_ci_high": 0.95,
637
+ "f1_micro": 0.9090909090909091,
638
+ "f1_micro_ci_low": 0.8379092938797416,
639
+ "f1_micro_ci_high": 0.9547738693467337
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.8162680456798105,
643
+ "f1_mortgages and loans": 0.7619047619047619,
644
+ "f1_credit card": 0.8571428571428571,
645
+ "f1_debt collection": 0.7058823529411765,
646
+ "f1_credit reporting": 0.8333333333333334,
647
+ "f1_retail banking": 0.9230769230769231,
648
+ "f1_macro_ci_low": 0.7016062262967031,
649
+ "f1_macro_ci_high": 0.9223789564935859,
650
+ "score_name": "f1_micro",
651
+ "score": 0.8125,
652
+ "score_ci_high": 0.9072164948453608,
653
+ "score_ci_low": 0.6956521739130435,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.78,
656
+ "accuracy_ci_low": 0.66,
657
+ "accuracy_ci_high": 0.88,
658
+ "f1_micro": 0.8125,
659
+ "f1_micro_ci_low": 0.6956521739130435,
660
+ "f1_micro_ci_high": 0.9072164948453608
661
+ },
662
+ "score": 0.8607954545454546,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "program_accuracy": 0.23,
670
+ "score": 0.23,
671
+ "score_name": "program_accuracy",
672
+ "execution_accuracy": 0.2,
673
+ "program_accuracy_ci_low": 0.16,
674
+ "program_accuracy_ci_high": 0.32,
675
+ "score_ci_low": 0.16,
676
+ "score_ci_high": 0.32,
677
+ "execution_accuracy_ci_low": 0.13,
678
+ "execution_accuracy_ci_high": 0.29
679
+ },
680
+ "score": 0.23,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.4390234149184736,
687
+ "recall": 0.5262161167968094,
688
+ "f1": 0.4405810035203588,
689
+ "precision_ci_low": 0.40326834159222774,
690
+ "precision_ci_high": 0.47447671580848416,
691
+ "recall_ci_low": 0.4871121402499137,
692
+ "recall_ci_high": 0.5624733915163893,
693
+ "f1_ci_low": 0.40997461940491314,
694
+ "f1_ci_high": 0.469781968797013,
695
+ "score_name": "f1",
696
+ "score": 0.4405810035203588,
697
+ "score_ci_high": 0.469781968797013,
698
+ "score_ci_low": 0.40997461940491314,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6758063852787017,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6950149410963058,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6671330830454827,
703
+ "faithfullness_f1_token_overlap": 0.3422892513241904,
704
+ "faithfullness_recall_token_overlap": 0.25990022331997953,
705
+ "faithfullness_precision_token_overlap": 0.6239090003430506,
706
+ "correctness_f1_token_overlap": 0.4405810035203588,
707
+ "correctness_recall_token_overlap": 0.5262161167968094,
708
+ "correctness_precision_token_overlap": 0.4390234149184736
709
+ },
710
+ "score": 0.4405810035203588,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.53,
717
+ "accuracy_ci_low": 0.43,
718
+ "accuracy_ci_high": 0.62,
719
+ "score_name": "accuracy",
720
+ "score": 0.53,
721
+ "score_ci_high": 0.62,
722
+ "score_ci_low": 0.43,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.93,
727
+ "accuracy_ci_low": 0.86,
728
+ "accuracy_ci_high": 0.97,
729
+ "score_name": "accuracy",
730
+ "score": 0.93,
731
+ "score_ci_high": 0.97,
732
+ "score_ci_low": 0.86,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.73,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 4.0,
742
+ "severity_high": 4.0,
743
+ "severity_medium": 0.0,
744
+ "severity_low": 92.0,
745
+ "category_harmful_info": 0.7918323914683054,
746
+ "category_deception": 0.8392233636343118,
747
+ "category_explicit_content": 0.7774655876135586,
748
+ "category_violence": 0.8427019776953828,
749
+ "category_discrimination": 0.6616657735809447,
750
+ "category_substance_abuse": 0.775272503743569,
751
+ "category_pii": 0.883132066163752,
752
+ "safety": 0.8060687377645323,
753
+ "safety_ci_low": 0.7640155436260707,
754
+ "safety_ci_high": 0.8341465976524678,
755
+ "score_name": "safety",
756
+ "score": 0.8060687377645323,
757
+ "score_ci_high": 0.8341465976524678,
758
+ "score_ci_low": 0.7640155436260707,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8060687377645323,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rouge2": 0.15402435010536653,
769
+ "rougeLsum": 0.31968597390653825,
770
+ "rouge1": 0.3880965291674629,
771
+ "rougeL": 0.2524394530234589,
772
+ "score": 0.2524394530234589,
773
+ "score_name": "rougeL",
774
+ "rouge2_ci_low": 0.14130666076526238,
775
+ "rouge2_ci_high": 0.16860889298392745,
776
+ "rougeLsum_ci_low": 0.3016069005629268,
777
+ "rougeLsum_ci_high": 0.33979038219304886,
778
+ "rouge1_ci_low": 0.3688926041108984,
779
+ "rouge1_ci_high": 0.40886117544130285,
780
+ "rougeL_ci_low": 0.23942067032602865,
781
+ "rougeL_ci_high": 0.2682386978258959,
782
+ "score_ci_low": 0.23942067032602865,
783
+ "score_ci_high": 0.2682386978258959
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rouge2": 0.01358931671878944,
788
+ "rougeLsum": 0.08805558214582762,
789
+ "rouge1": 0.11011699084176203,
790
+ "rougeL": 0.08082291263637024,
791
+ "score": 0.08082291263637024,
792
+ "score_name": "rougeL",
793
+ "rouge2_ci_low": 0.009687205104610215,
794
+ "rouge2_ci_high": 0.018802588392340995,
795
+ "rougeLsum_ci_low": 0.07817975698587638,
796
+ "rougeLsum_ci_high": 0.09961952144610982,
797
+ "rouge1_ci_low": 0.096196695824933,
798
+ "rouge1_ci_high": 0.12633882653954673,
799
+ "rougeL_ci_low": 0.07153795581984226,
800
+ "rougeL_ci_high": 0.09127045045673031,
801
+ "score_ci_low": 0.07153795581984226,
802
+ "score_ci_high": 0.09127045045673031
803
+ },
804
+ "score": 0.16663118282991457,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 137,
813
+ 83,
814
+ 56,
815
+ 41
816
+ ],
817
+ "totals": [
818
+ 211,
819
+ 205,
820
+ 199,
821
+ 193
822
+ ],
823
+ "precisions": [
824
+ 0.6492890995260663,
825
+ 0.4048780487804878,
826
+ 0.2814070351758794,
827
+ 0.21243523316062177
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 211,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.3540633387626259,
833
+ "score": 0.3540633387626259,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.188995320312599,
836
+ "score_ci_high": 0.4781846360393117,
837
+ "sacrebleu_ci_low": 0.188995320312599,
838
+ "sacrebleu_ci_high": 0.4781846360393117
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 132,
844
+ 74,
845
+ 41,
846
+ 23
847
+ ],
848
+ "totals": [
849
+ 216,
850
+ 210,
851
+ 204,
852
+ 198
853
+ ],
854
+ "precisions": [
855
+ 0.6111111111111112,
856
+ 0.3523809523809524,
857
+ 0.20098039215686275,
858
+ 0.11616161616161616
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 216,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.2662791948025941,
864
+ "score": 0.2662791948025941,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.17615641560459036,
867
+ "score_ci_high": 0.37265924807285117,
868
+ "sacrebleu_ci_low": 0.17615641560459036,
869
+ "sacrebleu_ci_high": 0.37265924807285117
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 128,
875
+ 76,
876
+ 43,
877
+ 24
878
+ ],
879
+ "totals": [
880
+ 201,
881
+ 195,
882
+ 189,
883
+ 183
884
+ ],
885
+ "precisions": [
886
+ 0.6368159203980099,
887
+ 0.3897435897435897,
888
+ 0.2275132275132275,
889
+ 0.13114754098360656
890
+ ],
891
+ "bp": 0.960980660057086,
892
+ "sys_len": 201,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.28190616374550787,
895
+ "score": 0.28190616374550787,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.19630579725874525,
898
+ "score_ci_high": 0.3918951959792066,
899
+ "sacrebleu_ci_low": 0.19630579725874525,
900
+ "sacrebleu_ci_high": 0.3918951959792066
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 139,
906
+ 89,
907
+ 60,
908
+ 46
909
+ ],
910
+ "totals": [
911
+ 222,
912
+ 216,
913
+ 210,
914
+ 204
915
+ ],
916
+ "precisions": [
917
+ 0.6261261261261262,
918
+ 0.41203703703703703,
919
+ 0.28571428571428575,
920
+ 0.22549019607843138
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 222,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.3590578493818958,
926
+ "score": 0.3590578493818958,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.22672780625357206,
929
+ "score_ci_high": 0.48655355879263695,
930
+ "sacrebleu_ci_low": 0.22672780625357206,
931
+ "sacrebleu_ci_high": 0.48655355879263695
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 188,
937
+ 147,
938
+ 115,
939
+ 90
940
+ ],
941
+ "totals": [
942
+ 231,
943
+ 225,
944
+ 219,
945
+ 213
946
+ ],
947
+ "precisions": [
948
+ 0.8138528138528138,
949
+ 0.6533333333333333,
950
+ 0.5251141552511416,
951
+ 0.4225352112676056
952
+ ],
953
+ "bp": 0.9828330432930387,
954
+ "sys_len": 231,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5760087471924777,
957
+ "score": 0.5760087471924777,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.4220699958012471,
960
+ "score_ci_high": 0.733033345465493,
961
+ "sacrebleu_ci_low": 0.4220699958012471,
962
+ "sacrebleu_ci_high": 0.733033345465493
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 159,
968
+ 85,
969
+ 54,
970
+ 35
971
+ ],
972
+ "totals": [
973
+ 277,
974
+ 271,
975
+ 265,
976
+ 259
977
+ ],
978
+ "precisions": [
979
+ 0.5740072202166064,
980
+ 0.31365313653136534,
981
+ 0.2037735849056604,
982
+ 0.13513513513513514
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 277,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.26535103691316425,
988
+ "score": 0.26535103691316425,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.2050827299500949,
991
+ "score_ci_high": 0.3242346639521402,
992
+ "sacrebleu_ci_low": 0.2050827299500949,
993
+ "sacrebleu_ci_high": 0.3242346639521402
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 175,
999
+ 131,
1000
+ 104,
1001
+ 81
1002
+ ],
1003
+ "totals": [
1004
+ 232,
1005
+ 226,
1006
+ 220,
1007
+ 214
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7543103448275862,
1011
+ 0.5796460176991151,
1012
+ 0.4727272727272727,
1013
+ 0.37850467289719625
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 232,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.5288697242857515,
1019
+ "score": 0.5288697242857515,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.47846923363851745,
1022
+ "score_ci_high": 0.6139001136679906,
1023
+ "sacrebleu_ci_low": 0.47846923363851745,
1024
+ "sacrebleu_ci_high": 0.6139001136679906
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 164,
1030
+ 113,
1031
+ 83,
1032
+ 62
1033
+ ],
1034
+ "totals": [
1035
+ 228,
1036
+ 222,
1037
+ 216,
1038
+ 210
1039
+ ],
1040
+ "precisions": [
1041
+ 0.7192982456140351,
1042
+ 0.509009009009009,
1043
+ 0.38425925925925924,
1044
+ 0.29523809523809524
1045
+ ],
1046
+ "bp": 0.9912664313028773,
1047
+ "sys_len": 228,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.44750531811271016,
1050
+ "score": 0.44750531811271016,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.32949704407129193,
1053
+ "score_ci_high": 0.5998604762359077,
1054
+ "sacrebleu_ci_low": 0.32949704407129193,
1055
+ "sacrebleu_ci_high": 0.5998604762359077
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 157,
1061
+ 97,
1062
+ 65,
1063
+ 43
1064
+ ],
1065
+ "totals": [
1066
+ 226,
1067
+ 220,
1068
+ 214,
1069
+ 208
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6946902654867256,
1073
+ 0.4409090909090909,
1074
+ 0.3037383177570094,
1075
+ 0.20673076923076925
1076
+ ],
1077
+ "bp": 0.9275382560481537,
1078
+ "sys_len": 226,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.34541649552517106,
1081
+ "score": 0.34541649552517106,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.2833834978695395,
1084
+ "score_ci_high": 0.43980007583789943,
1085
+ "sacrebleu_ci_low": 0.2833834978695395,
1086
+ "sacrebleu_ci_high": 0.43980007583789943
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 156,
1092
+ 104,
1093
+ 71,
1094
+ 49
1095
+ ],
1096
+ "totals": [
1097
+ 219,
1098
+ 213,
1099
+ 207,
1100
+ 201
1101
+ ],
1102
+ "precisions": [
1103
+ 0.7123287671232876,
1104
+ 0.48826291079812206,
1105
+ 0.3429951690821256,
1106
+ 0.24378109452736318
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 219,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.4129576932882607,
1112
+ "score": 0.4129576932882607,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.29705055420750553,
1115
+ "score_ci_high": 0.5590420438259449,
1116
+ "sacrebleu_ci_low": 0.29705055420750553,
1117
+ "sacrebleu_ci_high": 0.5590420438259449
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 120,
1123
+ 69,
1124
+ 42,
1125
+ 29
1126
+ ],
1127
+ "totals": [
1128
+ 182,
1129
+ 177,
1130
+ 172,
1131
+ 167
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6593406593406592,
1135
+ 0.3898305084745763,
1136
+ 0.24418604651162792,
1137
+ 0.17365269461077845
1138
+ ],
1139
+ "bp": 0.8668778997501817,
1140
+ "sys_len": 182,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.28009473432383397,
1143
+ "score": 0.28009473432383397,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.12663787459170767,
1146
+ "score_ci_high": 0.3726473304411957,
1147
+ "sacrebleu_ci_low": 0.12663787459170767,
1148
+ "sacrebleu_ci_high": 0.3726473304411957
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 122,
1154
+ 69,
1155
+ 41,
1156
+ 27
1157
+ ],
1158
+ "totals": [
1159
+ 191,
1160
+ 186,
1161
+ 181,
1162
+ 176
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6387434554973822,
1166
+ 0.3709677419354838,
1167
+ 0.2265193370165746,
1168
+ 0.1534090909090909
1169
+ ],
1170
+ "bp": 0.9148407838195897,
1171
+ "sys_len": 191,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.2755816298698519,
1174
+ "score": 0.2755816298698519,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.1550375519752376,
1177
+ "score_ci_high": 0.4298533640603147,
1178
+ "sacrebleu_ci_low": 0.1550375519752376,
1179
+ "sacrebleu_ci_high": 0.4298533640603147
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 157,
1185
+ 114,
1186
+ 82,
1187
+ 59
1188
+ ],
1189
+ "totals": [
1190
+ 217,
1191
+ 211,
1192
+ 205,
1193
+ 199
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7235023041474655,
1197
+ 0.5402843601895735,
1198
+ 0.4,
1199
+ 0.2964824120603015
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 217,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.464013173269593,
1205
+ "score": 0.464013173269593,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.2908039757752357,
1208
+ "score_ci_high": 0.5973824527832076,
1209
+ "sacrebleu_ci_low": 0.2908039757752357,
1210
+ "sacrebleu_ci_high": 0.5973824527832076
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 165,
1216
+ 116,
1217
+ 84,
1218
+ 59
1219
+ ],
1220
+ "totals": [
1221
+ 228,
1222
+ 222,
1223
+ 216,
1224
+ 210
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7236842105263157,
1228
+ 0.5225225225225225,
1229
+ 0.38888888888888884,
1230
+ 0.28095238095238095
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 228,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.4508458651239866,
1236
+ "score": 0.4508458651239866,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.31931249774136927,
1239
+ "score_ci_high": 0.5112780455453768,
1240
+ "sacrebleu_ci_low": 0.31931249774136927,
1241
+ "sacrebleu_ci_high": 0.5112780455453768
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 152,
1247
+ 96,
1248
+ 59,
1249
+ 39
1250
+ ],
1251
+ "totals": [
1252
+ 220,
1253
+ 214,
1254
+ 208,
1255
+ 202
1256
+ ],
1257
+ "precisions": [
1258
+ 0.6909090909090909,
1259
+ 0.4485981308411215,
1260
+ 0.28365384615384615,
1261
+ 0.19306930693069307
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 220,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.3609483578130376,
1267
+ "score": 0.3609483578130376,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.2927054521124969,
1270
+ "score_ci_high": 0.3901296105157999,
1271
+ "sacrebleu_ci_low": 0.2927054521124969,
1272
+ "sacrebleu_ci_high": 0.3901296105157999
1273
+ },
1274
+ "score": 0.37792662149403083,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.5385208337043375,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
results/bluebench/2025-08-03T14-35-25_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T18:35:20.055545Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/o1-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/o1-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 1.0,
260
+ "accuracy_ci_low": 1.0,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 1.0,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 1.0,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.4444444444444444,
270
+ "accuracy_ci_low": 0.1111111111111111,
271
+ "accuracy_ci_high": 0.7777777777777778,
272
+ "score_name": "accuracy",
273
+ "score": 0.4444444444444444,
274
+ "score_ci_high": 0.7777777777777778,
275
+ "score_ci_low": 0.1111111111111111,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9494949494949495,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.34375,
296
+ "score": 0.34375,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.34375,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.8695652173913043,
307
+ "f1_Organization": 0.7868852459016394,
308
+ "f1_Location": 0.723404255319149,
309
+ "f1_macro": 0.7932849062040309,
310
+ "recall_macro": 0.8116804692891649,
311
+ "precision_macro": 0.7786561264822135,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7922077922077922,
314
+ "recall_micro": 0.8133333333333334,
315
+ "precision_micro": 0.7721518987341772,
316
+ "score": 0.7922077922077922,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.7030169722972905,
319
+ "score_ci_high": 0.848370061994058,
320
+ "f1_micro_ci_low": 0.7030169722972905,
321
+ "f1_micro_ci_high": 0.848370061994058
322
+ },
323
+ "score": 0.7922077922077922,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.8571428571428571,
330
+ "accuracy_ci_low": 0.42857142857142855,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.8571428571428571,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.42857142857142855,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.8571428571428571,
340
+ "accuracy_ci_low": 0.2530277506117974,
341
+ "accuracy_ci_high": 1.0,
342
+ "score_name": "accuracy",
343
+ "score": 0.8571428571428571,
344
+ "score_ci_high": 1.0,
345
+ "score_ci_low": 0.2530277506117974,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2857142857142857,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.7142857142857143,
352
+ "score_name": "accuracy",
353
+ "score": 0.2857142857142857,
354
+ "score_ci_high": 0.7142857142857143,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.8571428571428571,
360
+ "accuracy_ci_low": 0.42857142857142855,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.8571428571428571,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.42857142857142855,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.8571428571428571,
370
+ "accuracy_ci_low": 0.42857142857142855,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.8571428571428571,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.42857142857142855,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.7142857142857143,
380
+ "accuracy_ci_low": 0.2857142857142857,
381
+ "accuracy_ci_high": 1.0,
382
+ "score_name": "accuracy",
383
+ "score": 0.7142857142857143,
384
+ "score_ci_high": 1.0,
385
+ "score_ci_low": 0.2857142857142857,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
+ "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 1.0,
420
+ "accuracy_ci_low": 1.0,
421
+ "accuracy_ci_high": 1.0,
422
+ "score_name": "accuracy",
423
+ "score": 1.0,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 1.0,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.7142857142857143,
430
+ "accuracy_ci_low": 0.2857142857142857,
431
+ "accuracy_ci_high": 1.0,
432
+ "score_name": "accuracy",
433
+ "score": 0.7142857142857143,
434
+ "score_ci_high": 1.0,
435
+ "score_ci_low": 0.2857142857142857,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.7142857142857143,
440
+ "accuracy_ci_low": 0.2857142857142857,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.7142857142857143,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2857142857142857,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.7142857142857143,
450
+ "accuracy_ci_low": 0.2857142857142857,
451
+ "accuracy_ci_high": 1.0,
452
+ "score_name": "accuracy",
453
+ "score": 0.7142857142857143,
454
+ "score_ci_high": 1.0,
455
+ "score_ci_low": 0.2857142857142857,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.8571428571428571,
460
+ "accuracy_ci_low": 0.2530277506117974,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.8571428571428571,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2530277506117974,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.7142857142857143,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.3633333333333333,
475
+ "f1_suggestive": 0.25,
476
+ "f1_generic": 0.5,
477
+ "f1_fanciful": 0.4,
478
+ "f1_descriptive": 0.6666666666666666,
479
+ "f1_arbitrary": 0.0,
480
+ "f1_macro_ci_low": 0.18019056979170386,
481
+ "f1_macro_ci_high": 0.6700601186500376,
482
+ "score_name": "f1_micro",
483
+ "score": 0.38461538461538464,
484
+ "score_ci_high": 0.6317641031035699,
485
+ "score_ci_low": 0.16,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.25,
488
+ "accuracy_ci_low": 0.1,
489
+ "accuracy_ci_high": 0.5,
490
+ "f1_micro": 0.38461538461538464,
491
+ "f1_micro_ci_low": 0.16,
492
+ "f1_micro_ci_high": 0.6317641031035699
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.17647058823529413,
496
+ "f1_no": 0.35294117647058826,
497
+ "f1_yes": 0.0,
498
+ "f1_macro_ci_low": 0.0625,
499
+ "f1_macro_ci_high": 0.34407383963381494,
500
+ "score_name": "f1_micro",
501
+ "score": 0.2608695652173913,
502
+ "score_ci_high": 0.5714285714285714,
503
+ "score_ci_low": 0.09523809523809523,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.15,
506
+ "accuracy_ci_low": 0.05,
507
+ "accuracy_ci_high": 0.4,
508
+ "f1_micro": 0.2608695652173913,
509
+ "f1_micro_ci_low": 0.09523809523809523,
510
+ "f1_micro_ci_high": 0.5714285714285714
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.14285714285714285,
514
+ "f1_conclusion": 0.0,
515
+ "f1_issue": 0.25,
516
+ "f1_decree": 0.0,
517
+ "f1_rule": 0.0,
518
+ "f1_analysis": 0.0,
519
+ "f1_facts": 0.75,
520
+ "f1_procedural history": 0.0,
521
+ "f1_macro_ci_low": 0.027938635003925405,
522
+ "f1_macro_ci_high": 0.25,
523
+ "score_name": "f1_micro",
524
+ "score": 0.23529411764705882,
525
+ "score_ci_high": 0.451025257708528,
526
+ "score_ci_low": 0.058823529411764705,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.2,
529
+ "accuracy_ci_low": 0.05,
530
+ "accuracy_ci_high": 0.4,
531
+ "f1_micro": 0.23529411764705882,
532
+ "f1_micro_ci_low": 0.058823529411764705,
533
+ "f1_micro_ci_high": 0.451025257708528
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.7013888888888888,
537
+ "f1_yes": 0.7777777777777778,
538
+ "f1_no": 0.625,
539
+ "f1_macro_ci_low": 0.4822715139171238,
540
+ "f1_macro_ci_high": 0.8740955338427088,
541
+ "score_name": "f1_micro",
542
+ "score": 0.7058823529411765,
543
+ "score_ci_high": 0.8648648648648649,
544
+ "score_ci_low": 0.4827586206896552,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.6,
547
+ "accuracy_ci_low": 0.4,
548
+ "accuracy_ci_high": 0.8,
549
+ "f1_micro": 0.7058823529411765,
550
+ "f1_micro_ci_low": 0.4827586206896552,
551
+ "f1_micro_ci_high": 0.8648648648648649
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.9705882352941176,
555
+ "f1_yes": 0.9411764705882353,
556
+ "f1_no": 1.0,
557
+ "f1_macro_ci_low": 0.8333333333333333,
558
+ "f1_macro_ci_high": 1.0,
559
+ "score_name": "f1_micro",
560
+ "score": 0.9743589743589743,
561
+ "score_ci_high": 1.0,
562
+ "score_ci_low": 0.8571428571428571,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.95,
565
+ "accuracy_ci_low": 0.75,
566
+ "accuracy_ci_high": 1.0,
567
+ "f1_micro": 0.9743589743589743,
568
+ "f1_micro_ci_low": 0.8571428571428571,
569
+ "f1_micro_ci_high": 1.0
570
+ },
571
+ "score": 0.5122040789559972,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6497871572871573,
578
+ "f1_cars": 0.9090909090909091,
579
+ "f1_windows x": 0.75,
580
+ "f1_computer graphics": 0.7142857142857143,
581
+ "f1_atheism": 0.2857142857142857,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_christianity": 0.8571428571428571,
585
+ "f1_microsoft windows": 0.8,
586
+ "f1_middle east": 0.6666666666666666,
587
+ "f1_motorcycles": 0.4444444444444444,
588
+ "f1_pc hardware": 0.7142857142857143,
589
+ "f1_mac hardware": 1.0,
590
+ "f1_electronics": 0.4,
591
+ "f1_for sale": 0.75,
592
+ "f1_guns": 0.4444444444444444,
593
+ "f1_space": 0.75,
594
+ "f1_cryptography": 0.4,
595
+ "f1_baseball": 1.0,
596
+ "f1_hockey": 0.8888888888888888,
597
+ "f1_politics": 0.36363636363636365,
598
+ "f1_macro_ci_low": 0.5620624688463499,
599
+ "f1_macro_ci_high": 0.7570253093572227,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6815642458100558,
602
+ "score_ci_high": 0.7640449438202247,
603
+ "score_ci_low": 0.5781960812529161,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.61,
606
+ "accuracy_ci_low": 0.51,
607
+ "accuracy_ci_high": 0.7,
608
+ "f1_micro": 0.6815642458100558,
609
+ "f1_micro_ci_low": 0.5781960812529161,
610
+ "f1_micro_ci_high": 0.7640449438202247
611
+ },
612
+ "score": 0.6815642458100558,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7978772331056343,
619
+ "f1_debt collection": 0.9090909090909091,
620
+ "f1_checking or savings account": 0.631578947368421,
621
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8867924528301887,
622
+ "f1_mortgage": 0.6666666666666666,
623
+ "f1_credit card or prepaid card": 0.8,
624
+ "f1_payday loan or title loan or personal loan": 0.8,
625
+ "f1_student loan": 0.8888888888888888,
626
+ "f1_money transfer or virtual currency or money service": 0.8,
627
+ "f1_macro_ci_low": 0.6793031276678277,
628
+ "f1_macro_ci_high": 0.8893682569671233,
629
+ "score_name": "f1_micro",
630
+ "score": 0.84375,
631
+ "score_ci_high": 0.9035402051084471,
632
+ "score_ci_low": 0.7562033534601807,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.81,
635
+ "accuracy_ci_low": 0.72,
636
+ "accuracy_ci_high": 0.88,
637
+ "f1_micro": 0.84375,
638
+ "f1_micro_ci_low": 0.7562033534601807,
639
+ "f1_micro_ci_high": 0.9035402051084471
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.8289604115691072,
643
+ "f1_mortgages and loans": 0.782608695652174,
644
+ "f1_credit card": 0.8571428571428571,
645
+ "f1_debt collection": 0.7777777777777778,
646
+ "f1_credit reporting": 0.7272727272727273,
647
+ "f1_retail banking": 1.0,
648
+ "f1_macro_ci_low": 0.7098713502417807,
649
+ "f1_macro_ci_high": 0.9150602383003535,
650
+ "score_name": "f1_micro",
651
+ "score": 0.8163265306122449,
652
+ "score_ci_high": 0.9072164948453608,
653
+ "score_ci_low": 0.6839680616837706,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.8,
656
+ "accuracy_ci_low": 0.66,
657
+ "accuracy_ci_high": 0.9,
658
+ "f1_micro": 0.8163265306122449,
659
+ "f1_micro_ci_low": 0.6839680616837706,
660
+ "f1_micro_ci_high": 0.9072164948453608
661
+ },
662
+ "score": 0.8300382653061225,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "program_accuracy": 0.22,
670
+ "score": 0.22,
671
+ "score_name": "program_accuracy",
672
+ "execution_accuracy": 0.22,
673
+ "program_accuracy_ci_low": 0.15,
674
+ "program_accuracy_ci_high": 0.3,
675
+ "score_ci_low": 0.15,
676
+ "score_ci_high": 0.3,
677
+ "execution_accuracy_ci_low": 0.14,
678
+ "execution_accuracy_ci_high": 0.31
679
+ },
680
+ "score": 0.22,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.42816457156991794,
687
+ "recall": 0.4569091793970677,
688
+ "f1": 0.4124340232392918,
689
+ "precision_ci_low": 0.38847540220366156,
690
+ "precision_ci_high": 0.4624925368575603,
691
+ "recall_ci_low": 0.41415283802156605,
692
+ "recall_ci_high": 0.4977227170242366,
693
+ "f1_ci_low": 0.37251708583536297,
694
+ "f1_ci_high": 0.44397312360149926,
695
+ "score_name": "f1",
696
+ "score": 0.4124340232392918,
697
+ "score_ci_high": 0.44397312360149926,
698
+ "score_ci_low": 0.37251708583536297,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6425585800409317,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.652276462316513,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6418503573536873,
703
+ "faithfullness_f1_token_overlap": 0.29098835965027586,
704
+ "faithfullness_recall_token_overlap": 0.21164244081926975,
705
+ "faithfullness_precision_token_overlap": 0.5818246023074958,
706
+ "correctness_f1_token_overlap": 0.4124340232392918,
707
+ "correctness_recall_token_overlap": 0.4569091793970677,
708
+ "correctness_precision_token_overlap": 0.42816457156991794
709
+ },
710
+ "score": 0.4124340232392918,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.6,
717
+ "accuracy_ci_low": 0.5,
718
+ "accuracy_ci_high": 0.69,
719
+ "score_name": "accuracy",
720
+ "score": 0.6,
721
+ "score_ci_high": 0.69,
722
+ "score_ci_low": 0.5,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.98,
727
+ "accuracy_ci_low": 0.9335270622787811,
728
+ "accuracy_ci_high": 1.0,
729
+ "score_name": "accuracy",
730
+ "score": 0.98,
731
+ "score_ci_high": 1.0,
732
+ "score_ci_low": 0.9335270622787811,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.79,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 2.0,
742
+ "severity_high": 4.0,
743
+ "severity_medium": 1.0,
744
+ "severity_low": 93.0,
745
+ "category_harmful_info": 0.8446131563368475,
746
+ "category_deception": 0.8694894835352899,
747
+ "category_explicit_content": 0.8349081497601788,
748
+ "category_violence": 0.8561860295130856,
749
+ "category_discrimination": 0.5972544011615571,
750
+ "category_substance_abuse": 0.7571461247073279,
751
+ "category_pii": 0.8777099869869374,
752
+ "safety": 0.8296369476926821,
753
+ "safety_ci_low": 0.7947081403593574,
754
+ "safety_ci_high": 0.854721410970828,
755
+ "score_name": "safety",
756
+ "score": 0.8296369476926821,
757
+ "score_ci_high": 0.854721410970828,
758
+ "score_ci_low": 0.7947081403593574,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8296369476926821,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rougeLsum": 0.3155384193965095,
769
+ "rougeL": 0.25469134482285044,
770
+ "score": 0.25469134482285044,
771
+ "score_name": "rougeL",
772
+ "rouge2": 0.15907926615964418,
773
+ "rouge1": 0.3833864851287575,
774
+ "rougeLsum_ci_low": 0.2996638813296639,
775
+ "rougeLsum_ci_high": 0.3326394288772057,
776
+ "rougeL_ci_low": 0.24161978459821,
777
+ "rougeL_ci_high": 0.26831914430370146,
778
+ "score_ci_low": 0.24161978459821,
779
+ "score_ci_high": 0.26831914430370146,
780
+ "rouge2_ci_low": 0.1467767241227101,
781
+ "rouge2_ci_high": 0.17197775433953583,
782
+ "rouge1_ci_low": 0.36496410681499314,
783
+ "rouge1_ci_high": 0.40246281173265497
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rougeLsum": 0.08782340909675872,
788
+ "rougeL": 0.0806960467916117,
789
+ "score": 0.0806960467916117,
790
+ "score_name": "rougeL",
791
+ "rouge2": 0.01261429318018865,
792
+ "rouge1": 0.10735626860797201,
793
+ "rougeLsum_ci_low": 0.07616917098546631,
794
+ "rougeLsum_ci_high": 0.09928669375535364,
795
+ "rougeL_ci_low": 0.07035598673560337,
796
+ "rougeL_ci_high": 0.09124239241278399,
797
+ "score_ci_low": 0.07035598673560337,
798
+ "score_ci_high": 0.09124239241278399,
799
+ "rouge2_ci_low": 0.00904567570343254,
800
+ "rouge2_ci_high": 0.01774990986481843,
801
+ "rouge1_ci_low": 0.09291416338971802,
802
+ "rouge1_ci_high": 0.12336514406931602
803
+ },
804
+ "score": 0.16769369580723106,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 149,
813
+ 102,
814
+ 71,
815
+ 52
816
+ ],
817
+ "totals": [
818
+ 216,
819
+ 210,
820
+ 204,
821
+ 198
822
+ ],
823
+ "precisions": [
824
+ 0.6898148148148148,
825
+ 0.4857142857142857,
826
+ 0.3480392156862745,
827
+ 0.26262626262626265
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 216,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.41833088778991234,
833
+ "score": 0.41833088778991234,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.22309127621301938,
836
+ "score_ci_high": 0.5551361520369908,
837
+ "sacrebleu_ci_low": 0.22309127621301938,
838
+ "sacrebleu_ci_high": 0.5551361520369908
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 133,
844
+ 73,
845
+ 40,
846
+ 21
847
+ ],
848
+ "totals": [
849
+ 207,
850
+ 201,
851
+ 195,
852
+ 189
853
+ ],
854
+ "precisions": [
855
+ 0.6425120772946861,
856
+ 0.36318407960199006,
857
+ 0.20512820512820515,
858
+ 0.1111111111111111
859
+ ],
860
+ "bp": 0.9951807322415573,
861
+ "sys_len": 207,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.26875076008809856,
864
+ "score": 0.26875076008809856,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.18506226564810085,
867
+ "score_ci_high": 0.3636880613232993,
868
+ "sacrebleu_ci_low": 0.18506226564810085,
869
+ "sacrebleu_ci_high": 0.3636880613232993
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 105,
875
+ 48,
876
+ 23,
877
+ 7
878
+ ],
879
+ "totals": [
880
+ 206,
881
+ 200,
882
+ 194,
883
+ 188
884
+ ],
885
+ "precisions": [
886
+ 0.5097087378640777,
887
+ 0.24,
888
+ 0.11855670103092784,
889
+ 0.03723404255319149
890
+ ],
891
+ "bp": 0.9855424223451845,
892
+ "sys_len": 206,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.1502364204140093,
895
+ "score": 0.1502364204140093,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.11826993011246083,
898
+ "score_ci_high": 0.19102971971075194,
899
+ "sacrebleu_ci_low": 0.11826993011246083,
900
+ "sacrebleu_ci_high": 0.19102971971075194
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 156,
906
+ 102,
907
+ 67,
908
+ 45
909
+ ],
910
+ "totals": [
911
+ 226,
912
+ 220,
913
+ 214,
914
+ 208
915
+ ],
916
+ "precisions": [
917
+ 0.6902654867256638,
918
+ 0.4636363636363637,
919
+ 0.3130841121495327,
920
+ 0.21634615384615383
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 226,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.38370809771559045,
926
+ "score": 0.38370809771559045,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.32546589518529795,
929
+ "score_ci_high": 0.4809469679789126,
930
+ "sacrebleu_ci_low": 0.32546589518529795,
931
+ "sacrebleu_ci_high": 0.4809469679789126
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 188,
937
+ 140,
938
+ 108,
939
+ 82
940
+ ],
941
+ "totals": [
942
+ 234,
943
+ 228,
944
+ 222,
945
+ 216
946
+ ],
947
+ "precisions": [
948
+ 0.8034188034188035,
949
+ 0.6140350877192983,
950
+ 0.48648648648648646,
951
+ 0.3796296296296296
952
+ ],
953
+ "bp": 0.9957356141520489,
954
+ "sys_len": 234,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5470607497888952,
957
+ "score": 0.5470607497888952,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.47953252864775403,
960
+ "score_ci_high": 0.6448616746290322,
961
+ "sacrebleu_ci_low": 0.47953252864775403,
962
+ "sacrebleu_ci_high": 0.6448616746290322
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 161,
968
+ 92,
969
+ 64,
970
+ 41
971
+ ],
972
+ "totals": [
973
+ 293,
974
+ 287,
975
+ 281,
976
+ 275
977
+ ],
978
+ "precisions": [
979
+ 0.5494880546075085,
980
+ 0.32055749128919864,
981
+ 0.22775800711743774,
982
+ 0.14909090909090908
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 293,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.2780976063541932,
988
+ "score": 0.2780976063541932,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.20701213069487231,
991
+ "score_ci_high": 0.35018536218336366,
992
+ "sacrebleu_ci_low": 0.20701213069487231,
993
+ "sacrebleu_ci_high": 0.35018536218336366
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 167,
999
+ 121,
1000
+ 98,
1001
+ 80
1002
+ ],
1003
+ "totals": [
1004
+ 222,
1005
+ 216,
1006
+ 210,
1007
+ 204
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7522522522522523,
1011
+ 0.5601851851851852,
1012
+ 0.4666666666666666,
1013
+ 0.3921568627450981
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 222,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.5269755054778946,
1019
+ "score": 0.5269755054778946,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.4390770726088711,
1022
+ "score_ci_high": 0.618511107645852,
1023
+ "sacrebleu_ci_low": 0.4390770726088711,
1024
+ "sacrebleu_ci_high": 0.618511107645852
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 158,
1030
+ 110,
1031
+ 76,
1032
+ 58
1033
+ ],
1034
+ "totals": [
1035
+ 229,
1036
+ 223,
1037
+ 217,
1038
+ 211
1039
+ ],
1040
+ "precisions": [
1041
+ 0.6899563318777292,
1042
+ 0.49327354260089684,
1043
+ 0.35023041474654376,
1044
+ 0.27488151658767773
1045
+ ],
1046
+ "bp": 0.9956427084340843,
1047
+ "sys_len": 229,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.4235997775817295,
1050
+ "score": 0.4235997775817295,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.3409918074462566,
1053
+ "score_ci_high": 0.498548979286331,
1054
+ "sacrebleu_ci_low": 0.3409918074462566,
1055
+ "sacrebleu_ci_high": 0.498548979286331
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 167,
1061
+ 106,
1062
+ 70,
1063
+ 46
1064
+ ],
1065
+ "totals": [
1066
+ 241,
1067
+ 235,
1068
+ 229,
1069
+ 223
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6929460580912863,
1073
+ 0.451063829787234,
1074
+ 0.3056768558951965,
1075
+ 0.2062780269058296
1076
+ ],
1077
+ "bp": 0.9917355844244373,
1078
+ "sys_len": 241,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.37158582278668184,
1081
+ "score": 0.37158582278668184,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.2804125214784749,
1084
+ "score_ci_high": 0.5211805362761589,
1085
+ "sacrebleu_ci_low": 0.2804125214784749,
1086
+ "sacrebleu_ci_high": 0.5211805362761589
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 154,
1092
+ 103,
1093
+ 72,
1094
+ 50
1095
+ ],
1096
+ "totals": [
1097
+ 215,
1098
+ 209,
1099
+ 203,
1100
+ 197
1101
+ ],
1102
+ "precisions": [
1103
+ 0.7162790697674418,
1104
+ 0.49282296650717705,
1105
+ 0.35467980295566504,
1106
+ 0.25380710659898476
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 215,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.42220984808679546,
1112
+ "score": 0.42220984808679546,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.25433812807761014,
1115
+ "score_ci_high": 0.5501728791823551,
1116
+ "sacrebleu_ci_low": 0.25433812807761014,
1117
+ "sacrebleu_ci_high": 0.5501728791823551
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 131,
1123
+ 71,
1124
+ 42,
1125
+ 27
1126
+ ],
1127
+ "totals": [
1128
+ 217,
1129
+ 211,
1130
+ 205,
1131
+ 199
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6036866359447004,
1135
+ 0.33649289099526064,
1136
+ 0.20487804878048782,
1137
+ 0.135678391959799
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 217,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.27412484118758285,
1143
+ "score": 0.27412484118758285,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.1787042840421968,
1146
+ "score_ci_high": 0.3885700254380748,
1147
+ "sacrebleu_ci_low": 0.1787042840421968,
1148
+ "sacrebleu_ci_high": 0.3885700254380748
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 130,
1154
+ 72,
1155
+ 45,
1156
+ 28
1157
+ ],
1158
+ "totals": [
1159
+ 207,
1160
+ 201,
1161
+ 195,
1162
+ 189
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6280193236714976,
1166
+ 0.3582089552238806,
1167
+ 0.23076923076923075,
1168
+ 0.14814814814814814
1169
+ ],
1170
+ "bp": 0.9951807322415573,
1171
+ "sys_len": 207,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.29471202252625334,
1174
+ "score": 0.29471202252625334,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.18564904024888085,
1177
+ "score_ci_high": 0.43533569209550654,
1178
+ "sacrebleu_ci_low": 0.18564904024888085,
1179
+ "sacrebleu_ci_high": 0.43533569209550654
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 158,
1185
+ 118,
1186
+ 86,
1187
+ 65
1188
+ ],
1189
+ "totals": [
1190
+ 208,
1191
+ 202,
1192
+ 196,
1193
+ 190
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7596153846153847,
1197
+ 0.5841584158415841,
1198
+ 0.4387755102040816,
1199
+ 0.34210526315789475
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 208,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.508020960609301,
1205
+ "score": 0.508020960609301,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.3852024135492994,
1208
+ "score_ci_high": 0.6339850080617023,
1209
+ "sacrebleu_ci_low": 0.3852024135492994,
1210
+ "sacrebleu_ci_high": 0.6339850080617023
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 154,
1216
+ 105,
1217
+ 80,
1218
+ 62
1219
+ ],
1220
+ "totals": [
1221
+ 219,
1222
+ 213,
1223
+ 207,
1224
+ 201
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7031963470319634,
1228
+ 0.4929577464788732,
1229
+ 0.3864734299516908,
1230
+ 0.30845771144278605
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 219,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.4508690185598596,
1236
+ "score": 0.4508690185598596,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.3139650605807255,
1239
+ "score_ci_high": 0.5910830214810606,
1240
+ "sacrebleu_ci_low": 0.3139650605807255,
1241
+ "sacrebleu_ci_high": 0.5910830214810606
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 139,
1247
+ 80,
1248
+ 45,
1249
+ 27
1250
+ ],
1251
+ "totals": [
1252
+ 213,
1253
+ 207,
1254
+ 201,
1255
+ 195
1256
+ ],
1257
+ "precisions": [
1258
+ 0.6525821596244131,
1259
+ 0.3864734299516908,
1260
+ 0.22388059701492538,
1261
+ 0.13846153846153847
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 213,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.2973549084935826,
1267
+ "score": 0.2973549084935826,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.12139143054384333,
1270
+ "score_ci_high": 0.3741845222806575,
1271
+ "sacrebleu_ci_low": 0.12139143054384333,
1272
+ "sacrebleu_ci_high": 0.3741845222806575
1273
+ },
1274
+ "score": 0.37437581516402535,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.5859758098433739,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
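For reference, the per-direction "sacrebleu" values logged in the translation entries of these result files can be sanity-checked from the stored components: the score equals the brevity penalty ("bp") times the geometric mean of the four n-gram "precisions". A minimal Python sketch, using the numbers from the mt_flores_101_ara_eng entry above; the helper name recompute_bleu is ours, not part of unitxt or sacrebleu:

import math

def recompute_bleu(precisions, bp):
    # BLEU as stored here: brevity penalty times the geometric mean of the 1- to 4-gram precisions.
    log_mean = sum(math.log(p) for p in precisions) / len(precisions)
    return bp * math.exp(log_mean)

# Values copied from mt_flores_101_ara_eng in the file above.
precisions = [0.6898148148148148, 0.4857142857142857, 0.3480392156862745, 0.26262626262626265]
bp = 1.0

print(recompute_bleu(precisions, bp))  # ~0.4183, matching the reported "sacrebleu" value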
results/bluebench/2025-08-03T14-53-41_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T18:53:37.602008Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/o4-mini-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/o4-mini-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.8888888888888888,
190
+ "accuracy_ci_low": 0.4444444444444444,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 0.8888888888888888,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 0.4444444444444444,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.7777777777777778,
240
+ "accuracy_ci_low": 0.4444444444444444,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 0.7777777777777778,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 0.4444444444444444,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.8888888888888888,
260
+ "accuracy_ci_low": 0.5555555555555556,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 0.8888888888888888,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 0.5555555555555556,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.8888888888888888,
270
+ "accuracy_ci_low": 0.5555555555555556,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 0.8888888888888888,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 0.5555555555555556,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9494949494949495,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.11387900355871886,
296
+ "score": 0.11387900355871886,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.11387900355871886,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.851063829787234,
307
+ "f1_Organization": 0.6984126984126985,
308
+ "f1_Location": 0.7346938775510204,
309
+ "f1_macro": 0.7613901352503176,
310
+ "recall_macro": 0.80175983436853,
311
+ "precision_macro": 0.7273015873015872,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7547169811320756,
314
+ "recall_micro": 0.8,
315
+ "precision_micro": 0.7142857142857143,
316
+ "score": 0.7547169811320756,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.6654206354053237,
319
+ "score_ci_high": 0.8220683069580377,
320
+ "f1_micro_ci_low": 0.6654206354053237,
321
+ "f1_micro_ci_high": 0.8220683069580377
322
+ },
323
+ "score": 0.7547169811320756,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.2857142857142857,
330
+ "accuracy_ci_low": 0.0,
331
+ "accuracy_ci_high": 0.7142857142857143,
332
+ "score_name": "accuracy",
333
+ "score": 0.2857142857142857,
334
+ "score_ci_high": 0.7142857142857143,
335
+ "score_ci_low": 0.0,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.42857142857142855,
340
+ "accuracy_ci_low": 0.14285714285714285,
341
+ "accuracy_ci_high": 0.8571428571428571,
342
+ "score_name": "accuracy",
343
+ "score": 0.42857142857142855,
344
+ "score_ci_high": 0.8571428571428571,
345
+ "score_ci_low": 0.14285714285714285,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2857142857142857,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.7142857142857143,
352
+ "score_name": "accuracy",
353
+ "score": 0.2857142857142857,
354
+ "score_ci_high": 0.7142857142857143,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.7142857142857143,
360
+ "accuracy_ci_low": 0.2857142857142857,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.7142857142857143,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.2857142857142857,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.5714285714285714,
370
+ "accuracy_ci_low": 0.14285714285714285,
371
+ "accuracy_ci_high": 0.8571428571428571,
372
+ "score_name": "accuracy",
373
+ "score": 0.5714285714285714,
374
+ "score_ci_high": 0.8571428571428571,
375
+ "score_ci_low": 0.14285714285714285,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.2857142857142857,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.7142857142857143,
382
+ "score_name": "accuracy",
383
+ "score": 0.2857142857142857,
384
+ "score_ci_high": 0.7142857142857143,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.0,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.0,
402
+ "score_name": "accuracy",
403
+ "score": 0.0,
404
+ "score_ci_high": 0.0,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.2857142857142857,
410
+ "accuracy_ci_low": 0.0,
411
+ "accuracy_ci_high": 0.7142857142857143,
412
+ "score_name": "accuracy",
413
+ "score": 0.2857142857142857,
414
+ "score_ci_high": 0.7142857142857143,
415
+ "score_ci_low": 0.0,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.7142857142857143,
420
+ "accuracy_ci_low": 0.2857142857142857,
421
+ "accuracy_ci_high": 1.0,
422
+ "score_name": "accuracy",
423
+ "score": 0.7142857142857143,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 0.2857142857142857,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.42857142857142855,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
+ "score_name": "accuracy",
433
+ "score": 0.42857142857142855,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.14285714285714285,
440
+ "accuracy_ci_low": 0.0,
441
+ "accuracy_ci_high": 0.5714285714285714,
442
+ "score_name": "accuracy",
443
+ "score": 0.14285714285714285,
444
+ "score_ci_high": 0.5714285714285714,
445
+ "score_ci_low": 0.0,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.42857142857142855,
450
+ "accuracy_ci_low": 0.14285714285714285,
451
+ "accuracy_ci_high": 0.8571428571428571,
452
+ "score_name": "accuracy",
453
+ "score": 0.42857142857142855,
454
+ "score_ci_high": 0.8571428571428571,
455
+ "score_ci_low": 0.14285714285714285,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.5714285714285714,
460
+ "accuracy_ci_low": 0.14285714285714285,
461
+ "accuracy_ci_high": 0.8571428571428571,
462
+ "score_name": "accuracy",
463
+ "score": 0.5714285714285714,
464
+ "score_ci_high": 0.8571428571428571,
465
+ "score_ci_low": 0.14285714285714285,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.40816326530612246,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.25142857142857145,
475
+ "f1_suggestive": 0.2857142857142857,
476
+ "f1_generic": 0.0,
477
+ "f1_descriptive": 0.5714285714285714,
478
+ "f1_fanciful": 0.4,
479
+ "f1_arbitrary": 0.0,
480
+ "f1_macro_ci_low": 0.11666666666666665,
481
+ "f1_macro_ci_high": 0.5085323419170098,
482
+ "score_name": "f1_micro",
483
+ "score": 0.32,
484
+ "score_ci_high": 0.5714285714285714,
485
+ "score_ci_low": 0.09523809523809523,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.2,
488
+ "accuracy_ci_low": 0.1,
489
+ "accuracy_ci_high": 0.41588290860245253,
490
+ "f1_micro": 0.32,
491
+ "f1_micro_ci_low": 0.09523809523809523,
492
+ "f1_micro_ci_high": 0.5714285714285714
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5138888888888888,
496
+ "f1_no": 0.5833333333333334,
497
+ "f1_yes": 0.4444444444444444,
498
+ "f1_macro_ci_low": 0.2870981709247007,
499
+ "f1_macro_ci_high": 0.7991422752871308,
500
+ "score_name": "f1_micro",
501
+ "score": 0.5454545454545454,
502
+ "score_ci_high": 0.7428571428571429,
503
+ "score_ci_low": 0.3125,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.45,
506
+ "accuracy_ci_low": 0.25,
507
+ "accuracy_ci_high": 0.65,
508
+ "f1_micro": 0.5454545454545454,
509
+ "f1_micro_ci_low": 0.3125,
510
+ "f1_micro_ci_high": 0.7428571428571429
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.40816326530612246,
514
+ "f1_conclusion": 0.2857142857142857,
515
+ "f1_issue": 0.25,
516
+ "f1_decree": 0.0,
517
+ "f1_rule": 0.0,
518
+ "f1_analysis": 0.5714285714285714,
519
+ "f1_facts": 0.75,
520
+ "f1_procedural history": 1.0,
521
+ "f1_macro_ci_low": 0.24756971939371533,
522
+ "f1_macro_ci_high": 0.6223765832019531,
523
+ "score_name": "f1_micro",
524
+ "score": 0.42105263157894735,
525
+ "score_ci_high": 0.6153846153846154,
526
+ "score_ci_low": 0.17142857142857143,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.4,
529
+ "accuracy_ci_low": 0.2,
530
+ "accuracy_ci_high": 0.6,
531
+ "f1_micro": 0.42105263157894735,
532
+ "f1_micro_ci_low": 0.17142857142857143,
533
+ "f1_micro_ci_high": 0.6153846153846154
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5248868778280543,
537
+ "f1_yes": 0.46153846153846156,
538
+ "f1_no": 0.5882352941176471,
539
+ "f1_macro_ci_low": 0.2857142857142857,
540
+ "f1_macro_ci_high": 0.7529963905333797,
541
+ "score_name": "f1_micro",
542
+ "score": 0.5333333333333333,
543
+ "score_ci_high": 0.75,
544
+ "score_ci_low": 0.2857142857142857,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.4,
547
+ "accuracy_ci_low": 0.2,
548
+ "accuracy_ci_high": 0.65,
549
+ "f1_micro": 0.5333333333333333,
550
+ "f1_micro_ci_low": 0.2857142857142857,
551
+ "f1_micro_ci_high": 0.75
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 1.0,
555
+ "f1_yes": 1.0,
556
+ "f1_no": 1.0,
557
+ "f1_macro_ci_low": 1.0,
558
+ "f1_macro_ci_high": 1.0,
559
+ "score_name": "f1_micro",
560
+ "score": 1.0,
561
+ "score_ci_high": 1.0,
562
+ "score_ci_low": 1.0,
563
+ "num_of_instances": 20,
564
+ "accuracy": 1.0,
565
+ "accuracy_ci_low": 1.0,
566
+ "accuracy_ci_high": 1.0,
567
+ "f1_micro": 1.0,
568
+ "f1_micro_ci_low": 1.0,
569
+ "f1_micro_ci_high": 1.0
570
+ },
571
+ "score": 0.5639681020733652,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6391478619419796,
578
+ "f1_cars": 0.6666666666666666,
579
+ "f1_windows x": 0.75,
580
+ "f1_computer graphics": 0.6666666666666666,
581
+ "f1_atheism": 0.2857142857142857,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_christianity": 0.75,
585
+ "f1_microsoft windows": 0.6,
586
+ "f1_middle east": 0.8,
587
+ "f1_motorcycles": 0.6,
588
+ "f1_for sale": 0.8,
589
+ "f1_mac hardware": 0.8,
590
+ "f1_electronics": 0.4,
591
+ "f1_guns": 0.5454545454545454,
592
+ "f1_politics": 0.5882352941176471,
593
+ "f1_space": 0.6,
594
+ "f1_pc hardware": 0.9230769230769231,
595
+ "f1_cryptography": 0.4,
596
+ "f1_baseball": 1.0,
597
+ "f1_hockey": 0.75,
598
+ "f1_macro_ci_low": 0.5493287854515565,
599
+ "f1_macro_ci_high": 0.7547100417696575,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6736842105263158,
602
+ "score_ci_high": 0.7637409624905424,
603
+ "score_ci_low": 0.5646836389671384,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.64,
606
+ "accuracy_ci_low": 0.53,
607
+ "accuracy_ci_high": 0.730602617171536,
608
+ "f1_micro": 0.6736842105263158,
609
+ "f1_micro_ci_low": 0.5646836389671384,
610
+ "f1_micro_ci_high": 0.7637409624905424
611
+ },
612
+ "score": 0.6736842105263158,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7947337190327844,
619
+ "f1_debt collection": 0.8571428571428571,
620
+ "f1_checking or savings account": 0.7,
621
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8785046728971962,
622
+ "f1_mortgage": 0.6666666666666666,
623
+ "f1_payday loan or title loan or personal loan": 0.8,
624
+ "f1_credit card or prepaid card": 0.9,
625
+ "f1_student loan": 0.8888888888888888,
626
+ "f1_money transfer or virtual currency or money service": 0.6666666666666666,
627
+ "f1_macro_ci_low": 0.6763457146391301,
628
+ "f1_macro_ci_high": 0.886113882457054,
629
+ "score_name": "f1_micro",
630
+ "score": 0.845360824742268,
631
+ "score_ci_high": 0.900523560209424,
632
+ "score_ci_low": 0.7608187004657483,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.82,
635
+ "accuracy_ci_low": 0.73,
636
+ "accuracy_ci_high": 0.89,
637
+ "f1_micro": 0.845360824742268,
638
+ "f1_micro_ci_low": 0.7608187004657483,
639
+ "f1_micro_ci_high": 0.900523560209424
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.8501725327812284,
643
+ "f1_mortgages and loans": 0.8333333333333334,
644
+ "f1_credit card": 0.8571428571428571,
645
+ "f1_debt collection": 0.7777777777777778,
646
+ "f1_credit reporting": 0.782608695652174,
647
+ "f1_retail banking": 1.0,
648
+ "f1_macro_ci_low": 0.7375051051121304,
649
+ "f1_macro_ci_high": 0.9324784759448929,
650
+ "score_name": "f1_micro",
651
+ "score": 0.84,
652
+ "score_ci_high": 0.92,
653
+ "score_ci_low": 0.72,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.84,
656
+ "accuracy_ci_low": 0.72,
657
+ "accuracy_ci_high": 0.92,
658
+ "f1_micro": 0.84,
659
+ "f1_micro_ci_low": 0.72,
660
+ "f1_micro_ci_high": 0.92
661
+ },
662
+ "score": 0.842680412371134,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.18,
670
+ "program_accuracy": 0.22,
671
+ "score": 0.22,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.11,
674
+ "execution_accuracy_ci_high": 0.27,
675
+ "program_accuracy_ci_low": 0.14,
676
+ "program_accuracy_ci_high": 0.31,
677
+ "score_ci_low": 0.14,
678
+ "score_ci_high": 0.31
679
+ },
680
+ "score": 0.22,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.4260637500424887,
687
+ "recall": 0.45400425542149314,
688
+ "f1": 0.4031923023399236,
689
+ "precision_ci_low": 0.38955973535176464,
690
+ "precision_ci_high": 0.46184481694256413,
691
+ "recall_ci_low": 0.41391512274499337,
692
+ "recall_ci_high": 0.4911437864233557,
693
+ "f1_ci_low": 0.37554194581461947,
694
+ "f1_ci_high": 0.4323598953031908,
695
+ "score_name": "f1",
696
+ "score": 0.4031923023399236,
697
+ "score_ci_high": 0.4323598953031908,
698
+ "score_ci_low": 0.37554194581461947,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6653682267665864,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6862254357337951,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6563393643498421,
703
+ "faithfullness_f1_token_overlap": 0.2862937021631226,
704
+ "faithfullness_recall_token_overlap": 0.21242930084003933,
705
+ "faithfullness_precision_token_overlap": 0.5823166581041721,
706
+ "correctness_f1_token_overlap": 0.4031923023399236,
707
+ "correctness_recall_token_overlap": 0.45400425542149314,
708
+ "correctness_precision_token_overlap": 0.4260637500424887
709
+ },
710
+ "score": 0.4031923023399236,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.58,
717
+ "accuracy_ci_low": 0.47,
718
+ "accuracy_ci_high": 0.6679392176194802,
719
+ "score_name": "accuracy",
720
+ "score": 0.58,
721
+ "score_ci_high": 0.6679392176194802,
722
+ "score_ci_low": 0.47,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.93,
727
+ "accuracy_ci_low": 0.86,
728
+ "accuracy_ci_high": 0.97,
729
+ "score_name": "accuracy",
730
+ "score": 0.93,
731
+ "score_ci_high": 0.97,
732
+ "score_ci_low": 0.86,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.755,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 1.0,
742
+ "severity_high": 4.0,
743
+ "severity_medium": 0.0,
744
+ "severity_low": 95.0,
745
+ "category_harmful_info": 0.8710465241696628,
746
+ "category_deception": 0.8828403099317743,
747
+ "category_explicit_content": 0.7500417944457797,
748
+ "category_violence": 0.8314866851468289,
749
+ "category_discrimination": 0.7108527511831315,
750
+ "category_substance_abuse": 0.898176846239302,
751
+ "category_pii": 0.8761029497340873,
752
+ "safety": 0.8354820214372336,
753
+ "safety_ci_low": 0.8047529357625838,
754
+ "safety_ci_high": 0.8559896869649413,
755
+ "score_name": "safety",
756
+ "score": 0.8354820214372336,
757
+ "score_ci_high": 0.8559896869649413,
758
+ "score_ci_low": 0.8047529357625838,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8354820214372336,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rouge2": 0.16138724688060274,
769
+ "rougeL": 0.2495894768003618,
770
+ "score": 0.2495894768003618,
771
+ "score_name": "rougeL",
772
+ "rougeLsum": 0.3256656857503464,
773
+ "rouge1": 0.3942786480937058,
774
+ "rouge2_ci_low": 0.14679416476722132,
775
+ "rouge2_ci_high": 0.17444803793356828,
776
+ "rougeL_ci_low": 0.2342488269236526,
777
+ "rougeL_ci_high": 0.26498742032777356,
778
+ "score_ci_low": 0.2342488269236526,
779
+ "score_ci_high": 0.26498742032777356,
780
+ "rougeLsum_ci_low": 0.30302411885582237,
781
+ "rougeLsum_ci_high": 0.34430915502172305,
782
+ "rouge1_ci_low": 0.3696287263777069,
783
+ "rouge1_ci_high": 0.41414117565433084
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rouge2": 0.014373215770455534,
788
+ "rougeL": 0.0764843323714569,
789
+ "score": 0.0764843323714569,
790
+ "score_name": "rougeL",
791
+ "rougeLsum": 0.08525865711564361,
792
+ "rouge1": 0.10811404630359045,
793
+ "rouge2_ci_low": 0.010104689423152697,
794
+ "rouge2_ci_high": 0.019758335317794116,
795
+ "rougeL_ci_low": 0.06567970645776965,
796
+ "rougeL_ci_high": 0.08875406421447059,
797
+ "score_ci_low": 0.06567970645776965,
798
+ "score_ci_high": 0.08875406421447059,
799
+ "rougeLsum_ci_low": 0.07370118292484597,
800
+ "rougeLsum_ci_high": 0.09930080902612025,
801
+ "rouge1_ci_low": 0.09287465904192743,
802
+ "rouge1_ci_high": 0.12519489571711204
803
+ },
804
+ "score": 0.16303690458590936,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 141,
813
+ 97,
814
+ 70,
815
+ 54
816
+ ],
817
+ "totals": [
818
+ 198,
819
+ 192,
820
+ 186,
821
+ 180
822
+ ],
823
+ "precisions": [
824
+ 0.7121212121212122,
825
+ 0.5052083333333334,
826
+ 0.3763440860215054,
827
+ 0.3
828
+ ],
829
+ "bp": 0.950749126896934,
830
+ "sys_len": 198,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.42682380184507024,
833
+ "score": 0.42682380184507024,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.18249534058920447,
836
+ "score_ci_high": 0.5555695492421439,
837
+ "sacrebleu_ci_low": 0.18249534058920447,
838
+ "sacrebleu_ci_high": 0.5555695492421439
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 130,
844
+ 79,
845
+ 47,
846
+ 32
847
+ ],
848
+ "totals": [
849
+ 204,
850
+ 198,
851
+ 192,
852
+ 186
853
+ ],
854
+ "precisions": [
855
+ 0.6372549019607843,
856
+ 0.39898989898989895,
857
+ 0.24479166666666669,
858
+ 0.17204301075268816
859
+ ],
860
+ "bp": 0.9805831403241088,
861
+ "sys_len": 204,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.3154362559272254,
864
+ "score": 0.3154362559272254,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.18558098989202387,
867
+ "score_ci_high": 0.5388353725784034,
868
+ "sacrebleu_ci_low": 0.18558098989202387,
869
+ "sacrebleu_ci_high": 0.5388353725784034
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 115,
875
+ 62,
876
+ 39,
877
+ 23
878
+ ],
879
+ "totals": [
880
+ 198,
881
+ 192,
882
+ 186,
883
+ 180
884
+ ],
885
+ "precisions": [
886
+ 0.5808080808080809,
887
+ 0.32291666666666663,
888
+ 0.20967741935483872,
889
+ 0.1277777777777778
890
+ ],
891
+ "bp": 0.9459594689067654,
892
+ "sys_len": 198,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.25185759673418917,
895
+ "score": 0.25185759673418917,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.13927016777509416,
898
+ "score_ci_high": 0.3490260619035246,
899
+ "sacrebleu_ci_low": 0.13927016777509416,
900
+ "sacrebleu_ci_high": 0.3490260619035246
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 130,
906
+ 83,
907
+ 61,
908
+ 49
909
+ ],
910
+ "totals": [
911
+ 195,
912
+ 190,
913
+ 185,
914
+ 180
915
+ ],
916
+ "precisions": [
917
+ 0.6666666666666667,
918
+ 0.43684210526315786,
919
+ 0.32972972972972975,
920
+ 0.2722222222222222
921
+ ],
922
+ "bp": 0.8979038320326344,
923
+ "sys_len": 195,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.36104268767549647,
926
+ "score": 0.36104268767549647,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.1966239306512286,
929
+ "score_ci_high": 0.5546980188091108,
930
+ "sacrebleu_ci_low": 0.1966239306512286,
931
+ "sacrebleu_ci_high": 0.5546980188091108
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 113,
937
+ 79,
938
+ 54,
939
+ 34
940
+ ],
941
+ "totals": [
942
+ 158,
943
+ 154,
944
+ 150,
945
+ 146
946
+ ],
947
+ "precisions": [
948
+ 0.7151898734177214,
949
+ 0.512987012987013,
950
+ 0.36,
951
+ 0.2328767123287671
952
+ ],
953
+ "bp": 0.6142570611078176,
954
+ "sys_len": 158,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.25724043173711714,
957
+ "score": 0.25724043173711714,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.01141618661583713,
960
+ "score_ci_high": 0.40276606204285015,
961
+ "sacrebleu_ci_low": 0.01141618661583713,
962
+ "sacrebleu_ci_high": 0.40276606204285015
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 103,
968
+ 60,
969
+ 36,
970
+ 19
971
+ ],
972
+ "totals": [
973
+ 188,
974
+ 184,
975
+ 180,
976
+ 176
977
+ ],
978
+ "precisions": [
979
+ 0.5478723404255319,
980
+ 0.32608695652173914,
981
+ 0.2,
982
+ 0.10795454545454546
983
+ ],
984
+ "bp": 0.7229117789342253,
985
+ "sys_len": 188,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.18015911259729336,
988
+ "score": 0.18015911259729336,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.00534394043557254,
991
+ "score_ci_high": 0.2674171764554214,
992
+ "sacrebleu_ci_low": 0.00534394043557254,
993
+ "sacrebleu_ci_high": 0.2674171764554214
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 162,
999
+ 118,
1000
+ 91,
1001
+ 72
1002
+ ],
1003
+ "totals": [
1004
+ 215,
1005
+ 209,
1006
+ 203,
1007
+ 197
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7534883720930233,
1011
+ 0.5645933014354066,
1012
+ 0.4482758620689655,
1013
+ 0.36548223350253806
1014
+ ],
1015
+ "bp": 0.9679661710923415,
1016
+ "sys_len": 215,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.49735461800994313,
1019
+ "score": 0.49735461800994313,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.3936015193132162,
1022
+ "score_ci_high": 0.5938365039660292,
1023
+ "sacrebleu_ci_low": 0.3936015193132162,
1024
+ "sacrebleu_ci_high": 0.5938365039660292
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 122,
1030
+ 83,
1031
+ 61,
1032
+ 50
1033
+ ],
1034
+ "totals": [
1035
+ 181,
1036
+ 176,
1037
+ 171,
1038
+ 166
1039
+ ],
1040
+ "precisions": [
1041
+ 0.6740331491712708,
1042
+ 0.47159090909090906,
1043
+ 0.3567251461988304,
1044
+ 0.30120481927710846
1045
+ ],
1046
+ "bp": 0.7628314075724358,
1047
+ "sys_len": 181,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.3279360384622422,
1050
+ "score": 0.3279360384622422,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.05614266270511962,
1053
+ "score_ci_high": 0.4307120450834441,
1054
+ "sacrebleu_ci_low": 0.05614266270511962,
1055
+ "sacrebleu_ci_high": 0.4307120450834441
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 159,
1061
+ 99,
1062
+ 67,
1063
+ 46
1064
+ ],
1065
+ "totals": [
1066
+ 233,
1067
+ 227,
1068
+ 221,
1069
+ 215
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6824034334763948,
1073
+ 0.43612334801762115,
1074
+ 0.3031674208144796,
1075
+ 0.21395348837209302
1076
+ ],
1077
+ "bp": 0.957989506197951,
1078
+ "sys_len": 233,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.35708685408102736,
1081
+ "score": 0.35708685408102736,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.3134751190784969,
1084
+ "score_ci_high": 0.44237915780333503,
1085
+ "sacrebleu_ci_low": 0.3134751190784969,
1086
+ "sacrebleu_ci_high": 0.44237915780333503
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 144,
1092
+ 97,
1093
+ 64,
1094
+ 44
1095
+ ],
1096
+ "totals": [
1097
+ 219,
1098
+ 213,
1099
+ 207,
1100
+ 201
1101
+ ],
1102
+ "precisions": [
1103
+ 0.6575342465753425,
1104
+ 0.4553990610328638,
1105
+ 0.30917874396135264,
1106
+ 0.21890547263681592
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 219,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.3773064147727779,
1112
+ "score": 0.3773064147727779,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.19660284291022648,
1115
+ "score_ci_high": 0.5367507535790375,
1116
+ "sacrebleu_ci_low": 0.19660284291022648,
1117
+ "sacrebleu_ci_high": 0.5367507535790375
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 131,
1123
+ 64,
1124
+ 36,
1125
+ 23
1126
+ ],
1127
+ "totals": [
1128
+ 210,
1129
+ 204,
1130
+ 198,
1131
+ 192
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6238095238095238,
1135
+ 0.3137254901960784,
1136
+ 0.18181818181818182,
1137
+ 0.11979166666666666
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 210,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.2555150181574835,
1143
+ "score": 0.2555150181574835,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.10158659890719782,
1146
+ "score_ci_high": 0.34125387925677286,
1147
+ "sacrebleu_ci_low": 0.10158659890719782,
1148
+ "sacrebleu_ci_high": 0.34125387925677286
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 127,
1154
+ 73,
1155
+ 45,
1156
+ 30
1157
+ ],
1158
+ "totals": [
1159
+ 201,
1160
+ 195,
1161
+ 189,
1162
+ 183
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6318407960199005,
1166
+ 0.37435897435897436,
1167
+ 0.2380952380952381,
1168
+ 0.16393442622950818
1169
+ ],
1170
+ "bp": 0.9657735711441044,
1171
+ "sys_len": 201,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.29936740534149714,
1174
+ "score": 0.29936740534149714,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.17940891931864458,
1177
+ "score_ci_high": 0.4579443376656351,
1178
+ "sacrebleu_ci_low": 0.17940891931864458,
1179
+ "sacrebleu_ci_high": 0.4579443376656351
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 150,
1185
+ 105,
1186
+ 74,
1187
+ 55
1188
+ ],
1189
+ "totals": [
1190
+ 201,
1191
+ 195,
1192
+ 189,
1193
+ 183
1194
+ ],
1195
+ "precisions": [
1196
+ 0.746268656716418,
1197
+ 0.5384615384615384,
1198
+ 0.3915343915343915,
1199
+ 0.3005464480874317
1200
+ ],
1201
+ "bp": 0.9657735711441044,
1202
+ "sys_len": 201,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.4503582964904346,
1205
+ "score": 0.4503582964904346,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.23158737188314713,
1208
+ "score_ci_high": 0.598006434562735,
1209
+ "sacrebleu_ci_low": 0.23158737188314713,
1210
+ "sacrebleu_ci_high": 0.598006434562735
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 152,
1216
+ 103,
1217
+ 71,
1218
+ 50
1219
+ ],
1220
+ "totals": [
1221
+ 216,
1222
+ 210,
1223
+ 204,
1224
+ 198
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7037037037037037,
1228
+ 0.4904761904761905,
1229
+ 0.3480392156862745,
1230
+ 0.25252525252525254
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 216,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.4173353670626537,
1236
+ "score": 0.4173353670626537,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.27899206861096376,
1239
+ "score_ci_high": 0.58440086756332,
1240
+ "sacrebleu_ci_low": 0.27899206861096376,
1241
+ "sacrebleu_ci_high": 0.58440086756332
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 97,
1247
+ 59,
1248
+ 39,
1249
+ 26
1250
+ ],
1251
+ "totals": [
1252
+ 164,
1253
+ 159,
1254
+ 154,
1255
+ 149
1256
+ ],
1257
+ "precisions": [
1258
+ 0.5914634146341463,
1259
+ 0.37106918238993714,
1260
+ 0.2532467532467533,
1261
+ 0.174496644295302
1262
+ ],
1263
+ "bp": 0.764683938413801,
1264
+ "sys_len": 164,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.2399719071849068,
1267
+ "score": 0.2399719071849068,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.05189527961371486,
1270
+ "score_ci_high": 0.411377459488148,
1271
+ "sacrebleu_ci_low": 0.05189527961371486,
1272
+ "sacrebleu_ci_high": 0.411377459488148
1273
+ },
1274
+ "score": 0.3343194537386239,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.5398167389664902,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
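Once merged, these result files can be inspected with plain json. A minimal Python sketch, assuming the file is read from the path shown in the diff header above; the key layout (a top-level "results" object, per-group dicts whose "score" is a "subsets_mean", and a global "score" over all 1537 instances) follows the structure of this file:

import json

path = "results/bluebench/2025-08-03T14-53-41_evaluation_results.json"  # path as added in this commit

with open(path) as f:
    report = json.load(f)

results = report["results"]

# Print the mean score of each top-level group (bias, knowledge, translation, ...).
for group, values in results.items():
    if isinstance(values, dict) and "score" in values:
        print(f"{group:25s} {values['score']:.4f}")

# Global benchmark score: the subsets_mean over all groups.
print("overall", results["score"])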