66
66
class ExecutiveSummary :
67
67
"""
68
68
Summary of the output of benchmarking.
69
- This is always loaded by the frontend, so keep this small
69
+ This is always loaded by the frontend, so keep this small.
70
+
71
+ A note on the relation between `release`, `suites`, and `suite`:
72
+ There are two modes for releasing runs: `release` and `suite`.
73
+ `releases` contain a package of suites. When the `release` mode
74
+ is used, `release` and `suites` will not be None and `suite` will be None.
75
+ When `suite` mode is used, `suite` will not be None and `release`
76
+ and `suites` will be None.
70
77
"""
71
78
72
- suite : str
79
+ release : Optional [str ]
80
+ suites : Optional [List [str ]]
81
+ suite : Optional [str ]
73
82
date : str
74
83
75
84
# TODO: later, put model rankings, etc. here
@@ -244,12 +253,44 @@ class Summarizer:
244
253
"selective_acc@10" ,
245
254
}
246
255
247
- def __init__ (self , suite : str , output_path : str , verbose : bool , num_threads : int ):
248
- self .suite : str = suite
249
- self .run_suite_path : str = os .path .join (output_path , "runs" , suite )
256
+ def __init__ (
257
+ self ,
258
+ release : Optional [str ],
259
+ suites : Optional [List [str ]],
260
+ suite : Optional [str ],
261
+ output_path : str ,
262
+ verbose : bool ,
263
+ num_threads : int ,
264
+ ):
265
+ """
266
+ A note on the relation between `release`, `suites`, and `suite`:
267
+ There are two modes for releasing runs: `release` and `suite`.
268
+ `releases` contain a package of suites. When the `release` mode
269
+ is used, `release` and `suites` will not be None and `suite` will be None.
270
+ When `suite` mode is used, `suite` will not be None and `release`
271
+ and `suites` will be None.
272
+ """
273
+ self .output_path : str = output_path
274
+ self .run_release_path : str
275
+ self .suites : List [str ]
276
+ self .run_suite_paths : List [str ]
277
+ self .suite : Optional [str ] = None
278
+ self .release : Optional [str ] = None
279
+ if suite :
280
+ self .suite = suite
281
+ self .run_release_path = os .path .join (output_path , "runs" , suite )
282
+ self .run_suite_paths = [self .run_release_path ]
283
+ self .suites = [suite ]
284
+ elif release and suites :
285
+ self .release = release
286
+ self .suites = suites
287
+ self .run_release_path = os .path .join (output_path , "releases" , release )
288
+ self .run_suite_paths = [os .path .join (output_path , "runs" , suite ) for suite in suites ]
250
289
self .verbose : bool = verbose
251
290
self .num_threads : int = num_threads
252
291
292
+ ensure_directory_exists (self .run_release_path )
293
+
253
294
self .schema = read_schema ()
254
295
self .contamination = read_contamination ()
255
296
validate_contamination (self .contamination , self .schema )
@@ -297,36 +338,48 @@ def filter_runs_by_visibility(self, runs: List[Run], group: RunGroup) -> List[Ru
297
338
filtered_runs .append (run )
298
339
return filtered_runs
299
340
300
- def read_runs (self ):
341
+ def read_runs_for_suite (self , suite , run_suite_path ):
301
342
"""Load the runs in the run suite path."""
302
- self .runs : List [Run ] = []
303
343
# run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
304
344
# so filter them out.
305
- run_dir_names = sorted ([p for p in os .listdir (self . run_suite_path ) if p != "eval_cache" and p != "groups" ])
345
+ run_dir_names = sorted ([p for p in os .listdir (run_suite_path ) if p != "eval_cache" and p != "groups" ])
306
346
for run_dir_name in tqdm (run_dir_names , disable = None ):
307
- run_spec_path : str = os .path .join (self . run_suite_path , run_dir_name , "run_spec.json" )
308
- stats_path : str = os .path .join (self . run_suite_path , run_dir_name , "stats.json" )
347
+ run_spec_path : str = os .path .join (run_suite_path , run_dir_name , "run_spec.json" )
348
+ stats_path : str = os .path .join (run_suite_path , run_dir_name , "stats.json" )
309
349
if not os .path .exists (run_spec_path ) or not os .path .exists (stats_path ):
310
350
hlog (f"WARNING: { run_dir_name } doesn't have run_spec.json or stats.json, skipping" )
311
351
continue
312
- run_path : str = os .path .join (self . run_suite_path , run_dir_name )
352
+ run_path : str = os .path .join (run_suite_path , run_dir_name )
313
353
self .runs .append (self .read_run (run_path ))
314
354
315
355
# For each group (e.g., natural_qa), map
316
356
# (i) scenario spec (e.g., subject=philosophy) [optional] and
317
357
# (ii) adapter spec (e.g., model = openai/davinci)
318
358
# to list of runs
319
- self .group_adapter_to_runs : Dict [str , Dict [AdapterSpec , List [Run ]]] = defaultdict (lambda : defaultdict (list ))
320
- self .group_scenario_adapter_to_runs : Dict [str , Dict [ScenarioSpec , Dict [AdapterSpec , List [Run ]]]] = defaultdict (
321
- lambda : defaultdict (lambda : defaultdict (list ))
322
- )
323
359
for run in self .runs :
360
+ if run .run_spec .name in self .runs_to_run_suites :
361
+ hlog (
362
+ f"WARNING: Run entry { run .run_spec .name } is present in two different Run Suites. "
363
+ f"Defaulting to the latest assigned suite: { suite } "
364
+ )
365
+ self .runs_to_run_suites [run .run_spec .name ] = suite
366
+
324
367
scenario_spec = run .run_spec .scenario_spec
325
368
adapter_spec = run .run_spec .adapter_spec
326
369
for group_name in run .run_spec .groups :
327
370
self .group_adapter_to_runs [group_name ][adapter_spec ].append (run )
328
371
self .group_scenario_adapter_to_runs [group_name ][scenario_spec ][adapter_spec ].append (run )
329
372
373
+ def read_runs (self ):
374
+ self .runs : List [Run ] = []
375
+ self .runs_to_run_suites : Dict [str , str ] = {}
376
+ self .group_adapter_to_runs : Dict [str , Dict [AdapterSpec , List [Run ]]] = defaultdict (lambda : defaultdict (list ))
377
+ self .group_scenario_adapter_to_runs : Dict [str , Dict [ScenarioSpec , Dict [AdapterSpec , List [Run ]]]] = defaultdict (
378
+ lambda : defaultdict (lambda : defaultdict (list ))
379
+ )
380
+ for suite , run_suite_path in zip (self .suites , self .run_suite_paths ):
381
+ self .read_runs_for_suite (suite , run_suite_path )
382
+
330
383
def read_overlap_stats (self ):
331
384
"""
332
385
Load the overlap stats in the run suite path.
@@ -391,7 +444,7 @@ def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
391
444
392
445
self ._model_group_overlap_stats : Dict [Tuple [str , str ], GroupOverlapStats ] = {}
393
446
394
- data_overlap_dir = os .path .join (self .run_suite_path , "data_overlap" )
447
+ data_overlap_dir = os .path .join (self .run_release_path , "data_overlap" )
395
448
if not os .path .isdir (data_overlap_dir ):
396
449
hlog (f"Directory { data_overlap_dir } not found; skipped import of overlap results." )
397
450
return
@@ -481,11 +534,14 @@ def write_executive_summary(self):
481
534
date = datetime .date .today ().strftime ("%Y-%m-%d" )
482
535
483
536
summary = ExecutiveSummary (
537
+ release = self .release ,
538
+ suites = self .suites ,
484
539
suite = self .suite ,
485
540
date = date ,
486
541
)
542
+
487
543
write (
488
- os .path .join (self .run_suite_path , "summary.json" ),
544
+ os .path .join (self .run_release_path , "summary.json" ),
489
545
json .dumps (asdict_without_nones (summary ), indent = 2 ),
490
546
)
491
547
@@ -507,22 +563,28 @@ def write_cost_report(self):
507
563
costs ["total_tokens" ] = costs ["num_prompt_tokens" ] + costs ["num_completion_tokens" ]
508
564
509
565
write (
510
- os .path .join (self .run_suite_path , "costs.json" ),
566
+ os .path .join (self .run_release_path , "costs.json" ),
511
567
json .dumps (models_to_costs , indent = 2 ),
512
568
)
513
569
514
570
def write_runs (self ):
515
571
write (
516
- os .path .join (self .run_suite_path , "runs.json" ),
572
+ os .path .join (self .run_release_path , "runs.json" ),
517
573
json .dumps (list (map (asdict_without_nones , self .runs )), indent = 2 ),
518
574
)
519
575
520
576
def write_run_specs (self ):
521
577
write (
522
- os .path .join (self .run_suite_path , "run_specs.json" ),
578
+ os .path .join (self .run_release_path , "run_specs.json" ),
523
579
json .dumps (list (map (asdict_without_nones , [run .run_spec for run in self .runs ])), indent = 2 ),
524
580
)
525
581
582
+ def write_runs_to_run_suites (self ):
583
+ write (
584
+ os .path .join (self .run_release_path , "runs_to_run_suites.json" ),
585
+ json .dumps (self .runs_to_run_suites , indent = 2 ),
586
+ )
587
+
526
588
def expand_subgroups (self , group : RunGroup ) -> List [RunGroup ]:
527
589
"""Given a RunGroup, collect a list of its subgroups by traversing the subgroup tree."""
528
590
@@ -1048,18 +1110,18 @@ def write_groups(self):
1048
1110
1049
1111
# Write out index file with all the groups and basic stats
1050
1112
write (
1051
- os .path .join (self .run_suite_path , "groups.json" ),
1113
+ os .path .join (self .run_release_path , "groups.json" ),
1052
1114
json .dumps (list (map (asdict_without_nones , self .create_index_tables ())), indent = 2 ),
1053
1115
)
1054
1116
1055
1117
# Write out metadata file for all groups
1056
1118
write (
1057
- os .path .join (self .run_suite_path , "groups_metadata.json" ),
1119
+ os .path .join (self .run_release_path , "groups_metadata.json" ),
1058
1120
json .dumps (self .create_groups_metadata (), indent = 2 ),
1059
1121
)
1060
1122
1061
1123
# Write out a separate JSON for each group
1062
- groups_path = os .path .join (self .run_suite_path , "groups" )
1124
+ groups_path = os .path .join (self .run_release_path , "groups" )
1063
1125
ensure_directory_exists (groups_path )
1064
1126
for group in self .schema .run_groups :
1065
1127
if group .subgroup_display_mode == BY_GROUP or len (self .expand_subgroups (group )) == 1 :
@@ -1114,7 +1176,7 @@ def read_scenario_spec_instance_ids(self, num_instances) -> None:
1114
1176
"""
1115
1177
self .scenario_spec_instance_id_dict : Dict [ScenarioSpec , List [str ]] = dict ()
1116
1178
1117
- data_overlap_dir = os .path .join (self .run_suite_path , "data_overlap" )
1179
+ data_overlap_dir = os .path .join (self .run_release_path , "data_overlap" )
1118
1180
if not os .path .isdir (data_overlap_dir ):
1119
1181
hlog (f"Directory { data_overlap_dir } not found; skipped producing instance ids file." )
1120
1182
return
@@ -1163,18 +1225,16 @@ def write_scenario_spec_instance_ids_json(self, file_path) -> None:
1163
1225
for scenario_spec_instance_ids in all_scenario_spec_instance_ids
1164
1226
)
1165
1227
1166
-
1167
- def symlink_latest (output_path : str , suite : str ) -> None :
1168
- # Create a symlink runs/latest -> runs/<name_of_suite>,
1169
- # so runs/latest always points to the latest run suite.
1170
- runs_dir : str = os .path .join (output_path , "runs" )
1171
- suite_dir : str = os .path .join (runs_dir , suite )
1172
- symlink_path : str = os .path .abspath (os .path .join (runs_dir , LATEST_SYMLINK ))
1173
- hlog (f"Symlinking { suite_dir } to { LATEST_SYMLINK } ." )
1174
- if os .path .islink (symlink_path ):
1175
- # Remove the previous symlink if it exists.
1176
- os .unlink (symlink_path )
1177
- os .symlink (os .path .abspath (suite_dir ), symlink_path )
1228
+ def symlink_latest (self ) -> None :
1229
+ # Create a LATEST_SYMLINK link next to the output directory (under runs/ in suite mode,
1230
+ # under releases/ in release mode), so it always points to the most recently summarized suite or release.
1231
+ releases_dir : str = os .path .dirname (self .run_release_path )
1232
+ symlink_path : str = os .path .abspath (os .path .join (releases_dir , LATEST_SYMLINK ))
1233
+ hlog (f"Symlinking { self .run_release_path } to { LATEST_SYMLINK } ." )
1234
+ if os .path .islink (symlink_path ):
1235
+ # Remove the previous symlink if it exists.
1236
+ os .unlink (symlink_path )
1237
+ os .symlink (os .path .abspath (self .run_release_path ), symlink_path )
1178
1238
1179
1239
1180
1240
@htrack (None )
@@ -1186,8 +1246,15 @@ def main():
1186
1246
parser .add_argument (
1187
1247
"--suite" ,
1188
1248
type = str ,
1189
- help = "Name of the suite this run belongs to (default is today's date)." ,
1190
- required = True ,
1249
+ help = "Name of the suite this summarization should go under." ,
1250
+ )
1251
+ parser .add_argument (
1252
+ "--release" ,
1253
+ type = str ,
1254
+ help = "Experimental: Name of the release this summarization should go under." ,
1255
+ )
1256
+ parser .add_argument (
1257
+ "--suites" , type = str , nargs = "+" , help = "Experimental: List of suites to summarize for this this release."
1191
1258
)
1192
1259
parser .add_argument ("-n" , "--num-threads" , type = int , help = "Max number of threads used to summarize" , default = 8 )
1193
1260
parser .add_argument (
@@ -1208,9 +1275,35 @@ def main():
1208
1275
)
1209
1276
args = parser .parse_args ()
1210
1277
1278
+ release : Optional [str ] = None
1279
+ suites : Optional [str ] = None
1280
+ suite : Optional [str ] = None
1281
+ if args .suite and (args .release or args .suites ):
1282
+ raise ValueError ("If --suite is specified, then --release and --suites must NOT be specified." )
1283
+ elif args .suite :
1284
+ # Comment this out while we have a trial period for the `release` method.
1285
+ # hlog(
1286
+ # "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
1287
+ # "where --release specifies the name of a release and --suites specifies several run suites "
1288
+ # "to be included in that release."
1289
+ # )
1290
+ suite = args .suite
1291
+ elif args .release or args .suites :
1292
+ if not args .release or not args .suites :
1293
+ raise ValueError ("If --release is specified, then --suites must also be specified and vice versa" )
1294
+ release = args .release
1295
+ suites = args .suites
1296
+ else :
1297
+ raise ValueError ("Exactly one of --release or --suite must be specified." )
1298
+
1211
1299
# Output JSON files summarizing the benchmark results which will be loaded in the web interface
1212
1300
summarizer = Summarizer (
1213
- suite = args .suite , output_path = args .output_path , verbose = args .debug , num_threads = args .num_threads
1301
+ release = release ,
1302
+ suites = suites ,
1303
+ suite = suite ,
1304
+ output_path = args .output_path ,
1305
+ verbose = args .debug ,
1306
+ num_threads = args .num_threads ,
1214
1307
)
1215
1308
summarizer .read_runs ()
1216
1309
summarizer .check_metrics_defined ()
@@ -1228,10 +1321,11 @@ def main():
1228
1321
summarizer .write_executive_summary ()
1229
1322
summarizer .write_runs ()
1230
1323
summarizer .write_run_specs ()
1324
+ summarizer .write_runs_to_run_suites ()
1231
1325
summarizer .write_groups ()
1232
1326
summarizer .write_cost_report ()
1233
1327
1234
- symlink_latest (args . output_path , args . suite )
1328
+ summarizer . symlink_latest ()
1235
1329
hlog ("Done." )
1236
1330
1237
1331
0 commit comments