Skip to content

Commit 65fdf7a

Browse files
authored
Storage Cost Reduction (stanford-crfm#1657)
1 parent 4810bd5 commit 65fdf7a

File tree

7 files changed

+350
-152
lines changed

7 files changed

+350
-152
lines changed

json-urls-root.js

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/**
 * Base URL prefix under which the benchmark_output JSON files are looked up.
 */
const BENCHMARK_OUTPUT_BASE_URL = "benchmark_output";

json-urls.js

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
////////////////////////////////////////////////////////////
2+
// Helper functions for getting URLs of JSON files
3+
4+
/**
 * Build the URL of `run_manifest.json` for a release.
 *
 * @param release Release name; interpolated into the path as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function runManifestJsonUrl(release) {
  const releaseRoot = `${BENCHMARK_OUTPUT_BASE_URL}/releases/${release}`;
  return `${releaseRoot}/run_manifest.json`;
}
7+
8+
/**
 * Build the URL of `summary.json` for a release.
 *
 * @param release Release name; interpolated into the path as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function summaryJsonUrl(release) {
  const releaseRoot = `${BENCHMARK_OUTPUT_BASE_URL}/releases/${release}`;
  return `${releaseRoot}/summary.json`;
}
11+
12+
/**
 * Build the URL of `run_specs.json` for a release.
 *
 * @param release Release name; interpolated into the path as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function runSpecsJsonUrl(release) {
  const releaseRoot = `${BENCHMARK_OUTPUT_BASE_URL}/releases/${release}`;
  return `${releaseRoot}/run_specs.json`;
}
15+
16+
/**
 * Build the URL of `groups_metadata.json` for a release.
 *
 * @param release Release name; interpolated into the path as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function groupsMetadataJsonUrl(release) {
  const releaseRoot = `${BENCHMARK_OUTPUT_BASE_URL}/releases/${release}`;
  return `${releaseRoot}/groups_metadata.json`;
}
19+
20+
/**
 * Build the URL of `groups.json` (the group index) for a release.
 *
 * @param release Release name; interpolated into the path as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function groupsJsonUrl(release) {
  const releaseRoot = `${BENCHMARK_OUTPUT_BASE_URL}/releases/${release}`;
  return `${releaseRoot}/groups.json`;
}
23+
24+
/**
 * Build the URL of the per-group JSON file `groups/<groupName>.json` for a release.
 *
 * @param release Release name; interpolated into the path as-is.
 * @param groupName Group name; used as the file stem, interpolated as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function groupJsonUrl(release, groupName) {
  const groupsRoot = `${BENCHMARK_OUTPUT_BASE_URL}/releases/${release}/groups`;
  return `${groupsRoot}/${groupName}.json`;
}
27+
28+
/**
 * Build the URL of `run_spec.json` for a single run inside a suite.
 *
 * @param suite Suite name; interpolated into the path as-is.
 * @param runSpecName Run spec name (the run's directory name); interpolated as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function runSpecJsonUrl(suite, runSpecName) {
  // Terminate the statement explicitly, like every other helper in this file,
  // instead of relying on automatic semicolon insertion.
  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/run_spec.json`;
}
31+
32+
/**
 * Build the URL of `scenario.json` for a single run inside a suite.
 *
 * @param suite Suite name; interpolated into the path as-is.
 * @param runSpecName Run spec name (the run's directory name); interpolated as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function scenarioJsonUrl(suite, runSpecName) {
  const runRoot = `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}`;
  return `${runRoot}/scenario.json`;
}
35+
36+
/**
 * Build the URL of `scenario_state.json` for a single run inside a suite.
 *
 * @param suite Suite name; interpolated into the path as-is.
 * @param runSpecName Run spec name (the run's directory name); interpolated as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function scenarioStateJsonUrl(suite, runSpecName) {
  const runRoot = `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}`;
  return `${runRoot}/scenario_state.json`;
}
39+
40+
/**
 * Build the URL of `stats.json` for a single run inside a suite.
 *
 * @param suite Suite name; interpolated into the path as-is.
 * @param runSpecName Run spec name (the run's directory name); interpolated as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function statsJsonUrl(suite, runSpecName) {
  const runRoot = `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}`;
  return `${runRoot}/stats.json`;
}
43+
44+
/**
 * Build the URL of `instances.json` for a single run inside a suite.
 *
 * @param suite Suite name; interpolated into the path as-is.
 * @param runSpecName Run spec name (the run's directory name); interpolated as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function instancesJsonUrl(suite, runSpecName) {
  const runRoot = `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}`;
  return `${runRoot}/instances.json`;
}
47+
48+
/**
 * Build the URL of `display_predictions.json` for a single run inside a suite.
 *
 * @param suite Suite name; interpolated into the path as-is.
 * @param runSpecName Run spec name (the run's directory name); interpolated as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function predictionsJsonUrl(suite, runSpecName) {
  const runRoot = `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}`;
  return `${runRoot}/display_predictions.json`;
}
51+
52+
/**
 * Build the URL of `display_requests.json` for a single run inside a suite.
 *
 * @param suite Suite name; interpolated into the path as-is.
 * @param runSpecName Run spec name (the run's directory name); interpolated as-is.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function requestsJsonUrl(suite, runSpecName) {
  const runRoot = `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}`;
  return `${runRoot}/display_requests.json`;
}
55+
56+
/**
 * Build the URL of a PNG plot stored in a suite's `plots` directory.
 *
 * @param suite Suite name; interpolated into the path as-is.
 * @param plotName Plot name without extension; `.png` is appended.
 * @returns The URL string under BENCHMARK_OUTPUT_BASE_URL.
 */
function plotUrl(suite, plotName) {
  const plotsRoot = `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots`;
  return `${plotsRoot}/${plotName}.png`;
}

src/helm/benchmark/presentation/summarize.py

+134-40
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,19 @@
6666
class ExecutiveSummary:
6767
"""
6868
Summary of the output of benchmarking.
69-
This is always loaded by the frontend, so keep this small
69+
This is always loaded by the frontend, so keep this small.
70+
71+
A note on the relation between `release`, `suites`, and `suite`:
72+
There are two modes for releasing runs: `release` and `suite`.
73+
`releases` contain a package of suites. When the `release` mode
74+
is used, `release` and `suites` will not be None and `suite` will be None.
75+
When `suite` mode is used, `suite` will not be None and `release`
76+
and `suites` will be None
7077
"""
7178

72-
suite: str
79+
release: Optional[str]
80+
suites: Optional[List[str]]
81+
suite: Optional[str]
7382
date: str
7483

7584
# TODO: later, put model rankings, etc. here
@@ -244,12 +253,44 @@ class Summarizer:
244253
"selective_acc@10",
245254
}
246255

247-
def __init__(self, suite: str, output_path: str, verbose: bool, num_threads: int):
248-
self.suite: str = suite
249-
self.run_suite_path: str = os.path.join(output_path, "runs", suite)
256+
def __init__(
257+
self,
258+
release: Optional[str],
259+
suites: Optional[List[str]],
260+
suite: Optional[str],
261+
output_path: str,
262+
verbose: bool,
263+
num_threads: int,
264+
):
265+
"""
266+
A note on the relation between `release`, `suites`, and `suite`:
267+
There are two modes for releasing runs: `release` and `suite`.
268+
`releases` contain a package of suites. When the `release` mode
269+
is used, `release` and `suites` will not be None and `suite` will be None.
270+
When `suite` mode is used, `suite` will not be None and `release`
271+
and `suites` will be None
272+
"""
273+
self.output_path: str = output_path
274+
self.run_release_path: str
275+
self.suites: List[str]
276+
self.run_suite_paths: List[str]
277+
self.suite: Optional[str] = None
278+
self.release: Optional[str] = None
279+
if suite:
280+
self.suite = suite
281+
self.run_release_path = os.path.join(output_path, "runs", suite)
282+
self.run_suite_paths = [self.run_release_path]
283+
self.suites = [suite]
284+
elif release and suites:
285+
self.release = release
286+
self.suites = suites
287+
self.run_release_path = os.path.join(output_path, "releases", release)
288+
self.run_suite_paths = [os.path.join(output_path, "runs", suite) for suite in suites]
250289
self.verbose: bool = verbose
251290
self.num_threads: int = num_threads
252291

292+
ensure_directory_exists(self.run_release_path)
293+
253294
self.schema = read_schema()
254295
self.contamination = read_contamination()
255296
validate_contamination(self.contamination, self.schema)
@@ -297,36 +338,48 @@ def filter_runs_by_visibility(self, runs: List[Run], group: RunGroup) -> List[Ru
297338
filtered_runs.append(run)
298339
return filtered_runs
299340

300-
def read_runs(self):
341+
def read_runs_for_suite(self, suite, run_suite_path):
    """
    Load the runs stored under `run_suite_path` and register them under `suite`.

    Every entry of `run_suite_path` other than `eval_cache` and `groups` that
    contains both `run_spec.json` and `stats.json` is read with `self.read_run`
    and appended to `self.runs`; entries missing either file are skipped with a
    warning. Each run loaded by THIS call is then recorded in
    `self.runs_to_run_suites` and indexed in `self.group_adapter_to_runs` and
    `self.group_scenario_adapter_to_runs`.

    `self.runs`, `self.runs_to_run_suites` and the two group indexes must
    already exist (`read_runs` creates them before calling this method).

    Args:
        suite: Name of the suite the runs are attributed to.
        run_suite_path: Directory holding one subdirectory per run.
    """
    # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
    # so filter them out.
    run_dir_names = sorted(p for p in os.listdir(run_suite_path) if p not in ("eval_cache", "groups"))

    # Only the runs loaded by this call. `self.runs` accumulates across suites, so
    # iterating it below would re-register the runs of previously read suites:
    # spurious duplicate warnings, wrong suite attribution, and duplicated
    # entries in the group indexes.
    suite_runs: List[Run] = []
    for run_dir_name in tqdm(run_dir_names, disable=None):
        run_path: str = os.path.join(run_suite_path, run_dir_name)
        run_spec_path: str = os.path.join(run_path, "run_spec.json")
        stats_path: str = os.path.join(run_path, "stats.json")
        if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
            hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
            continue
        run = self.read_run(run_path)
        self.runs.append(run)
        suite_runs.append(run)

    # For each group (e.g., natural_qa), map
    # (i) scenario spec (e.g., subject=philosophy) [optional] and
    # (ii) adapter spec (e.g., model = openai/davinci)
    # to list of runs
    for run in suite_runs:
        if run.run_spec.name in self.runs_to_run_suites:
            hlog(
                f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
                f"Defaulting to the latest assigned suite: {suite}"
            )
        self.runs_to_run_suites[run.run_spec.name] = suite

        scenario_spec = run.run_spec.scenario_spec
        adapter_spec = run.run_spec.adapter_spec
        for group_name in run.run_spec.groups:
            self.group_adapter_to_runs[group_name][adapter_spec].append(run)
            self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
329372

373+
def read_runs(self):
    """
    Reset the run bookkeeping and load the runs of every configured suite.

    Re-creates `self.runs`, `self.runs_to_run_suites`, `self.group_adapter_to_runs`
    and `self.group_scenario_adapter_to_runs` as empty containers, then calls
    `self.read_runs_for_suite` once per (suite, path) pair taken in lockstep from
    `self.suites` and `self.run_suite_paths`.
    """

    def adapter_to_runs():
        # adapter spec -> list of runs
        return defaultdict(list)

    def scenario_to_adapter_to_runs():
        # scenario spec -> adapter spec -> list of runs
        return defaultdict(adapter_to_runs)

    self.runs: List[Run] = []
    self.runs_to_run_suites: Dict[str, str] = {}
    self.group_adapter_to_runs: Dict[str, Dict[AdapterSpec, List[Run]]] = defaultdict(adapter_to_runs)
    self.group_scenario_adapter_to_runs: Dict[str, Dict[ScenarioSpec, Dict[AdapterSpec, List[Run]]]] = defaultdict(
        scenario_to_adapter_to_runs
    )

    for suite_name, suite_path in zip(self.suites, self.run_suite_paths):
        self.read_runs_for_suite(suite_name, suite_path)
382+
330383
def read_overlap_stats(self):
331384
"""
332385
Load the overlap stats in the run suite path.
@@ -391,7 +444,7 @@ def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
391444

392445
self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
393446

394-
data_overlap_dir = os.path.join(self.run_suite_path, "data_overlap")
447+
data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
395448
if not os.path.isdir(data_overlap_dir):
396449
hlog(f"Directory {data_overlap_dir} not found; skipped import of overlap results.")
397450
return
@@ -481,11 +534,14 @@ def write_executive_summary(self):
481534
date = datetime.date.today().strftime("%Y-%m-%d")
482535

483536
summary = ExecutiveSummary(
537+
release=self.release,
538+
suites=self.suites,
484539
suite=self.suite,
485540
date=date,
486541
)
542+
487543
write(
488-
os.path.join(self.run_suite_path, "summary.json"),
544+
os.path.join(self.run_release_path, "summary.json"),
489545
json.dumps(asdict_without_nones(summary), indent=2),
490546
)
491547

@@ -507,22 +563,28 @@ def write_cost_report(self):
507563
costs["total_tokens"] = costs["num_prompt_tokens"] + costs["num_completion_tokens"]
508564

509565
write(
510-
os.path.join(self.run_suite_path, "costs.json"),
566+
os.path.join(self.run_release_path, "costs.json"),
511567
json.dumps(models_to_costs, indent=2),
512568
)
513569

514570
def write_runs(self):
    """Write every run in `self.runs` (with None fields dropped) to `runs.json` under `self.run_release_path`."""
    runs_json_path = os.path.join(self.run_release_path, "runs.json")
    serialized_runs = [asdict_without_nones(run) for run in self.runs]
    write(runs_json_path, json.dumps(serialized_runs, indent=2))
519575

520576
def write_run_specs(self):
    """Write the run spec of every run in `self.runs` (None fields dropped) to `run_specs.json` under `self.run_release_path`."""
    run_specs_json_path = os.path.join(self.run_release_path, "run_specs.json")
    serialized_run_specs = [asdict_without_nones(run.run_spec) for run in self.runs]
    write(run_specs_json_path, json.dumps(serialized_run_specs, indent=2))
525581

582+
def write_runs_to_run_suites(self):
    """Write the run-spec-name -> suite mapping (`self.runs_to_run_suites`) to `runs_to_run_suites.json` under `self.run_release_path`."""
    mapping_json_path = os.path.join(self.run_release_path, "runs_to_run_suites.json")
    mapping_json = json.dumps(self.runs_to_run_suites, indent=2)
    write(mapping_json_path, mapping_json)
587+
526588
def expand_subgroups(self, group: RunGroup) -> List[RunGroup]:
527589
"""Given a RunGroup, collect a list of its subgroups by traversing the subgroup tree."""
528590

@@ -1048,18 +1110,18 @@ def write_groups(self):
10481110

10491111
# Write out index file with all the groups and basic stats
10501112
write(
1051-
os.path.join(self.run_suite_path, "groups.json"),
1113+
os.path.join(self.run_release_path, "groups.json"),
10521114
json.dumps(list(map(asdict_without_nones, self.create_index_tables())), indent=2),
10531115
)
10541116

10551117
# Write out metadata file for all groups
10561118
write(
1057-
os.path.join(self.run_suite_path, "groups_metadata.json"),
1119+
os.path.join(self.run_release_path, "groups_metadata.json"),
10581120
json.dumps(self.create_groups_metadata(), indent=2),
10591121
)
10601122

10611123
# Write out a separate JSON for each group
1062-
groups_path = os.path.join(self.run_suite_path, "groups")
1124+
groups_path = os.path.join(self.run_release_path, "groups")
10631125
ensure_directory_exists(groups_path)
10641126
for group in self.schema.run_groups:
10651127
if group.subgroup_display_mode == BY_GROUP or len(self.expand_subgroups(group)) == 1:
@@ -1114,7 +1176,7 @@ def read_scenario_spec_instance_ids(self, num_instances) -> None:
11141176
"""
11151177
self.scenario_spec_instance_id_dict: Dict[ScenarioSpec, List[str]] = dict()
11161178

1117-
data_overlap_dir = os.path.join(self.run_suite_path, "data_overlap")
1179+
data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
11181180
if not os.path.isdir(data_overlap_dir):
11191181
hlog(f"Directory {data_overlap_dir} not found; skipped producing instance ids file.")
11201182
return
@@ -1163,18 +1225,16 @@ def write_scenario_spec_instance_ids_json(self, file_path) -> None:
11631225
for scenario_spec_instance_ids in all_scenario_spec_instance_ids
11641226
)
11651227

1166-
1167-
def symlink_latest(output_path: str, suite: str) -> None:
1168-
# Create a symlink runs/latest -> runs/<name_of_suite>,
1169-
# so runs/latest always points to the latest run suite.
1170-
runs_dir: str = os.path.join(output_path, "runs")
1171-
suite_dir: str = os.path.join(runs_dir, suite)
1172-
symlink_path: str = os.path.abspath(os.path.join(runs_dir, LATEST_SYMLINK))
1173-
hlog(f"Symlinking {suite_dir} to {LATEST_SYMLINK}.")
1174-
if os.path.islink(symlink_path):
1175-
# Remove the previous symlink if it exists.
1176-
os.unlink(symlink_path)
1177-
os.symlink(os.path.abspath(suite_dir), symlink_path)
1228+
def symlink_latest(self) -> None:
    """
    Point the `LATEST_SYMLINK` entry next to `self.run_release_path` at that directory.

    The link is created in the parent directory of `self.run_release_path`, so it
    sits beside whichever directory this summarizer wrote to. A previous symlink at
    that location is removed first; the link target is an absolute path.
    """
    parent_dir: str = os.path.dirname(self.run_release_path)
    latest_link: str = os.path.abspath(os.path.join(parent_dir, LATEST_SYMLINK))
    hlog(f"Symlinking {self.run_release_path} to {LATEST_SYMLINK}.")
    if os.path.islink(latest_link):
        # Drop the link left behind by an earlier summarization.
        os.unlink(latest_link)
    os.symlink(os.path.abspath(self.run_release_path), latest_link)
11781238

11791239

11801240
@htrack(None)
@@ -1186,8 +1246,15 @@ def main():
11861246
parser.add_argument(
11871247
"--suite",
11881248
type=str,
1189-
help="Name of the suite this run belongs to (default is today's date).",
1190-
required=True,
1249+
help="Name of the suite this summarization should go under.",
1250+
)
1251+
parser.add_argument(
1252+
"--release",
1253+
type=str,
1254+
help="Experimental: Name of the release this summarization should go under.",
1255+
)
1256+
parser.add_argument(
1257+
"--suites", type=str, nargs="+", help="Experimental: List of suites to summarize for this this release."
11911258
)
11921259
parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
11931260
parser.add_argument(
@@ -1208,9 +1275,35 @@ def main():
12081275
)
12091276
args = parser.parse_args()
12101277

1278+
release: Optional[str] = None
1279+
suites: Optional[str] = None
1280+
suite: Optional[str] = None
1281+
if args.suite and (args.release or args.suites):
1282+
raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
1283+
elif args.suite:
1284+
# Comment this out while we have a trial period for the `release` method.
1285+
# hlog(
1286+
# "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
1287+
# "where --release specifies the name of a release and --suites specifies several run suites "
1288+
# "to be included in that release."
1289+
# )
1290+
suite = args.suite
1291+
elif args.release or args.suites:
1292+
if not args.release or not args.suites:
1293+
raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
1294+
release = args.release
1295+
suites = args.suites
1296+
else:
1297+
raise ValueError("Exactly one of --release or --suite must be specified.")
1298+
12111299
# Output JSON files summarizing the benchmark results which will be loaded in the web interface
12121300
summarizer = Summarizer(
1213-
suite=args.suite, output_path=args.output_path, verbose=args.debug, num_threads=args.num_threads
1301+
release=release,
1302+
suites=suites,
1303+
suite=suite,
1304+
output_path=args.output_path,
1305+
verbose=args.debug,
1306+
num_threads=args.num_threads,
12141307
)
12151308
summarizer.read_runs()
12161309
summarizer.check_metrics_defined()
@@ -1228,10 +1321,11 @@ def main():
12281321
summarizer.write_executive_summary()
12291322
summarizer.write_runs()
12301323
summarizer.write_run_specs()
1324+
summarizer.write_runs_to_run_suites()
12311325
summarizer.write_groups()
12321326
summarizer.write_cost_report()
12331327

1234-
symlink_latest(args.output_path, args.suite)
1328+
summarizer.symlink_latest()
12351329
hlog("Done.")
12361330

12371331

0 commit comments

Comments
 (0)