
Commit c2ee966

Rename modules and commands
1 parent: 9eb099c

285 files changed: +581 -550 lines changed


.github/workflows/test.yml

+2 -2

@@ -47,13 +47,13 @@ jobs:
       - run: ./pre-commit-venv.sh
       - name: Run tests
         # Skip ICE tokenizer tests. GHA is having trouble downloading ice_text.model.
-        run: venv/bin/pytest --ignore src/benchmark/window_services/test_ice_window_service.py --ignore src/proxy/clients/test_ice_tokenizer_client.py
+        run: venv/bin/pytest --ignore src/helm/benchmark/window_services/test_ice_window_service.py --ignore src/helm/proxy/clients/test_ice_tokenizer_client.py
         env:
           TEST: ${{ matrix.test }}
           VERSION: ${{ github.head_ref || 'main' }}
       - name: Run entire pipeline quickly without any data
         # Checking RunSpecs with openai/davinci should be comprehensive enough
-        run: venv/bin/benchmark-present --suite test -m 100 --skip-instances --models-to-run openai/davinci --exit-on-error
+        run: venv/bin/helm-run --suite test -m 100 --skip-instances --models-to-run openai/davinci --exit-on-error

   ci:
     name: All CI tasks complete

MANIFEST.in

+2 -2

@@ -1,3 +1,3 @@
-recursive-include src/benchmark/efficiency_data/ *.json
-recursive-include src/benchmark/static/ *.css *.html *.js *.png *.yaml
+recursive-include src/helm/benchmark/efficiency_data/ *.json
+recursive-include src/helm/benchmark/static/ *.css *.html *.js *.png *.yaml
 include requirements.txt

demo.py

+6 -6

@@ -1,11 +1,11 @@
 import getpass

-from common.authentication import Authentication
-from common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
-from common.request import Request, RequestResult
-from common.tokenization_request import TokenizationRequest, TokenizationRequestResult
-from proxy.accounts import Account
-from proxy.services.remote_service import RemoteService
+from helm.common.authentication import Authentication
+from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
+from helm.common.request import Request, RequestResult
+from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
+from helm.proxy.accounts import Account
+from helm.proxy.services.remote_service import RemoteService

 # An example of how to use the request API.
 api_key = getpass.getpass(prompt="Enter a valid API key: ")
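
For context, the rest of demo.py exercises these renamed imports along these lines (a minimal sketch, assuming the hosted proxy endpoint and the Request fields of this era; the prompt and model choice are illustrative, not part of this commit):

    import getpass

    from helm.common.authentication import Authentication
    from helm.common.request import Request, RequestResult
    from helm.proxy.services.remote_service import RemoteService

    # Assumed endpoint; point this at whichever proxy server you run.
    service = RemoteService("https://crfm-models.stanford.edu")
    auth = Authentication(api_key=getpass.getpass(prompt="Enter a valid API key: "))

    # One completion request through the renamed helm.* package.
    request = Request(model="openai/davinci", prompt="Life is like a box of")
    result: RequestResult = service.make_request(auth, request)
    print(result.completions)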

docs/benchmark.md

+11 -11

@@ -11,16 +11,16 @@ directory exists.
 To try to test things out a small subset (defined in `run_specs_small.conf`) with just 10 eval instances:

     # Just load the config file
-    venv/bin/benchmark-present --conf src/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE --skip-instances
+    venv/bin/helm-run --conf src/helm/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE --skip-instances

     # Create the instances and the requests, but don't execute
-    venv/bin/benchmark-present --conf src/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE --dry-run
+    venv/bin/helm-run --conf src/helm/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE --dry-run

     # Execute the requests and compute metrics
-    venv/bin/benchmark-present --conf src/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE
+    venv/bin/helm-run --conf src/helm/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE

     # Generate assets for the website
-    venv/bin/benchmark-summarize --suite $SUITE
+    venv/bin/helm-summarize --suite $SUITE

 Notes:
 - `--local` means we bypass the proxy server.

@@ -30,7 +30,7 @@ To run everything (note we're restricting the number of instances and
 scenarios) in parallel:

     # Generate all the commands to run in parallel
-    venv/bin/benchmark-present --local --suite $SUITE --max-eval-instances 1000 --priority 2 --num-threads 8 --skip-instances
+    venv/bin/helm-run --local --suite $SUITE --max-eval-instances 1000 --priority 2 --num-threads 8 --skip-instances

     # Run everything in parallel over Slurm
     bash benchmark_output/runs/$SUITE/run-all.sh

@@ -39,11 +39,11 @@ scenarios) in parallel:
     # tail benchmark_output/runs/$SUITE/slurm-*.out

     # Generate assets for the website
-    venv/bin/benchmark-present --local --suite $SUITE --max-eval-instances 1000 --skip-instances
-    venv/bin/benchmark-summarize --suite $SUITE
+    venv/bin/helm-run --local --suite $SUITE --max-eval-instances 1000 --skip-instances
+    venv/bin/helm-summarize --suite $SUITE

     # Run a simple Python server to make sure things work at http://localhost:8000
-    benchmark-server
+    helm-server

     # Copy all website assets to the `www` directory, which can be copied to GitHub pages for static serving.
     sh scripts/create-www.sh $SUITE

@@ -56,15 +56,15 @@ Once everytihng has been sanity checked, push `www` to a GitHub page.

 To estimate token usage without making any requests, append the `--dry-run` option:

-    venv/bin/benchmark-run -r <RunSpec to estimate token usage> --suite $SUITE --max-eval-instances <Number of eval instances> --dry-run
+    venv/bin/helm-run -r <RunSpec to estimate token usage> --suite $SUITE --max-eval-instances <Number of eval instances> --dry-run

 and check the output in `benchmark_output/runs/$SUITE`.


 where `sum` indicates the estimated total number of tokens used for the specific `RunSpec`.

 For the OpenAI models, we use a
-[GPT-2 Tokenizer](https://github.com/stanford-crfm/benchmarking/blob/master/src/proxy/tokenizer/openai_token_counter.py#L12)
+[GPT-2 Tokenizer](https://github.com/stanford-crfm/benchmarking/blob/master/src/helm/proxy/tokenizer/openai_token_counter.py#L12)
 to estimate the token usage. The tokenizer will be downloaded and cached when running a dry run.

 ## Final benchmarking (Infrastructure team only)

@@ -115,5 +115,5 @@ to estimate the token usage. The tokenizer will be downloaded and cached when ru
 1. Create a screen session: `screen -S reproducible`.
 1. `conda activate crfm_benchmarking`.
 1. Run `python3 scripts/verify_reproducibility.py --models-to-run openai/davinci openai/code-cushman-001 together/gpt-neox-20b
-   --conf-path src/benchmark/presentation/run_specs.conf --max-eval-instances 1000 --priority 2 &> reproducible.log`.
+   --conf-path src/helm/benchmark/presentation/run_specs.conf --max-eval-instances 1000 --priority 2 &> reproducible.log`.
 1. Check the result at `reproducible.log`.

docs/code.md

+6 -6

@@ -64,7 +64,7 @@ In order to implement new scenarios:
    `__init__` function even if it is simply `pass`.
 7. Define a function `get_specname_spec` in `run_specs.py` to retrieve a `ScenarioSpec`
    for your scenario using a class name corresponding to the Python path of
-   the class (e.g. `benchmark.scenarios.your_scenario.YourScenario`) and any
+   the class (e.g. `helm.benchmark.scenarios.your_scenario.YourScenario`) and any
    arguments which must be passed as a dictionary of `args`.
 8. Have the `get_specname_spec` function retrieve an `AdapterSpec` for your
    scenario specifying the type of language model generation which must be
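
(For step 7 in the hunk above, such a spec function would look roughly like the sketch below under the renamed package; the function name, scenario path, and args are placeholders, and `ScenarioSpec` is assumed to live in `helm.benchmark.scenarios.scenario`.)

    from helm.benchmark.scenarios.scenario import ScenarioSpec

    def get_yourscenario_spec(arg: str) -> ScenarioSpec:
        # class_name is the full Python path of the scenario class under the
        # new helm.* layout; scenario arguments are passed as a dictionary.
        return ScenarioSpec(
            class_name="helm.benchmark.scenarios.your_scenario.YourScenario",
            args={"arg": arg},
        )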
@@ -81,10 +81,10 @@ In order to implement new scenarios:
     and `groups`.
 12. Add the scenario to `__init__.py`
 13. Attempt to run your task with
-    `venv/bin/benchmark-run -r yourscenarioname:arg=value` where
+    `venv/bin/helm-run -r yourscenarioname:arg=value` where
     `yourscenarioname` matches the `name` specified in YourScenario
 14. Add the spec to dictionary `CANONICAL_RUN_SPEC_FUNCS` in `run_specs.py`.
-15. Update `src/proxy/static/contamination.yaml` with models that we trained on your scenario (i.e. contaminated).
+15. Update `src/helm/proxy/static/contamination.yaml` with models that we trained on your scenario (i.e. contaminated).


 ## Adding new metrics
@@ -113,7 +113,7 @@ example:
 data_augmenter_spec = DataAugmenterSpec(
     perturbation_specs=[
         PerturbationSpec(
-            class_name="benchmark.augmentations.perturbation.ExtraSpacePerturbation",
+            class_name="helm.benchmark.augmentations.perturbation.ExtraSpacePerturbation",
             args={"num_spaces": 5},
         )
     ],
@@ -138,12 +138,12 @@ multiple perturbations and applying it onto a single instance.

 ### Adding a new perturbation

-To add a new perturbation to the framework, create a new file at `src/benchmark/augmentations` with the name
+To add a new perturbation to the framework, create a new file at `src/helm/benchmark/augmentations` with the name
 `<Name of perturbation>_perturbation.py` e.g., `typo_perturbation.py`. Inside the file, create a new class
 (name it `<Name of the perturbation>Perturbation` e.g., `TypoPerturbation`)
 that extends the abstract class `Perturbation` and implement the `perturb` method which
 takes in text and outputs the perturbed text.
-Add your new perturbation to `src/benchmark/__init__.py`.
+Add your new perturbation to `src/helm/benchmark/__init__.py`.
 Add a test for the new perturbation in `test_perturbation.py`.

 ## Supporting new Hugging Face tokenizers
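
As a concrete illustration of that workflow under the renamed layout, a minimal sketch of a perturbation (the class, its `name`, and its behavior are invented for illustration; the `Perturbation` base class is assumed to sit at the path shown in the diff above):

    from helm.benchmark.augmentations.perturbation import Perturbation

    class ShoutPerturbation(Perturbation):
        """Hypothetical perturbation that upper-cases the input text."""

        # Referenced by PerturbationSpec / run specs.
        name: str = "shout"

        def perturb(self, text: str) -> str:
            # Per docs/code.md: take in text, output the perturbed text.
            return text.upper()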

docs/deployment.md

+1 -1

@@ -109,7 +109,7 @@ If everything looks okay:

     # Hit ctrl-c to kill the existing process
     # Restart the server
-    sudo venv/bin/proxy-server -p 443 --ssl-key-file /home/ssl/private.key --ssl-cert-file /home/ssl/crfm-models.crt --workers 16 &> server.log
+    sudo venv/bin/crfm-proxy-server -p 443 --ssl-key-file /home/ssl/private.key --ssl-cert-file /home/ssl/crfm-models.crt --workers 16 &> server.log

     # Exit the screen session: ctrl-ad
docs/proxy-server.md

+2 -2

@@ -19,7 +19,7 @@ models you have access to.

 To start a local server (go to `http://localhost:1959` to try it out):

-    venv/bin/proxy-server
+    venv/bin/crfm-proxy-server

 When starting the server for the first time, the server will create an admin account
 with the API key: `root`.

@@ -30,4 +30,4 @@ default admin account.

 Bypass the added security that restricts multithreading by running:

-    OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/proxy-server
+    OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/crfm-proxy-server
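
A quick way to sanity-check a freshly started local server (a sketch, assuming the default `root` admin key mentioned above and that `RemoteService` exposes a `get_account` call, as this era's client does):

    from helm.common.authentication import Authentication
    from helm.proxy.services.remote_service import RemoteService

    # Local server started with venv/bin/crfm-proxy-server.
    service = RemoteService("http://localhost:1959")
    # `root` is the default admin API key created on first startup.
    account = service.get_account(Authentication(api_key="root"))
    print(account)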

scripts/benchmark-present-all.sh

+8 -8

@@ -1,17 +1,17 @@
 : '
-Run RunSpecs in parallel by models using benchmark-present. To bypass the proxy server
+Run RunSpecs in parallel by models using helm-run. To bypass the proxy server
 and run in root mode, append --local.

 Usage:

-    bash scripts/benchmark-present-all.sh <Any additional CLI arguments for benchmark-present>
+    bash scripts/helm-run-all.sh <Any additional CLI arguments for helm-run>

 e.g.,
-    bash scripts/benchmark-present-all.sh --max-eval-instances 1000 --num-threads 1 --priority 2 --local
+    bash scripts/helm-run-all.sh --max-eval-instances 1000 --num-threads 1 --priority 2 --local

 To kill a running process:

-    ps -A | grep benchmark-present
+    ps -A | grep helm-run
     kill <pid>
 '

@@ -21,11 +21,11 @@ function execute {
     eval "time $1"
 }

-# NOTE: this script is deprecated. Use the `run-all.sh` generated by `benchmark-present` instead.
+# NOTE: this script is deprecated. Use the `run-all.sh` generated by `helm-run` instead.

 # Perform dry run with just a single model to download and cache all the datasets
 # Override with passed-in CLI arguments
-# execute "benchmark-present --models-to-run openai/davinci openai/code-davinci-001 --dry-run --suite dryrun $* &> dryrun.log"
+# execute "helm-run --models-to-run openai/davinci openai/code-davinci-001 --dry-run --suite dryrun $* &> dryrun.log"

 models=(
     "ai21/j1-jumbo"

@@ -68,7 +68,7 @@ do
     logfile="${logfile// /_}" # Replace spaces

     # Override with passed-in CLI arguments
-    # By default, the command will run the RunSpecs listed in src/benchmark/presentation/run_specs.conf
+    # By default, the command will run the RunSpecs listed in src/helm/benchmark/presentation/run_specs.conf
     # and output results to `benchmark_output/runs/<Today's date e.g., 06-28-2022>`.
-    execute "benchmark-present --models-to-run $model $* &> $logfile.log &"
+    execute "helm-run --models-to-run $model $* &> $logfile.log &"
 done

scripts/cache/fix_anthropic_cache.py

+4 -4

@@ -5,10 +5,10 @@

 from sqlitedict import SqliteDict

-from common.cache import key_to_request, request_to_key, SqliteCacheConfig
-from common.general import parse_hocon
-from common.hierarchical_logger import hlog, htrack
-from proxy.clients.anthropic_client import AnthropicClient
+from helm.common.cache import key_to_request, request_to_key, SqliteCacheConfig
+from helm.common.general import parse_hocon
+from helm.common.hierarchical_logger import hlog, htrack
+from helm.proxy.clients.anthropic_client import AnthropicClient

 """
 Fix the Anthropic cache with one of two commands:

scripts/cache/fix_together_cache.py

+2 -2

@@ -2,8 +2,8 @@

 from pymongo import MongoClient

-from common.cache import create_key_value_store, MongoCacheConfig
-from common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.cache import create_key_value_store, MongoCacheConfig
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block

 """
 Fix the Together cache. For each entry in the cache:

scripts/cache/remove_together_api_entries.py

+2 -2

@@ -1,7 +1,7 @@
 import argparse

-from common.cache import create_key_value_store, MongoCacheConfig
-from common.hierarchical_logger import hlog, htrack
+from helm.common.cache import create_key_value_store, MongoCacheConfig
+from helm.common.hierarchical_logger import hlog, htrack

 """
 Removes Together API entries from cache.

scripts/create-www.sh

+1 -1

@@ -12,7 +12,7 @@ echo Copying suite $suite into www...

 # Copy code (note: follow symlinks)
 mkdir -p www || exit 1
-rsync -pLrvz --exclude=benchmark_output src/benchmark/static/* www || exit 1
+rsync -pLrvz --exclude=benchmark_output src/helm/benchmark/static/* www || exit 1

 # Copy data
 mkdir -p www/benchmark_output/runs || exit 1

scripts/efficiency/generate_instances.py

+6 -6

@@ -9,21 +9,21 @@
 import os
 from typing import Dict, List, Tuple

-from common.general import ensure_directory_exists, ensure_file_downloaded, parse_hocon, write
-from common.tokenization_request import (
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded, parse_hocon, write
+from helm.common.tokenization_request import (
     TokenizationRequest,
     TokenizationRequestResult,
     DecodeRequest,
     DecodeRequestResult,
     TokenizationToken,
 )
-from proxy.clients.client import Client
-from proxy.clients.auto_client import AutoClient
-from proxy.services.service import (
+from helm.proxy.clients.client import Client
+from helm.proxy.clients.auto_client import AutoClient
+from helm.proxy.services.service import (
     CREDENTIALS_FILE,
     CACHE_DIR,
 )
-from benchmark.scenarios.synthetic_efficiency_scenario import NUM_INPUT_TOKENS
+from helm.benchmark.scenarios.synthetic_efficiency_scenario import NUM_INPUT_TOKENS

 MAX_ITERS = 5

scripts/generate-together-requests.sh

+4 -4

@@ -4,14 +4,14 @@ The dry run results will be outputted to benchmark_output/runs/together.

 Usage:

-    bash scripts/generate-together-requests.sh <Any additional CLI arguments for benchmark-present>
+    bash scripts/generate-together-requests.sh <Any additional CLI arguments for helm-run>

 e.g.,
     bash scripts/generate-together-requests.sh --max-eval-instances 1000 --priority 2 --local

 To kill a running process:

-    ps -A | grep benchmark-present
+    ps -A | grep helm-run
     kill <pid>
 '

@@ -40,7 +40,7 @@ do
     logfile="${logfile// /_}" # Replace spaces

     # Override with passed-in CLI arguments
-    # By default, the command will run the RunSpecs listed in src/benchmark/presentation/run_specs.conf
+    # By default, the command will run the RunSpecs listed in src/helm/benchmark/presentation/run_specs.conf
     # and output results to `benchmark_output/runs/together`.
-    execute "benchmark-present --suite together --dry-run --models-to-run $model $* &> dryrun_$logfile.log &"
+    execute "helm-run --suite together --dry-run --models-to-run $model $* &> dryrun_$logfile.log &"
 done

scripts/offline_eval/export_requests.py

+5 -5

@@ -5,17 +5,17 @@
 from collections import Counter
 from dacite import from_dict

-from common.request import Request
-from common.cache import (
+from helm.common.request import Request
+from helm.common.cache import (
     KeyValueStoreCacheConfig,
     MongoCacheConfig,
     SqliteCacheConfig,
     create_key_value_store,
     request_to_key,
 )
-from common.hierarchical_logger import hlog, htrack, htrack_block
-from proxy.clients.together_client import TogetherClient
-from proxy.clients.microsoft_client import MicrosoftClient
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.proxy.clients.together_client import TogetherClient
+from helm.proxy.clients.microsoft_client import MicrosoftClient


 """

scripts/offline_eval/import_results.py

+2 -2

@@ -4,14 +4,14 @@
 import os
 from typing import Dict

-from common.cache import (
+from helm.common.cache import (
     KeyValueStoreCacheConfig,
     MongoCacheConfig,
     SqliteCacheConfig,
     create_key_value_store,
     request_to_key,
 )
-from common.hierarchical_logger import hlog, htrack
+from helm.common.hierarchical_logger import hlog, htrack


 """
