@@ -11,16 +11,16 @@ directory exists.
To try to test things out on a small subset (defined in `run_specs_small.conf`) with just 10 eval instances:
# Just load the config file
- venv/bin/benchmark-present --conf src/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE --skip-instances
+ venv/bin/helm-run --conf src/helm/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE --skip-instances
# Create the instances and the requests, but don't execute
- venv/bin/benchmark-present --conf src/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE --dry-run
+ venv/bin/helm-run --conf src/helm/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE --dry-run
# Execute the requests and compute metrics
- venv/bin/benchmark-present --conf src/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE
+ venv/bin/helm-run --conf src/helm/benchmark/presentation/run_specs_small.conf --local --max-eval-instances 10 --suite $SUITE
# Generate assets for the website
- venv/bin/benchmark-summarize --suite $SUITE
+ venv/bin/helm-summarize --suite $SUITE
Notes:
- `--local` means we bypass the proxy server.
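
For orientation, a run specs conf file pairs each `RunSpec` description with metadata such as a `priority`. The snippet below is an illustrative sketch only (the descriptions, field names, and exact schema are assumptions, not copied from the repo); consult `run_specs_small.conf` itself for the real entries:

```
# Illustrative sketch only -- check run_specs_small.conf for the actual entries and schema.
entries: [
  {description: "mmlu:subject=philosophy,model=openai/davinci", priority: 1}
  {description: "boolq:model=openai/davinci", priority: 2}
]
```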
@@ -30,7 +30,7 @@ To run everything (note we're restricting the number of instances and
scenarios) in parallel:
# Generate all the commands to run in parallel
- venv/bin/benchmark-present --local --suite $SUITE --max-eval-instances 1000 --priority 2 --num-threads 8 --skip-instances
+ venv/bin/helm-run --local --suite $SUITE --max-eval-instances 1000 --priority 2 --num-threads 8 --skip-instances
# Run everything in parallel over Slurm
bash benchmark_output/runs/$SUITE/run-all.sh
@@ -39,11 +39,11 @@ scenarios) in parallel:
# tail benchmark_output/runs/$SUITE/slurm-*.out
# Generate assets for the website
- venv/bin/benchmark-present --local --suite $SUITE --max-eval-instances 1000 --skip-instances
- venv/bin/benchmark-summarize --suite $SUITE
+ venv/bin/helm-run --local --suite $SUITE --max-eval-instances 1000 --skip-instances
+ venv/bin/helm-summarize --suite $SUITE
# Run a simple Python server to make sure things work at http://localhost:8000
- benchmark-server
+ helm-server
# Copy all website assets to the `www` directory, which can be copied to GitHub pages for static serving.
sh scripts/create-www.sh $SUITE
@@ -56,15 +56,15 @@ Once everything has been sanity checked, push `www` to a GitHub page.
To estimate token usage without making any requests, append the `--dry-run` option:
- venv/bin/benchmark-run -r <RunSpec to estimate token usage> --suite $SUITE --max-eval-instances <Number of eval instances> --dry-run
+ venv/bin/helm-run -r <RunSpec to estimate token usage> --suite $SUITE --max-eval-instances <Number of eval instances> --dry-run
and check the output in `benchmark_output/runs/$SUITE`.
where `sum` indicates the estimated total number of tokens used for the specific `RunSpec`.
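
If you want to pull those numbers out programmatically rather than reading the files by hand, a small script along the following lines can help. It assumes each run directory under `benchmark_output/runs/$SUITE` contains a `stats.json` whose entries expose a nested metric name and a `sum` aggregate; the file name, layout, and metric keys are assumptions and may differ in your checkout.

```python
# Hedged sketch: summarize token-related stats from a dry run.
# Assumes benchmark_output/runs/<suite>/<run>/stats.json exists and is a list of
# stat dicts with a nested {"name": {"name": ...}} and a "sum" field.
import json
from pathlib import Path


def report_token_estimates(suite: str, output_root: str = "benchmark_output/runs") -> None:
    for stats_path in sorted(Path(output_root, suite).glob("*/stats.json")):
        for stat in json.loads(stats_path.read_text()):
            name = stat.get("name", {}).get("name", "")
            if "token" in name:  # keep any token-usage metric, whatever it is called
                print(f"{stats_path.parent.name}\t{name}\tsum={stat.get('sum')}")


report_token_estimates("v1")  # replace "v1" with your $SUITE
```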
For the OpenAI models, we use a
- [GPT-2 Tokenizer](https://github.com/stanford-crfm/benchmarking/blob/master/src/proxy/tokenizer/openai_token_counter.py#L12)
+ [GPT-2 Tokenizer](https://github.com/stanford-crfm/benchmarking/blob/master/src/helm/proxy/tokenizer/openai_token_counter.py#L12)
to estimate the token usage. The tokenizer will be downloaded and cached when running a dry run.
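
As a rough standalone illustration of the idea (not the repo's own code, which lives at the linked `openai_token_counter.py`), you can estimate a prompt's token count with the Hugging Face GPT-2 tokenizer:

```python
# Standalone sketch: estimate token usage for a prompt with the GPT-2 tokenizer.
# Requires the `transformers` package; the tokenizer files are downloaded and
# cached on first use, mirroring what happens during a dry run.
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

prompt = "The quick brown fox jumps over the lazy dog."
num_tokens = len(tokenizer.encode(prompt))
print(f"Estimated tokens for this prompt: {num_tokens}")
```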
## Final benchmarking (Infrastructure team only)
@@ -115,5 +115,5 @@ to estimate the token usage. The tokenizer will be downloaded and cached when ru
1. Create a screen session: `screen -S reproducible`.
1. `conda activate crfm_benchmarking`.
1. Run `python3 scripts/verify_reproducibility.py --models-to-run openai/davinci openai/code-cushman-001 together/gpt-neox-20b
- --conf-path src/benchmark/presentation/run_specs.conf --max-eval-instances 1000 --priority 2 &> reproducible.log`.
+ --conf-path src/helm/benchmark/presentation/run_specs.conf --max-eval-instances 1000 --priority 2 &> reproducible.log`.
1. Check the result at `reproducible.log`.