
Commit 50e6565

Add the Tokenizer object logic (stanford-crfm#1874)
Parent: 020255d


54 files changed, +1411 -1221 lines

scripts/cache/fix_anthropic_cache.py

+4-1
@@ -8,6 +8,7 @@
 from helm.common.hierarchical_logger import hlog, htrack
 from helm.proxy.clients.anthropic_client import AnthropicLegacyClient
 from helm.proxy.retry import get_retry_decorator
+from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer


 """
@@ -47,7 +48,9 @@ def add_logprobs(mongo_uri: str, credentials_path: str, dry_run: bool):
     api_key: str = credentials["anthropicApiKey"]

     cache_config = MongoCacheConfig(mongo_uri, collection_name="anthropic")
-    client = AnthropicLegacyClient(api_key, cache_config)
+    client = AnthropicLegacyClient(
+        api_key=api_key, tokenizer=HuggingFaceTokenizer(cache_config), cache_config=cache_config
+    )

     with create_key_value_store(cache_config) as cache:
         for i, (request, response) in enumerate(cache.get_all()):
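
A minimal standalone sketch of the construction pattern in the hunk above: the client now receives a Tokenizer object instead of building one internally. The MongoCacheConfig import path, the URI, and the API key below are illustrative assumptions, not part of this commit.

# Sketch only: wiring a Tokenizer into AnthropicLegacyClient, mirroring the diff above.
from helm.common.cache import MongoCacheConfig  # assumed import path
from helm.proxy.clients.anthropic_client import AnthropicLegacyClient
from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

cache_config = MongoCacheConfig("mongodb://localhost:27017/helm", collection_name="anthropic")  # placeholder URI
tokenizer = HuggingFaceTokenizer(cache_config)
client = AnthropicLegacyClient(api_key="YOUR_ANTHROPIC_API_KEY", tokenizer=tokenizer, cache_config=cache_config)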

scripts/compute_request_limits.py

+24-27
@@ -10,6 +10,7 @@

 # TODO #1592: reenable this once the imports are faster
 # from helm.proxy.clients.client import Client
+from helm.proxy.tokenizers.tokenizer import Tokenizer

 import os
 import math
@@ -31,17 +32,17 @@ def get_credentials(path: str) -> Dict[str, str]:
     return credentials


-def get_number_of_tokens(prompt: str, tokenizer_client: Any, tokenizer_name: str) -> int:
+def get_number_of_tokens(prompt: str, tokenizer: Tokenizer, tokenizer_name: str) -> int:
     tokenization_request = TokenizationRequest(tokenizer=tokenizer_name, text=prompt, encode=True)
-    tokenization_response = tokenizer_client.tokenize(tokenization_request)
+    tokenization_response = tokenizer.tokenize(tokenization_request)
     return len(tokenization_response.tokens)


 def try_request(
     client: Any,
     model_name: str,
     tokenizer_name: str,
-    tokenizer_client: Any,
+    tokenizer: Tokenizer,
     sequence_length: int,
     num_tokens: int,
     prefix: str = "",
@@ -51,8 +52,8 @@ def try_request(
     Try to make a request with the given sequence_length and num_tokens.
     Return True if the request was successful, False otherwise.
     """
-    num_tokens_prefix = get_number_of_tokens(prefix, tokenizer_client, tokenizer_name)
-    num_tokens_suffix = get_number_of_tokens(suffix, tokenizer_client, tokenizer_name)
+    num_tokens_prefix = get_number_of_tokens(prefix, tokenizer, tokenizer_name)
+    num_tokens_suffix = get_number_of_tokens(suffix, tokenizer, tokenizer_name)

     try:
         request = Request(
@@ -76,25 +77,25 @@ class RequestLimits:


 def figure_out_max_prompt_length(
-    client: Any,  # Client,
+    client: AutoClient,
     model_name: str,
     tokenizer_name: str,
     upper_bound: int = 9500,
     lower_bound: int = 450,
     prefix: str = "",
     suffix: str = "",
 ) -> RequestLimits:
-    tokenizer_client = client._get_tokenizer_client(tokenizer_name)
-    num_tokens_prefix = get_number_of_tokens(prefix, tokenizer_client, tokenizer_name)
-    num_tokens_suffix = get_number_of_tokens(suffix, tokenizer_client, tokenizer_name)
+    tokenizer = client._get_tokenizer(tokenizer_name)
+    num_tokens_prefix = get_number_of_tokens(prefix, tokenizer, tokenizer_name)
+    num_tokens_suffix = get_number_of_tokens(suffix, tokenizer, tokenizer_name)

     # Perform a binary search to find the max tokens between lower_bound and upper_bound
     lower_bound += num_tokens_prefix + num_tokens_suffix
     pbar: tqdm
     with tqdm(total=int(math.log2(upper_bound - lower_bound))) as pbar:
         while lower_bound < upper_bound:
             middle = math.ceil((lower_bound + upper_bound) / 2)
-            if try_request(client, model_name, tokenizer_name, tokenizer_client, middle, 0, prefix, suffix):
+            if try_request(client, model_name, tokenizer_name, tokenizer, middle, 0, prefix, suffix):
                 lower_bound = middle
             else:
                 upper_bound = middle - 1
@@ -103,7 +104,7 @@ def figure_out_max_prompt_length(
     # Just in case the number of tokens does not match the number of words, check number of tokens with tokenizer
     max_prompt_length = get_number_of_tokens(
         prefix + " ".join(["hello"] * (lower_bound - num_tokens_prefix - num_tokens_suffix)) + suffix,
-        tokenizer_client,
+        tokenizer,
         tokenizer_name,
     )
     return RequestLimits(
@@ -122,7 +123,7 @@ def figure_out_max_prompt_length_plus_tokens(
     prefix: str = "",
     suffix: str = "",
 ) -> int:
-    tokenizer_client = client._get_tokenizer_client(tokenizer_name)
+    tokenizer = client._get_tokenizer(tokenizer_name)
     lower_bound = 1
     upper_bound = 2 * max_prompt_length + 1

@@ -131,7 +132,7 @@ def figure_out_max_prompt_length_plus_tokens(
         client,
         model_name,
         tokenizer_name,
-        tokenizer_client,
+        tokenizer,
         max_prompt_length,
         2**31 - 2 - max_prompt_length,
         prefix,
@@ -147,9 +148,7 @@ def figure_out_max_prompt_length_plus_tokens(
     with tqdm(total=int(math.log2(upper_bound - lower_bound))) as pbar:
         while lower_bound < upper_bound:
             middle = math.ceil((lower_bound + upper_bound) / 2)
-            if try_request(
-                client, model_name, tokenizer_name, tokenizer_client, max_prompt_length, middle, prefix, suffix
-            ):
+            if try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length, middle, prefix, suffix):
                 lower_bound = middle
             else:
                 upper_bound = middle - 1
@@ -159,39 +158,37 @@ def figure_out_max_prompt_length_plus_tokens(


 def check_limits(
-    client: Any,  # Client,
+    client: AutoClient,
     model_name: str,
     tokenizer_name: str,
     limits: RequestLimits,
     prefix: str = "",
     suffix: str = "",
 ) -> bool:
-    tokenizer_client = client._get_tokenizer_client(tokenizer_name)
+    tokenizer = client._get_tokenizer(tokenizer_name)
     result: bool = True

     # Check the max_prompt_length
     max_prompt_length = limits.max_prompt_length
     if max_prompt_length < 0:
         print("No limit on the number of tokens")
-        if not try_request(client, model_name, tokenizer_name, tokenizer_client, 2**32 - 2, 0, prefix, suffix):
+        if not try_request(client, model_name, tokenizer_name, tokenizer, 2**32 - 2, 0, prefix, suffix):
             print(f"There is a limit on the number of tokens. Params: max_prompt_length={2**32 - 2}, max_tokens=1")
             result = False
     else:
         # There is a limit on the number of tokens
         # If there is no limit on the number of tokens, max_prompt_length should be -1
         # And we should not be here
         # Check that max_prompt_length is ok
-        if not try_request(client, model_name, tokenizer_name, tokenizer_client, max_prompt_length, 0, prefix, suffix):
+        if not try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length, 0, prefix, suffix):
            print(f"max_prompt_length is too big. Params: max_prompt_length={max_prompt_length}, max_tokens=1")
            result = False
         # Check that max_prompt_length + 1 is not ok
-        if try_request(client, model_name, tokenizer_name, tokenizer_client, max_prompt_length + 1, 0, prefix, suffix):
+        if try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length + 1, 0, prefix, suffix):
            print(f"max_prompt_length could be bigger. Params: max_prompt_length={max_prompt_length+1}, max_tokens=1")
            result = False
         # Check that max_prompt_length - 1 is ok
-        if not try_request(
-            client, model_name, tokenizer_name, tokenizer_client, max_prompt_length - 1, 0, prefix, suffix
-        ):
+        if not try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length - 1, 0, prefix, suffix):
            print(
                f"max_prompt_length ssems to be inconsistent. max_prompt_length={max_prompt_length} "
                f"is ok but max_prompt_length={max_prompt_length-1} is not, with max_tokens=0"
@@ -206,7 +203,7 @@ def check_limits(
     if max_prompt_length_plus_tokens < 0:
         print("No limit on the number of tokens")
         if not try_request(
-            client, model_name, tokenizer_name, tokenizer_client, max(1, max_prompt_length), 2**32 - 2, prefix, suffix
+            client, model_name, tokenizer_name, tokenizer, max(1, max_prompt_length), 2**32 - 2, prefix, suffix
         ):
             print(
                 f"There is a limit on the number of tokens. Params: max_prompt_length={max_prompt_length},"
@@ -221,7 +218,7 @@ def check_limits(
         client,
         model_name,
         tokenizer_name,
-        tokenizer_client,
+        tokenizer,
         max_prompt_length,
         max_prompt_length_plus_tokens - max_prompt_length,
         prefix,
@@ -236,7 +233,7 @@ def check_limits(
         client,
         model_name,
         tokenizer_name,
-        tokenizer_client,
+        tokenizer,
         max_prompt_length,
         max_prompt_length_plus_tokens - max_prompt_length + 1,
         prefix,
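
The core of this script is the binary search in figure_out_max_prompt_length: test the midpoint of a candidate range and shrink the range depending on whether the request succeeds. A self-contained sketch of that pattern, with try_fn standing in for the try_request call above:

import math

def find_max_length(try_fn, lower_bound: int, upper_bound: int) -> int:
    """Largest value in [lower_bound, upper_bound] for which try_fn(value) succeeds.

    Assumes try_fn is monotone: once a length fails, every larger length fails too.
    """
    while lower_bound < upper_bound:
        middle = math.ceil((lower_bound + upper_bound) / 2)
        if try_fn(middle):
            lower_bound = middle  # midpoint works, so the answer is at least middle
        else:
            upper_bound = middle - 1  # midpoint fails, so the answer is below middle
    return lower_bound

# Example against a fake 2048-token limit instead of a live client:
assert find_max_length(lambda n: n <= 2048, 450, 9500) == 2048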

src/helm/benchmark/window_services/cohere_window_service.py

+5-5
@@ -1,6 +1,6 @@
 from typing import List, Optional

-from helm.proxy.clients.cohere_client import CohereClient
+from helm.proxy.tokenizers.cohere_tokenizer import CohereTokenizer
 from .local_window_service import LocalWindowService
 from .tokenizer_service import TokenizerService
 from .window_service import EncodeResult
@@ -62,7 +62,7 @@ def encode(self, text: str, truncation: bool = False, max_length: Optional[int]

         response: TokenizationRequestResult
         tokens: List[TokenizationToken] = []
-        if truncation or len(text) <= CohereClient.TOKENIZE_API_MAX_TEXT_LENGTH:
+        if truncation or len(text) <= CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH:
             response = self.service.tokenize(
                 TokenizationRequest(
                     text,
@@ -80,7 +80,7 @@ def encode(self, text: str, truncation: bool = False, max_length: Optional[int]
             # and make a request for each chunk.
             # This can potentially break up valid tokens at the end of the chunk, but the chunk size
             # is large enough that this happens infrequently.
-            chunk_size: int = CohereClient.TOKENIZE_API_MAX_TEXT_LENGTH
+            chunk_size: int = CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH
             for i in range(0, len(text), chunk_size):
                 chunk: str = text[i : chunk_size + i]
                 response = self.service.tokenize(
@@ -120,7 +120,7 @@ def fits_within_context_window(self, text: str, expected_completion_token_length
         so first check if the text has fewer than 65,536 characters.
         """
         return (
-            len(text) <= CohereClient.TOKENIZE_API_MAX_TEXT_LENGTH
+            len(text) <= CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH
             and self.get_num_tokens(text) + expected_completion_token_length <= self.max_request_length
         )

@@ -130,7 +130,7 @@ def truncate_from_right(self, text: str, expected_completion_token_length: int =
         minus the expected completion length (defaults to 0).
         """
         # First truncate the text so it's within `CohereClient.TOKENIZE_MAX_TEXT_LENGTH` length.
-        text = text[: CohereClient.TOKENIZE_API_MAX_TEXT_LENGTH]
+        text = text[: CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH]

         max_length: int = self.max_request_length - expected_completion_token_length
         result: str = self.decode(self.encode(text, truncation=True, max_length=max_length).tokens)
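
The renamed constant drives the chunking logic in encode(): when the text exceeds the tokenize API's character limit, it is split into fixed-size chunks and each chunk is tokenized separately, accepting the rare token split at a chunk boundary. A rough sketch of that pattern, with tokenize_fn standing in for self.service.tokenize and the limit value taken from the 65,536-character figure quoted above:

from typing import Callable, List

TOKENIZE_API_MAX_TEXT_LENGTH = 65_536  # assumed value of CohereTokenizer.TOKENIZE_API_MAX_TEXT_LENGTH

def tokenize_long_text(text: str, tokenize_fn: Callable[[str], List[str]]) -> List[str]:
    """Tokenize text in chunks no longer than the API limit and concatenate the results."""
    tokens: List[str] = []
    for i in range(0, len(text), TOKENIZE_API_MAX_TEXT_LENGTH):
        chunk = text[i : i + TOKENIZE_API_MAX_TEXT_LENGTH]  # may split a token at the boundary
        tokens.extend(tokenize_fn(chunk))
    return tokens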

src/helm/benchmark/window_services/huggingface_window_service.py

+2-2
@@ -1,5 +1,5 @@
 from typing import Optional
-from helm.proxy.clients.huggingface_tokenizer import HuggingFaceTokenizers
+from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
 from .local_window_service import LocalWindowService
 from .tokenizer_service import TokenizerService

@@ -16,7 +16,7 @@ def __init__(
     ):
         super().__init__(service)
         self._tokenizer_name = tokenizer_name
-        tokenizer = HuggingFaceTokenizers.get_tokenizer(
+        tokenizer = HuggingFaceTokenizer.get_tokenizer(
             helm_tokenizer_name=tokenizer_name,
             pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
             revision=revision,

src/helm/benchmark/window_services/yalm_window_service.py

+1-1
@@ -1,4 +1,4 @@
-from helm.proxy.clients.yalm_tokenizer.yalm_tokenizer import YaLMTokenizer
+from helm.proxy.tokenizers.yalm_tokenizer_data.yalm_tokenizer import YaLMTokenizer
 from .local_window_service import LocalWindowService
 from .tokenizer_service import TokenizerService

src/helm/common/request.py

+16-1
@@ -1,5 +1,6 @@
+import time
 from dataclasses import dataclass, field
-from typing import List, Optional, Dict
+from typing import Any, Callable, Dict, List, Optional

 from helm.common.media_object import MultimediaObject
 from helm.proxy.models import Model, get_model
@@ -213,3 +214,17 @@ def render_lines(self) -> List[str]:
         completions=[],
         embedding=[],
     )
+
+
+def wrap_request_time(compute: Callable[[], Dict[str, Any]]) -> Callable[[], Any]:
+    """Return a version of `compute` that puts `request_time` into its output."""
+
+    def wrapped_compute():
+        start_time = time.time()
+        response = compute()
+        end_time = time.time()
+        response["request_time"] = end_time - start_time
+        response["request_datetime"] = int(start_time)
+        return response
+
+    return wrapped_compute
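
The new wrap_request_time helper decorates a zero-argument compute function so its result dict also records how long the call took and when it started. A small usage sketch; the fake compute function is illustrative only:

from helm.common.request import wrap_request_time

def compute():
    # Stand-in for a real API call that returns a raw response dict.
    return {"completions": ["hello world"]}

response = wrap_request_time(compute)()
print(response["request_time"])      # elapsed seconds as a float
print(response["request_datetime"])  # start time as an integer Unix timestamp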
