Skip to content

Commit 78d29f1

Browse files
authored
Clean up icetk and protobuf dependencies (stanford-crfm#1834)
1 parent bb30601 commit 78d29f1

File tree

5 files changed: +18 −11 lines changed

.github/workflows/test.yml

+1-2
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,7 @@ jobs:
5757
- run: source venv/bin/activate && ./install-dev.sh
5858
- run: source venv/bin/activate && ./pre-commit.sh
5959
- name: Run tests
60-
# Skip ICE tokenizer tests. GHA is having trouble downloading ice_text.model.
61-
run: source venv/bin/activate && pytest --ignore src/helm/benchmark/window_services/test_ice_window_service.py --ignore src/helm/proxy/clients/test_ice_tokenizer_client.py
60+
run: source venv/bin/activate && pytest
6261
env:
6362
TEST: ${{ matrix.test }}
6463
VERSION: ${{ github.head_ref || 'main' }}

install-dev.sh

-2
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@ if [[ $OSTYPE != 'darwin'* ]]; then
99
# Manually install pytorch to avoid pip getting killed: https://stackoverflow.com/a/54329850
1010
pip install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html torch==1.12.1+cu113 torchvision==0.13.1+cu113
1111
fi
12-
# Manually install protobuf to workaround issue: https://github.com/protocolbuffers/protobuf/issues/6550
13-
pip install --no-binary=protobuf protobuf==3.20.2
1412
# Install all pinned dependencies
1513
pip install -r requirements.txt
1614
# Install HELM in edit mode

setup.cfg

-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ install_requires=
3838
sqlitedict~=1.7.0
3939
bottle~=0.12.23
4040
# TODO: Remove these from common
41-
protobuf~=3.20.2 # Can't use 4.21.0 due to backward incompatibility
4241
pymongo~=4.2.0
4342

4443
# Basic Scenarios

src/helm/proxy/clients/ice_tokenizer_client.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
import os
12
from dataclasses import asdict
23

3-
from icetk import icetk as tokenizer
4-
54
from helm.common.cache import Cache, CacheConfig
5+
from helm.common.optional_dependencies import handle_module_not_found_error
66
from helm.common.request import Request, RequestResult
77
from helm.common.tokenization_request import (
88
TokenizationRequest,
@@ -13,6 +13,14 @@
1313
)
1414
from .client import Client, wrap_request_time, cleanup_tokens
1515

16+
try:
17+
# Fall back to pure Python protobufs to work around issue #1613,
18+
# which is caused by icetk using C++ protobufs compiled with protobuf<3.19.
19+
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
20+
from icetk import icetk as tokenizer
21+
except ModuleNotFoundError as e:
22+
handle_module_not_found_error(e)
23+
1624

1725
class ICETokenizerClient(Client):
1826
"""

src/helm/proxy/clients/test_ice_tokenizer_client.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class TestICETokenizerClient:
1717
def setup_method(self, method):
1818
cache_file = tempfile.NamedTemporaryFile(delete=False)
1919
self.cache_path: str = cache_file.name
20+
self.tokenizer_name = "TsinghuaKEG/ice"
2021
self.client = ICETokenizerClient(SqliteCacheConfig(self.cache_path))
2122

2223
# The test cases were created using the examples from https://github.com/THUDM/icetk#tokenization
@@ -27,26 +28,28 @@ def teardown_method(self, method):
2728
os.remove(self.cache_path)
2829

2930
def test_tokenize(self):
30-
request = TokenizationRequest(text=self.test_prompt)
31+
request = TokenizationRequest(text=self.test_prompt, tokenizer=self.tokenizer_name)
3132
result: TokenizationRequestResult = self.client.tokenize(request)
3233
assert not result.cached, "First time making the tokenize request. Result should not be cached"
3334
result: TokenizationRequestResult = self.client.tokenize(request)
3435
assert result.cached, "Result should be cached"
3536
assert result.raw_tokens == [" Hello", " World", "!", " I", " am", " ice", "tk", "."]
3637

3738
def test_encode(self):
38-
request = TokenizationRequest(text=self.test_prompt, encode=True)
39+
request = TokenizationRequest(text=self.test_prompt, tokenizer=self.tokenizer_name, encode=True)
3940
result: TokenizationRequestResult = self.client.tokenize(request)
4041
assert result.raw_tokens == self.encoded_test_prompt
4142

4243
def test_encode_with_truncation(self):
4344
max_length: int = 3
44-
request = TokenizationRequest(text=self.test_prompt, encode=True, truncation=True, max_length=max_length)
45+
request = TokenizationRequest(
46+
text=self.test_prompt, tokenizer=self.tokenizer_name, encode=True, truncation=True, max_length=max_length
47+
)
4548
result: TokenizationRequestResult = self.client.tokenize(request)
4649
assert result.raw_tokens == self.encoded_test_prompt[:max_length]
4750

4851
def test_decode(self):
49-
request = DecodeRequest(tokens=self.encoded_test_prompt)
52+
request = DecodeRequest(tokens=self.encoded_test_prompt, tokenizer=self.tokenizer_name)
5053
result: DecodeRequestResult = self.client.decode(request)
5154
assert not result.cached, "First time making the decode request. Result should not be cached"
5255
result: DecodeRequestResult = self.client.decode(request)

0 commit comments

Comments (0)