Skip to content

Commit 78d29f1

Browse files
authored
Clean up icetk and protobuf dependencies (stanford-crfm#1834)
1 parent bb30601 commit 78d29f1

File tree

5 files changed: +18 −11 lines changed

.github/workflows/test.yml

+1-2
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,7 @@ jobs:
5757
- run: source venv/bin/activate && ./install-dev.sh
5858
- run: source venv/bin/activate && ./pre-commit.sh
5959
- name: Run tests
60-
# Skip ICE tokenizer tests. GHA is having trouble downloading ice_text.model.
61-
run: source venv/bin/activate && pytest --ignore src/helm/benchmark/window_services/test_ice_window_service.py --ignore src/helm/proxy/clients/test_ice_tokenizer_client.py
60+
run: source venv/bin/activate && pytest
6261
env:
6362
TEST: ${{ matrix.test }}
6463
VERSION: ${{ github.head_ref || 'main' }}

install-dev.sh

-2
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@ if [[ $OSTYPE != 'darwin'* ]]; then
99
# Manually install pytorch to avoid pip getting killed: https://stackoverflow.com/a/54329850
1010
pip install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html torch==1.12.1+cu113 torchvision==0.13.1+cu113
1111
fi
12-
# Manually install protobuf to workaround issue: https://github.com/protocolbuffers/protobuf/issues/6550
13-
pip install --no-binary=protobuf protobuf==3.20.2
1412
# Install all pinned dependencies
1513
pip install -r requirements.txt
1614
# Install HELM in edit mode

setup.cfg

-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ install_requires=
3838
sqlitedict~=1.7.0
3939
bottle~=0.12.23
4040
# TODO: Remove these from common
41-
protobuf~=3.20.2 # Can't use 4.21.0 due to backward incompatibility
4241
pymongo~=4.2.0
4342

4443
# Basic Scenarios

src/helm/proxy/clients/ice_tokenizer_client.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
import os
12
from dataclasses import asdict
23

3-
from icetk import icetk as tokenizer
4-
54
from helm.common.cache import Cache, CacheConfig
5+
from helm.common.optional_dependencies import handle_module_not_found_error
66
from helm.common.request import Request, RequestResult
77
from helm.common.tokenization_request import (
88
TokenizationRequest,
@@ -13,6 +13,14 @@
1313
)
1414
from .client import Client, wrap_request_time, cleanup_tokens
1515

16+
try:
17+
# Fall back to pure Python protobufs to work around issue #1613,
18+
# which is caused by icetk using C++ protobufs compiled with protobuf<3.19.
19+
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
20+
from icetk import icetk as tokenizer
21+
except ModuleNotFoundError as e:
22+
handle_module_not_found_error(e)
23+
1624

1725
class ICETokenizerClient(Client):
1826
"""

src/helm/proxy/clients/test_ice_tokenizer_client.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class TestICETokenizerClient:
1717
def setup_method(self, method):
1818
cache_file = tempfile.NamedTemporaryFile(delete=False)
1919
self.cache_path: str = cache_file.name
20+
self.tokenizer_name = "TsinghuaKEG/ice"
2021
self.client = ICETokenizerClient(SqliteCacheConfig(self.cache_path))
2122

2223
# The test cases were created using the examples from https://github.com/THUDM/icetk#tokenization
@@ -27,26 +28,28 @@ def teardown_method(self, method):
2728
os.remove(self.cache_path)
2829

2930
def test_tokenize(self):
30-
request = TokenizationRequest(text=self.test_prompt)
31+
request = TokenizationRequest(text=self.test_prompt, tokenizer=self.tokenizer_name)
3132
result: TokenizationRequestResult = self.client.tokenize(request)
3233
assert not result.cached, "First time making the tokenize request. Result should not be cached"
3334
result: TokenizationRequestResult = self.client.tokenize(request)
3435
assert result.cached, "Result should be cached"
3536
assert result.raw_tokens == [" Hello", " World", "!", " I", " am", " ice", "tk", "."]
3637

3738
def test_encode(self):
38-
request = TokenizationRequest(text=self.test_prompt, encode=True)
39+
request = TokenizationRequest(text=self.test_prompt, tokenizer=self.tokenizer_name, encode=True)
3940
result: TokenizationRequestResult = self.client.tokenize(request)
4041
assert result.raw_tokens == self.encoded_test_prompt
4142

4243
def test_encode_with_truncation(self):
4344
max_length: int = 3
44-
request = TokenizationRequest(text=self.test_prompt, encode=True, truncation=True, max_length=max_length)
45+
request = TokenizationRequest(
46+
text=self.test_prompt, tokenizer=self.tokenizer_name, encode=True, truncation=True, max_length=max_length
47+
)
4548
result: TokenizationRequestResult = self.client.tokenize(request)
4649
assert result.raw_tokens == self.encoded_test_prompt[:max_length]
4750

4851
def test_decode(self):
49-
request = DecodeRequest(tokens=self.encoded_test_prompt)
52+
request = DecodeRequest(tokens=self.encoded_test_prompt, tokenizer=self.tokenizer_name)
5053
result: DecodeRequestResult = self.client.decode(request)
5154
assert not result.cached, "First time making the decode request. Result should not be cached"
5255
result: DecodeRequestResult = self.client.decode(request)

0 commit comments

Comments (0)