Merge pull request stanford-crfm#1143 from stanford-crfm/multiple_completions

deepakn94 · web-flow · commit c945233254f1 · 2022-11-16T08:44:04.000-08:00
Account for multiple completions
diff --git a/src/benchmark/metrics/basic_metrics.py b/src/benchmark/metrics/basic_metrics.py
@@ -539,22 +539,20 @@ def compute_efficiency_metrics(
             runtime = request_state.result.batch_request_time
             batch_size = request_state.result.batch_size
 
-        # Compute total number of prompt and output tokens (in first sequence).
+        # Compute total number of prompt and output tokens.
         # Fetch the right `Tokenizer` depending on the model defined in `AdapterSpec`
         # and calculate the number of tokens in the prompt.
         tokenizer_service: TokenizerService = metric_service
         window_service: WindowService = WindowServiceFactory.get_window_service(adapter_spec.model, tokenizer_service)
         prompt: str = request_state.request.prompt
         num_prompt_tokens: int = window_service.get_num_tokens(prompt)
 
-        # Just take the first completion
-        # TODO: don't we need to take into account all the completions, since
-        # the runtime we get (that's used to compute denoised_runtime) is for
-        # generating all of them?
-        # TODO: we should unify this into num_completion_tokens
-        sequence = request_state.result.completions[0]
-        num_output_tokens: int = len(sequence.tokens)
+        # Total number of tokens across all completions.
+        num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions])
         # Don't include prompt in number of generated tokens (e.g., for language modeling).
+        # Assume that tokens for different completions are generated sequentially (instead of batched) when
+        # computing num_output_tokens (for the purpose of runtime estimation).
+        num_output_tokens: int = num_completion_tokens
         if request_state.request.echo_prompt:
             # num_prompt_tokens > num_output_tokens can happen if tokenizer doesn't round trip.
             if num_prompt_tokens <= num_output_tokens:
@@ -591,10 +589,6 @@ def compute_efficiency_metrics(
         else:
             training_energy_cost = None
 
-        # Total number of tokens in the completion
-        num_completion_tokens = sum([len(completion.tokens) for completion in request_state.result.completions])
-
-        # TODO: unify num_completion_tokens and num_output_tokens
         stats = [
             Stat(MetricName("num_prompt_tokens")).add(num_prompt_tokens),
             Stat(MetricName("num_completion_tokens")).add(num_completion_tokens),
diff --git a/src/benchmark/static/schema.yaml b/src/benchmark/static/schema.yaml
@@ -913,7 +913,7 @@ metric_groups:
       split: ${main_split}
     - name: num_prompt_tokens
       split: ${main_split}
-    - name: num_completion_tokens
+    - name: num_output_tokens
       split: ${main_split}
     - name: num_train_trials
       split: ${main_split}