Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[V1][Pixtral-HF] Add custom slice_encoder_output for Pixtral #13080

Open
wants to merge 34 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
678c291
Patch multi_modal_placeholders to RequestOutput
Nov 16, 2024
a1cdcb3
pipe multi_modal_placeholders from input to final output
Nov 17, 2024
f60964a
[V1] Add code owners for V1 (#10397)
WoosukKwon Nov 16, 2024
578e482
[2/N][torch.compile] make compilation cfg part of vllm cfg (#10383)
youkaichao Nov 17, 2024
7fa97cf
[V1] Refactor model executable interface for all text-only language m…
ywang96 Nov 17, 2024
629f512
[CI/Build] Fix IDC hpu [Device not found] issue (#10384)
xuechendi Nov 17, 2024
7539ab8
[Bugfix][CPU] Fix CPU embedding runner with tensor parallel (#10394)
Isotr0py Nov 17, 2024
bce660d
[platforms] refactor cpu code (#10402)
youkaichao Nov 17, 2024
305708b
[Hardware] [HPU]add `mark_step` for hpu (#10239)
jikunshang Nov 17, 2024
871a773
[Bugfix] Fix mrope_position_delta in non-last prefill chunk (#10403)
imkero Nov 17, 2024
242bb53
[Misc] Enhance offline_inference to support user-configurable paramet…
wchen61 Nov 17, 2024
f5312d3
Fix initialization
DarkLight1337 Nov 18, 2024
439e324
Run isort
DarkLight1337 Nov 18, 2024
60815f2
isort
DarkLight1337 Nov 18, 2024
ec46755
isort
DarkLight1337 Nov 18, 2024
ce3ae6f
[Misc] Add uninitialized params tracking for `AutoWeightsLoader` (#10…
Isotr0py Nov 18, 2024
466b2cf
[Bugfix] Ignore ray reinit error when current platform is ROCm or XPU…
HollowMan6 Nov 18, 2024
3f092ce
update RequestOutput.__init__() to take `multi_modal_placeholders` as…
Nov 18, 2024
dd8427e
update RequestOutput.__init__() to take `multi_modal_placeholders` as…
Nov 18, 2024
76ac8b0
update RequestOutput.__init__() to take `multi_modal_placeholders` as…
Nov 18, 2024
904e925
Merge branch 'vllm-project:main' into main
lk-chen Nov 18, 2024
c963a25
disable mypy type check
Nov 18, 2024
470fbd3
disable mypy type check
Nov 18, 2024
550be23
remove unnecessary debug code
Nov 18, 2024
1eb4d96
Merge branch 'vllm-project:main' into main
lk-chen Nov 18, 2024
9b002b0
Merge branch 'vllm-project:main' into main
lk-chen Nov 20, 2024
436beb2
Merge branch 'vllm-project:main' into main
lk-chen Nov 25, 2024
bbc6420
Merge branch 'vllm-project:main' into main
lk-chen Dec 12, 2024
5254415
Merge branch 'vllm-project:main' into main
lk-chen Feb 5, 2025
6077919
Merge branch 'vllm-project:main' into main
lk-chen Feb 6, 2025
d8f785a
[V1] Enhance check when slicing encoder output
lk-chen Feb 5, 2025
89f243b
[V1][Pixtral-HF] Add custom `slice_encoder_output` for Pixtral
lk-chen Feb 11, 2025
1d37090
Merge branch 'vllm-project:main' into main
lk-chen Feb 11, 2025
098b444
Merge branch 'main' into pixtral_hf
lk-chen Feb 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion vllm/model_executor/models/llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs,
NestedTensors)
NestedTensors, PlaceholderRange)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
Expand Down Expand Up @@ -507,6 +507,43 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
if (config.text_config.architectures is None
and config.text_config.model_type == "mistral"):
config.text_config.architectures = ["MistralForCausalLM"]

def _slice_encoder_output(
    mm_input: MultiModalKwargs,
    encoder_output: torch.Tensor,
    mm_pos: PlaceholderRange,
    num_computed_tokens: int,
    num_scheduled_tokens: int,
) -> torch.Tensor:
    """Return the slice of ``encoder_output`` covered by the tokens
    scheduled in this step.

    Pixtral-HF interleaves a special token (e.g. ``image_break``) after
    every row of image-feature tokens in the *placeholder* positions,
    while ``encoder_output`` contains only the image-feature embeddings.
    Placeholder positions therefore do not map 1:1 onto encoder-output
    positions; this helper converts the scheduled token window into the
    corresponding encoder-output index range before slicing.

    Args:
        mm_input: Multimodal kwargs for this image; must contain
            ``pixel_values``.
        encoder_output: Vision-encoder embeddings for this image
            (one entry per image-feature token, special tokens excluded).
        mm_pos: Placeholder range of the image in the prompt;
            ``mm_pos["offset"]`` is the first placeholder position.
        num_computed_tokens: Prompt tokens already computed for this
            request.
        num_scheduled_tokens: Prompt tokens scheduled in this step.
    """
    assert "pixel_values" in mm_input
    image_input = mm_input["pixel_values"]
    # NOTE(review): assumes pixel_values has layout (..., height, width)
    # — width from shape[-1], height from shape[-2]; confirm against the
    # Pixtral-HF processor. nrows is currently unused.
    ncols, nrows = get_pixtral_hf_image_feature_grid_size(
        self.config.vision_config,
        image_width=image_input.shape[-1],
        image_height=image_input.shape[-2],
    )
    placeholder_start = mm_pos["offset"]

    # Turn placeholder position into encoder output position
    def placeholder_pos_to_encoder_output_pos(
            placeholder_pos: int) -> int:
        # Each placeholder row spans (ncols + 1) tokens (ncols image
        # tokens plus one special token) but contributes only ncols
        # encoder outputs, so drop one position per completed row.
        return placeholder_pos % (ncols + 1) + placeholder_pos // (
            ncols + 1) * ncols

    # Clamp the scheduled window to the encoder output's bounds:
    # positions before the placeholder map below 0, and the window may
    # extend past the end of the image.
    start_idx = max(
        placeholder_pos_to_encoder_output_pos(num_computed_tokens -
                                              placeholder_start),
        0)
    end_idx = min(
        placeholder_pos_to_encoder_output_pos(
            num_computed_tokens + num_scheduled_tokens -
            placeholder_start), len(encoder_output))
    assert start_idx <= end_idx, (
        f"{start_idx=} should be no greater than {end_idx=}")
    return encoder_output[start_idx:end_idx]

# Exposed as an attribute so the V1 GPU model runner can detect (via
# hasattr) and use this model-specific slicing logic.
self.slice_encoder_output = _slice_encoder_output

if (config.projector_hidden_act is None
and config.vision_config.hidden_act == "gelu"):
config.projector_hidden_act = "gelu"
Expand Down
27 changes: 21 additions & 6 deletions vllm/v1/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,15 +755,30 @@ def _gather_encoder_outputs(
# in the decoder's KV cache.
continue

start_idx = max(num_computed_tokens - start_pos, 0)
end_idx = min(
num_computed_tokens - start_pos + num_scheduled_tokens,
num_encoder_tokens)
assert start_idx < end_idx
assert req_id in self.encoder_cache
assert i in self.encoder_cache[req_id]
encoder_output = self.encoder_cache[req_id][i]
encoder_outputs.append(encoder_output[start_idx:end_idx])
if hasattr(self.model, "slice_encoder_output"):
# Per-model custom logic to slice the encoder output. Some
# models (e.g. Pixtral) have dynamic number of special
# tokens (e.g. image_break) in the middle of placeholder
# positions. This allows the model to calculate
# encoder_output slices taking into account the special
# tokens.
encoder_outputs.append(
self.model.slice_encoder_output(
mm_input=req_state.mm_inputs[i],
encoder_output=encoder_output,
mm_pos=pos_info,
num_computed_tokens=num_computed_tokens,
num_scheduled_tokens=num_scheduled_tokens))
else:
start_idx = max(num_computed_tokens - start_pos, 0)
end_idx = min(
num_computed_tokens - start_pos + num_scheduled_tokens,
num_encoder_tokens)
assert start_idx < end_idx
encoder_outputs.append(encoder_output[start_idx:end_idx])
return encoder_outputs

def get_model(self) -> nn.Module:
Expand Down