[V1][Misc] Shorten FinishReason enum and use constant strings #12760

Merged: 1 commit, Feb 5, 2025
12 changes: 9 additions & 3 deletions vllm/v1/engine/__init__.py
@@ -14,11 +14,17 @@
from vllm.multimodal.inputs import PlaceholderRange
from vllm.sampling_params import SamplingParams

# These are possible values of RequestOutput.finish_reason,
# so form part of the external API.
FINISH_REASON_STRINGS = ("stop", "length", "abort")

class RequestFinishedReason(enum.IntEnum):

class FinishReason(enum.IntEnum):
"""
Reason a request finished - stop, length, or abort.

Int rather than Str for more compact serialization.

stop - a stop string was emitted
length - max_tokens was consumed, or max_model_len was reached
abort - aborted for another reason
@@ -29,7 +35,7 @@ class RequestFinishedReason(enum.IntEnum):
ABORT = 2

def __str__(self):
return self.name.lower()
return FINISH_REASON_STRINGS[self.value]


@dataclass
@@ -62,7 +68,7 @@ class EngineCoreOutput(
request_id: str
new_token_ids: List[int]
finished: bool
finish_reason: Optional[RequestFinishedReason] = None
finish_reason: Optional[FinishReason] = None
stop_reason: Union[int, str, None] = None


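The docstring's claim above (int rather than str for more compact serialization) is easy to see with a standalone sketch. The snippet below re-declares the enum locally instead of importing vllm so it runs anywhere, and uses a JSON round-trip as a stand-in for whatever encoder the engine actually uses.

```python
import enum
import json

# Local re-declaration of the pattern in this diff (not the vllm import path):
# an IntEnum goes over the wire as a small integer, while __str__ maps it
# back to the user-facing constant strings.
FINISH_REASON_STRINGS = ("stop", "length", "abort")


class FinishReason(enum.IntEnum):
    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self):
        return FINISH_REASON_STRINGS[self.value]


reason = FinishReason.LENGTH
print(json.dumps(reason))  # 1      -- serialized form is just the int
print(str(reason))         # length -- external API string
```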
7 changes: 3 additions & 4 deletions vllm/v1/engine/detokenizer.py
@@ -8,8 +8,7 @@
from vllm.sampling_params import RequestOutputKind
from vllm.transformers_utils.detokenizer_utils import (
AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
RequestFinishedReason)
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason

logger = init_logger(__name__)

@@ -19,7 +18,7 @@ class DetokenizerOutput:
output_text: str
token_ids: List[int]
finished: bool
finish_reason: Optional[RequestFinishedReason] = None
finish_reason: Optional[FinishReason] = None
stop_reason: Union[int, str, None] = None


@@ -148,7 +147,7 @@ def update_from_output(
stop_str, truncate_to = stop
if truncate_to != -1:
self.output_text = self.output_text[:truncate_to]
finish_reason = RequestFinishedReason.STOP
finish_reason = FinishReason.STOP
stop_reason = stop_str

# TODO: handle stop_token_ids here too?
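For context on the `FinishReason.STOP` branch above, here is a minimal sketch of the truncate-and-mark step. The `stop` pair and the helper name are illustrative stand-ins for whatever the detokenizer's stop checker actually returns; only the pattern mirrors the diff.

```python
from typing import Tuple

from vllm.v1.engine import FinishReason


def apply_stop_string(output_text: str,
                      stop: Tuple[str, int]) -> Tuple[str, FinishReason, str]:
    # `stop` is assumed to be a (stop_str, truncate_to) pair, where
    # truncate_to == -1 means the matched stop string is kept verbatim.
    stop_str, truncate_to = stop
    if truncate_to != -1:
        output_text = output_text[:truncate_to]
    # The matched stop string becomes the stop_reason; the finish reason
    # is always STOP on this path.
    return output_text, FinishReason.STOP, stop_str
```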
6 changes: 3 additions & 3 deletions vllm/v1/metrics/loggers.py
@@ -9,7 +9,7 @@

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.v1.engine import RequestFinishedReason
from vllm.v1.engine import FinishReason
from vllm.v1.metrics.stats import IterationStats, SchedulerStats

logger = init_logger(__name__)
@@ -117,13 +117,13 @@ def __init__(self, model_config: ModelConfig):
documentation="Number of generation tokens processed.",
labelnames=labelnames).labels(*labelvalues)

self.counter_request_success: Dict[RequestFinishedReason,
self.counter_request_success: Dict[FinishReason,
prometheus_client.Counter] = {}
counter_request_success_base = prometheus_client.Counter(
name="vllm:request_success_total",
documentation="Count of successfully processed requests.",
labelnames=labelnames + ["finished_reason"])
for reason in RequestFinishedReason:
for reason in FinishReason:
self.counter_request_success[
reason] = counter_request_success_base.labels(*(labelvalues +
[str(reason)]))
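The loop above builds one labeled Counter child per FinishReason, keyed by the lower-cased string form from `__str__`. A rough, self-contained version of that wiring might look like the following; the model label name and value are placeholders, not the exact labels vLLM attaches.

```python
import prometheus_client

from vllm.v1.engine import FinishReason

# One parent counter, with one child per finish reason. The metric name and
# documentation string come from the diff; the label values are illustrative.
counter_request_success_base = prometheus_client.Counter(
    name="vllm:request_success_total",
    documentation="Count of successfully processed requests.",
    labelnames=["model_name", "finished_reason"])

counter_request_success = {
    reason: counter_request_success_base.labels("example-model", str(reason))
    for reason in FinishReason
}

# e.g. when a request finishes because a stop string matched:
counter_request_success[FinishReason.STOP].inc()
```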
7 changes: 3 additions & 4 deletions vllm/v1/metrics/stats.py
@@ -6,7 +6,7 @@

if TYPE_CHECKING:
from vllm.outputs import RequestOutput
from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason
from vllm.v1.engine import EngineCoreOutput, FinishReason


@dataclass
@@ -32,7 +32,7 @@ class RequestStateStats:
class FinishedRequestStats:
"""Stats associated with a finished request."""

finish_reason: "RequestFinishedReason"
finish_reason: "FinishReason"
num_prompt_tokens: int = 0
num_generation_tokens: int = 0

@@ -74,8 +74,7 @@ def update_from_output(self, output: "EngineCoreOutput",
request_state_stats.num_generation_tokens += num_new_generation_tokens
request_state_stats.last_token_time = now

def update_from_finished_request(self,
finish_reason: "RequestFinishedReason",
def update_from_finished_request(self, finish_reason: "FinishReason",
request_output: "RequestOutput",
request_state_stats: RequestStateStats):
self.finished_requests.append(
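As a usage sketch of the renamed type above, a finished request is recorded as one FinishedRequestStats entry tagged with why it finished; the token counts below are made-up illustrative values.

```python
from vllm.v1.engine import FinishReason
from vllm.v1.metrics.stats import FinishedRequestStats

# One record per completed request, tagged with why it finished.
record = FinishedRequestStats(finish_reason=FinishReason.STOP,
                              num_prompt_tokens=12,
                              num_generation_tokens=64)
print(record.finish_reason)  # stop
```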
14 changes: 7 additions & 7 deletions vllm/v1/request.py
@@ -6,7 +6,7 @@
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.sequence import RequestMetrics
from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
from vllm.v1.engine import EngineCoreRequest, FinishReason
from vllm.v1.utils import ConstantList

if TYPE_CHECKING:
@@ -109,7 +109,7 @@ def num_output_tokens(self) -> int:
def is_finished(self) -> bool:
return RequestStatus.is_finished(self.status)

def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
def get_finished_reason(self) -> Union[FinishReason, None]:
return RequestStatus.get_finished_reason(self.status)

def has_encoder_inputs(self) -> bool:
@@ -150,7 +150,7 @@ def is_finished(status: "RequestStatus") -> bool:

@staticmethod
def get_finished_reason(
status: "RequestStatus") -> Union[RequestFinishedReason, None]:
status: "RequestStatus") -> Union[FinishReason, None]:
return _FINISHED_REASON_MAP.get(status)


@@ -159,8 +159,8 @@ def get_finished_reason(
# are longer than the model's length cap. Therefore, the stop
# reason should also be "length" as in OpenAI API.
_FINISHED_REASON_MAP = {
RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
RequestStatus.FINISHED_STOPPED: FinishReason.STOP,
RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH,
RequestStatus.FINISHED_ABORTED: FinishReason.ABORT,
RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH,
}
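
A short usage sketch of the mapping above follows. It assumes RequestStatus also has non-finished members (e.g. a RUNNING state), which are simply absent from the map and therefore yield None.

```python
from vllm.v1.engine import FinishReason
from vllm.v1.request import RequestStatus

# Length-capped and ignored (prompt longer than max_model_len) requests both
# report "length", matching the OpenAI API convention noted in the comment above.
assert RequestStatus.get_finished_reason(
    RequestStatus.FINISHED_IGNORED) == FinishReason.LENGTH
assert RequestStatus.get_finished_reason(
    RequestStatus.FINISHED_STOPPED) == FinishReason.STOP

# A request that is still in flight has no finish reason yet.
# (RUNNING is assumed here to be one of the non-finished statuses.)
assert RequestStatus.get_finished_reason(RequestStatus.RUNNING) is None
```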