From 3a5be9534c791d739ac9e3806f46bf01868dd92b Mon Sep 17 00:00:00 2001 From: liushuang Date: Sat, 11 Oct 2025 13:56:25 +0800 Subject: [PATCH] add --- 20251011.md | 590 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 585 insertions(+), 5 deletions(-) diff --git a/20251011.md b/20251011.md index 7c64943..24d6d5f 100644 --- a/20251011.md +++ b/20251011.md @@ -1,6 +1,6 @@ ## 源码追踪 -### get_structured_output_key 函数实现 +### 函数 get_structured_output_key 实现 #### 分支:release/v0.11.0 @@ -9,7 +9,7 @@ def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutputKey: params = sampling_params.structured_outputs assert params is not None, "params can't be None." - # structured_outputs 不满足以下任何一个条件 + # 参数用的是 structured_outputs if params.json is not None: if not isinstance(params.json, str): json_str = json.dumps(params.json) @@ -31,7 +31,7 @@ def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutp elif params.structural_tag is not None: return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag) else: - # 最终抛出了这个Error + # 不满足上面的条件,就会抛这个Error raise ValueError("No valid structured output parameter found") ``` @@ -41,6 +41,7 @@ def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutp # https://github.com/vllm-project/vllm/blob/releases/v0.10.2/vllm/v1/structured_output/request.py def get_structured_output_key( sampling_params: SamplingParams) -> StructuredOutputKey: + # 参数用的是 guided_decoding params = sampling_params.guided_decoding assert params is not None, "params can't be None." if params.json is not None: @@ -71,8 +72,6 @@ def get_structured_output_key( #### 分支:release/v0.11.0 -```python - ```python # https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/sampling_params.py class SamplingParams( @@ -571,6 +570,587 @@ class SamplingParams( f"structured_outputs={self.structured_outputs}, " f"extra_args={self.extra_args})") ``` + +#### 分支:release/v0.10.2 + +```python +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Sampling parameters for text generation.""" +import copy +from dataclasses import dataclass +from enum import Enum, IntEnum +from functools import cached_property +from typing import Annotated, Any, Optional, Union + +import msgspec +from pydantic import BaseModel + +from vllm.logger import init_logger +from vllm.logits_process import LogitsProcessor +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + +_SAMPLING_EPS = 1e-5 +_MAX_TEMP = 1e-2 + + +class SamplingType(IntEnum): + GREEDY = 0 + RANDOM = 1 + RANDOM_SEED = 2 + + +# maybe make msgspec? 
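+# Illustration added for this note (not part of the upstream file): the class
+# below, GuidedDecodingParams, is what the release/v0.10.2 version of
+# get_structured_output_key() traced above reads through
+# `sampling_params.guided_decoding`; on release/v0.11.0 the same information
+# is carried by `sampling_params.structured_outputs` instead. A minimal,
+# hedged sketch of how a JSON-schema request reaches that code path on this
+# branch (assuming vLLM 0.10.2 is installed):
+#
+#     from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+#
+#     gd = GuidedDecodingParams.from_optional(json={"type": "object"})
+#     sp = SamplingParams(guided_decoding=gd)
+#     # get_structured_output_key(sp) would yield the key
+#     # (StructuredOutputOptions.JSON, '{"type": "object"}')
+#
+# from_optional() returns None when json, regex, choice, grammar, json_object
+# and structural_tag are all unset, so the "No valid structured output
+# parameter found" ValueError traced above can only be reached when a
+# GuidedDecodingParams is constructed directly with every guide field left as
+# None (for example with only `backend` set).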
+@dataclass +class GuidedDecodingParams: + """One of these fields will be used to build a logit processor.""" + json: Optional[Union[str, dict]] = None + regex: Optional[str] = None + choice: Optional[list[str]] = None + grammar: Optional[str] = None + json_object: Optional[bool] = None + """These are other options that can be set""" + backend: Optional[str] = None + backend_was_auto: bool = False + disable_fallback: bool = False + disable_any_whitespace: bool = False + disable_additional_properties: bool = False + whitespace_pattern: Optional[str] = None + structural_tag: Optional[str] = None + + @staticmethod + def from_optional( + json: Optional[Union[dict, BaseModel, str]] = None, + regex: Optional[str] = None, + choice: Optional[list[str]] = None, + grammar: Optional[str] = None, + json_object: Optional[bool] = None, + backend: Optional[str] = None, + whitespace_pattern: Optional[str] = None, + structural_tag: Optional[str] = None, + ) -> Optional["GuidedDecodingParams"]: + if all(arg is None for arg in (json, regex, choice, grammar, + json_object, structural_tag)): + return None + # Extract json schemas from pydantic models + if isinstance(json, (BaseModel, type(BaseModel))): + json = json.model_json_schema() + return GuidedDecodingParams( + json=json, + regex=regex, + choice=choice, + grammar=grammar, + json_object=json_object, + backend=backend, + whitespace_pattern=whitespace_pattern, + structural_tag=structural_tag, + ) + + def __post_init__(self): + """Validate that some fields are mutually exclusive.""" + guide_count = sum([ + self.json is not None, self.regex is not None, self.choice + is not None, self.grammar is not None, self.json_object is not None + ]) + if guide_count > 1: + raise ValueError( + "You can only use one kind of guided decoding but multiple are " + f"specified: {self.__dict__}") + + +class RequestOutputKind(Enum): + # Return entire output so far in every RequestOutput + CUMULATIVE = 0 + # Return only deltas in each RequestOutput + DELTA = 1 + # Do not return intermediate RequestOutput + FINAL_ONLY = 2 + + +class SamplingParams( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + # required for @cached_property. + dict=True): # type: ignore[call-arg] + """Sampling parameters for text generation. + + Overall, we follow the sampling parameters from the OpenAI text completion + API (https://platform.openai.com/docs/api-reference/completions/create). + In addition, we support beam search, which is not supported by OpenAI. + """ + + n: int = 1 + """Number of output sequences to return for the given prompt.""" + best_of: Optional[int] = None + """Number of output sequences that are generated from the prompt. From + these `best_of` sequences, the top `n` sequences are returned. `best_of` + must be greater than or equal to `n`. By default, `best_of` is set to `n`. + Warning, this is only supported in V0.""" + _real_n: Optional[int] = None + presence_penalty: float = 0.0 + """Penalizes new tokens based on whether they appear in the generated text + so far. Values > 0 encourage the model to use new tokens, while values < 0 + encourage the model to repeat tokens.""" + frequency_penalty: float = 0.0 + """Penalizes new tokens based on their frequency in the generated text so + far. Values > 0 encourage the model to use new tokens, while values < 0 + encourage the model to repeat tokens.""" + repetition_penalty: float = 1.0 + """Penalizes new tokens based on whether they appear in the prompt and the + generated text so far. 
Values > 1 encourage the model to use new tokens, + while values < 1 encourage the model to repeat tokens.""" + temperature: float = 1.0 + """Controls the randomness of the sampling. Lower values make the model + more deterministic, while higher values make the model more random. Zero + means greedy sampling.""" + top_p: float = 1.0 + """Controls the cumulative probability of the top tokens to consider. Must + be in (0, 1]. Set to 1 to consider all tokens.""" + top_k: int = 0 + """Controls the number of top tokens to consider. Set to 0 (or -1) to + consider all tokens.""" + min_p: float = 0.0 + """Represents the minimum probability for a token to be considered, + relative to the probability of the most likely token. Must be in [0, 1]. + Set to 0 to disable this.""" + seed: Optional[int] = None + """Random seed to use for the generation.""" + stop: Optional[Union[str, list[str]]] = None + """String(s) that stop the generation when they are generated. The returned + output will not contain the stop strings.""" + stop_token_ids: Optional[list[int]] = None + """Token IDs that stop the generation when they are generated. The returned + output will contain the stop tokens unless the stop tokens are special + tokens.""" + ignore_eos: bool = False + """Whether to ignore the EOS token and continue generating + tokens after the EOS token is generated.""" + max_tokens: Optional[int] = 16 + """Maximum number of tokens to generate per output sequence.""" + min_tokens: int = 0 + """Minimum number of tokens to generate per output sequence before EOS or + `stop_token_ids` can be generated""" + logprobs: Optional[int] = None + """Number of log probabilities to return per output token. When set to + `None`, no probability is returned. If set to a non-`None` value, the + result includes the log probabilities of the specified number of most + likely tokens, as well as the chosen tokens. Note that the implementation + follows the OpenAI API: The API will always return the log probability of + the sampled token, so there may be up to `logprobs+1` elements in the + response. When set to -1, return all `vocab_size` log probabilities.""" + prompt_logprobs: Optional[int] = None + """Number of log probabilities to return per prompt token. + When set to -1, return all `vocab_size` log probabilities.""" + # NOTE: This parameter is only exposed at the engine level for now. + # It is not exposed in the OpenAI API server, as the OpenAI API does + # not support returning only a list of token IDs. + detokenize: bool = True + """Whether to detokenize the output.""" + skip_special_tokens: bool = True + """Whether to skip special tokens in the output.""" + spaces_between_special_tokens: bool = True + """Whether to add spaces between special tokens in the output.""" + # Optional[list[LogitsProcessor]] type. We use Any here because + # Optional[list[LogitsProcessor]] type is not supported by msgspec. + logits_processors: Optional[Any] = None + """Functions that modify logits based on previously generated tokens, and + optionally prompt tokens as a first argument.""" + include_stop_str_in_output: bool = False + """Whether to include the stop strings in output text.""" + truncate_prompt_tokens: Optional[Annotated[int, + msgspec.Meta(ge=-1)]] = None + """If set to -1, will use the truncation size supported by the model. If + set to an integer k, will use only the last k tokens from the prompt + (i.e., left truncation). 
If set to `None`, truncation is disabled.""" + output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE + + # The below fields are not supposed to be used as an input. + # They are set in post_init. + output_text_buffer_length: int = 0 + _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) + + # Fields used to construct logits processors + guided_decoding: Optional[GuidedDecodingParams] = None + """If provided, the engine will construct a guided decoding logits + processor from these parameters.""" + logit_bias: Optional[dict[int, float]] = None + """If provided, the engine will construct a logits processor that applies + these logit biases.""" + allowed_token_ids: Optional[list[int]] = None + """If provided, the engine will construct a logits processor which only + retains scores for the given token ids.""" + extra_args: Optional[dict[str, Any]] = None + """Arbitrary additional args, that can be used by custom sampling + implementations, plugins, etc. Not used by any in-tree sampling + implementations.""" + + # Fields used for bad words + bad_words: Optional[list[str]] = None + """Words that are not allowed to be generated. More precisely, only the + last token of a corresponding token sequence is not allowed when the next + generated token can complete the sequence.""" + _bad_words_token_ids: Optional[list[list[int]]] = None + + @staticmethod + def from_optional( + n: Optional[int] = 1, + best_of: Optional[int] = None, + presence_penalty: Optional[float] = 0.0, + frequency_penalty: Optional[float] = 0.0, + repetition_penalty: Optional[float] = 1.0, + temperature: Optional[float] = 1.0, + top_p: Optional[float] = 1.0, + top_k: int = 0, + min_p: float = 0.0, + seed: Optional[int] = None, + stop: Optional[Union[str, list[str]]] = None, + stop_token_ids: Optional[list[int]] = None, + bad_words: Optional[list[str]] = None, + include_stop_str_in_output: bool = False, + ignore_eos: bool = False, + max_tokens: Optional[int] = 16, + min_tokens: int = 0, + logprobs: Optional[int] = None, + prompt_logprobs: Optional[int] = None, + detokenize: bool = True, + skip_special_tokens: bool = True, + spaces_between_special_tokens: bool = True, + logits_processors: Optional[list[LogitsProcessor]] = None, + truncate_prompt_tokens: Optional[Annotated[int, + msgspec.Meta( + ge=-1)]] = None, + output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, + guided_decoding: Optional[GuidedDecodingParams] = None, + logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, + allowed_token_ids: Optional[list[int]] = None, + extra_args: Optional[dict[str, Any]] = None, + ) -> "SamplingParams": + if logit_bias is not None: + # Convert token_id to integer + # Clamp the bias between -100 and 100 per OpenAI API spec + logit_bias = { + int(token): min(100.0, max(-100.0, bias)) + for token, bias in logit_bias.items() + } + + return SamplingParams( + n=1 if n is None else n, + best_of=best_of, + presence_penalty=0.0 + if presence_penalty is None else presence_penalty, + frequency_penalty=0.0 + if frequency_penalty is None else frequency_penalty, + repetition_penalty=1.0 + if repetition_penalty is None else repetition_penalty, + temperature=1.0 if temperature is None else temperature, + top_p=1.0 if top_p is None else top_p, + top_k=top_k, + min_p=min_p, + seed=seed, + stop=stop, + stop_token_ids=stop_token_ids, + bad_words=bad_words, + include_stop_str_in_output=include_stop_str_in_output, + ignore_eos=ignore_eos, + max_tokens=max_tokens, + min_tokens=min_tokens, + logprobs=logprobs, + 
prompt_logprobs=prompt_logprobs, + detokenize=detokenize, + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + logits_processors=logits_processors, + truncate_prompt_tokens=truncate_prompt_tokens, + output_kind=output_kind, + guided_decoding=guided_decoding, + logit_bias=logit_bias, + allowed_token_ids=allowed_token_ids, + extra_args=extra_args, + ) + + def __post_init__(self) -> None: + # how we deal with `best_of``: + # if `best_of`` is not set, we default to `n`; + # if `best_of`` is set, we set `n`` to `best_of`, + # and set `_real_n`` to the original `n`. + # when we return the result, we will check + # if we need to return `n` or `_real_n` results + if self.best_of: + if self.best_of < self.n: + raise ValueError( + f"best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}.") + if not self._real_n: + self._real_n = self.n + self.n = self.best_of + + if 0 < self.temperature < _MAX_TEMP: + logger.warning( + "temperature %s is less than %s, which may cause numerical " + "errors nan or inf in tensors. We have maxed it out to %s.", + self.temperature, _MAX_TEMP, _MAX_TEMP) + self.temperature = max(self.temperature, _MAX_TEMP) + + if self.seed == -1: + self.seed = None + + if self.stop is None: + self.stop = [] + elif isinstance(self.stop, str): + self.stop = [self.stop] + + if self.stop_token_ids is None: + self.stop_token_ids = [] + + if self.bad_words is None: + self.bad_words = [] + + if self.logprobs is True: + self.logprobs = 1 + + if self.prompt_logprobs is True: + self.prompt_logprobs = 1 + + # Number of characters to hold back for stop string evaluation + # until sequence is finished. + if self.stop and not self.include_stop_str_in_output: + self.output_text_buffer_length = max(len(s) for s in self.stop) - 1 + + self._verify_args() + + if self.temperature < _SAMPLING_EPS: + # Zero temperature means greedy sampling. 
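+            # Illustration added for this note (not upstream code): once the
+            # temperature falls below _SAMPLING_EPS the request is treated as
+            # greedy, so the remaining sampling knobs are forced back to their
+            # "consider everything" values before _verify_greedy_sampling()
+            # checks that n == 1. A small sketch, assuming this 0.10.2 module:
+            #
+            #     sp = SamplingParams(temperature=0.0, top_p=0.5, top_k=40)
+            #     (sp.top_p, sp.top_k, sp.min_p)   # -> (1.0, 0, 0.0)
+            #     sp.sampling_type                 # -> SamplingType.GREEDY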
+ self.top_p = 1.0 + self.top_k = 0 + self.min_p = 0.0 + self._verify_greedy_sampling() + + # eos_token_id is added to this by the engine + self._all_stop_token_ids.update(self.stop_token_ids) + + def _verify_args(self) -> None: + if not isinstance(self.n, int): + raise ValueError(f"n must be an int, but is of " + f"type {type(self.n)}") + if self.n < 1: + raise ValueError(f"n must be at least 1, got {self.n}.") + if self.best_of is not None: + if not isinstance(self.best_of, int): + raise ValueError( + f"best_of must be an integer, got {type(self.best_of)}") + if self.best_of < 1: + raise ValueError( + f"best_of must be at least 1, got {self.best_of}") + if self.best_of < self.n: + raise ValueError( + f"best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}.") + if not -2.0 <= self.presence_penalty <= 2.0: + raise ValueError("presence_penalty must be in [-2, 2], got " + f"{self.presence_penalty}.") + if not -2.0 <= self.frequency_penalty <= 2.0: + raise ValueError("frequency_penalty must be in [-2, 2], got " + f"{self.frequency_penalty}.") + if self.repetition_penalty <= 0.0: + raise ValueError( + "repetition_penalty must be greater than zero, got " + f"{self.repetition_penalty}.") + if self.temperature < 0.0: + raise ValueError( + f"temperature must be non-negative, got {self.temperature}.") + if not 0.0 < self.top_p <= 1.0: + raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.") + # quietly accept -1 as disabled, but prefer 0 + if self.top_k < -1: + raise ValueError(f"top_k must be 0 (disable), or at least 1, " + f"got {self.top_k}.") + if not isinstance(self.top_k, int): + raise TypeError( + f"top_k must be an integer, got {type(self.top_k).__name__}") + if not 0.0 <= self.min_p <= 1.0: + raise ValueError("min_p must be in [0, 1], got " + f"{self.min_p}.") + if self.max_tokens is not None and self.max_tokens < 1: + raise ValueError( + f"max_tokens must be at least 1, got {self.max_tokens}.") + if self.min_tokens < 0: + raise ValueError(f"min_tokens must be greater than or equal to 0, " + f"got {self.min_tokens}.") + if self.max_tokens is not None and self.min_tokens > self.max_tokens: + raise ValueError( + f"min_tokens must be less than or equal to " + f"max_tokens={self.max_tokens}, got {self.min_tokens}.") + if (self.logprobs is not None and self.logprobs != -1 + and self.logprobs < 0): + raise ValueError( + f"logprobs must be non-negative or -1, got {self.logprobs}.") + if (self.prompt_logprobs is not None and self.prompt_logprobs != -1 + and self.prompt_logprobs < 0): + raise ValueError( + f"prompt_logprobs must be non-negative or -1, got " + f"{self.prompt_logprobs}.") + if (self.truncate_prompt_tokens is not None + and (self.truncate_prompt_tokens == 0 + or self.truncate_prompt_tokens < -1)): + raise ValueError( + f"truncate_prompt_tokens must be an integer >= 1 or -1, " + f"got {self.truncate_prompt_tokens}") + assert isinstance(self.stop_token_ids, list) + if not all(isinstance(st_id, int) for st_id in self.stop_token_ids): + raise ValueError(f"stop_token_ids must contain only integers, " + f"got {self.stop_token_ids}.") + assert isinstance(self.stop, list) + if any(not stop_str for stop_str in self.stop): + raise ValueError("stop cannot contain an empty string.") + if self.stop and not self.detokenize: + raise ValueError( + "stop strings are only supported when detokenize is True. 
" + "Set detokenize=True to use stop.") + if self.best_of != self._real_n and self.output_kind == ( + RequestOutputKind.DELTA): + raise ValueError("best_of must equal n to use output_kind=DELTA") + + def _verify_greedy_sampling(self) -> None: + if self.n > 1: + raise ValueError("n must be 1 when using greedy sampling, " + f"got {self.n}.") + + def update_from_generation_config( + self, + generation_config: dict[str, Any], + model_eos_token_id: Optional[int] = None) -> None: + """Update if there are non-default values from generation_config""" + + if model_eos_token_id is not None: + # Add the eos token id into the sampling_params to support + # min_tokens processing. + self._all_stop_token_ids.add(model_eos_token_id) + + # Update eos_token_id for generation + if (eos_ids := generation_config.get("eos_token_id")) is not None: + # it can be either int or list of int + eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids) + if model_eos_token_id is not None: + # We don't need to include the primary eos_token_id in + # stop_token_ids since it's handled separately for stopping + # purposes. + eos_ids.discard(model_eos_token_id) + if eos_ids: + self._all_stop_token_ids.update(eos_ids) + if not self.ignore_eos: + eos_ids.update(self.stop_token_ids) + self.stop_token_ids = list(eos_ids) + + def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None: + if not self.bad_words: + return + self._bad_words_token_ids = [] + for bad_word in self.bad_words: + # To prohibit words both at the beginning + # and in the middle of text + # (related to add_prefix_space tokenizer parameter) + for add_prefix_space in [False, True]: + prefix = " " if add_prefix_space else "" + prompt = prefix + bad_word.lstrip() + prompt_token_ids = tokenizer.encode(text=prompt, + add_special_tokens=False) + + # If no space at the beginning + # or if prefix space produces a new word token + if (not add_prefix_space) or ( + add_prefix_space and prompt_token_ids[0] + != self._bad_words_token_ids[-1][0] + and len(prompt_token_ids) == len( + self._bad_words_token_ids[-1])): + self._bad_words_token_ids.append(prompt_token_ids) + + invalid_token_ids = [ + token_id for bad_words_token_ids in self._bad_words_token_ids + for token_id in bad_words_token_ids + if token_id < 0 or token_id > tokenizer.max_token_id + ] + if len(invalid_token_ids) > 0: + raise ValueError( + f"The model vocabulary size is {tokenizer.max_token_id+1}," + f" but the following tokens" + f" were specified as bad: {invalid_token_ids}." + f" All token id values should be integers satisfying:" + f" 0 <= token_id <= {tokenizer.max_token_id}.") + + @cached_property + def sampling_type(self) -> SamplingType: + if self.temperature < _SAMPLING_EPS: + return SamplingType.GREEDY + if self.seed is not None: + return SamplingType.RANDOM_SEED + return SamplingType.RANDOM + + @property + def all_stop_token_ids(self) -> set[int]: + return self._all_stop_token_ids + + @property + def bad_words_token_ids(self) -> Optional[list[list[int]]]: + # For internal use only. Backward compatibility not guaranteed + return self._bad_words_token_ids + + def clone(self) -> "SamplingParams": + """Deep copy, but maybe not the LogitsProcessor objects. + + LogitsProcessor objects may contain an arbitrary, nontrivial amount of + data that is expensive to copy. 
However, if not copied, the processor + needs to support parallel decoding for multiple sequences + See https://github.com/vllm-project/vllm/issues/3087 + """ + + logit_processor_refs = None if self.logits_processors is None else { + id(lp): lp.clone() if hasattr(lp, 'clone') else lp + for lp in self.logits_processors + } + return copy.deepcopy(self, memo=logit_processor_refs) + + def __repr__(self) -> str: + return ( + f"SamplingParams(n={self.n}, " + f"presence_penalty={self.presence_penalty}, " + f"frequency_penalty={self.frequency_penalty}, " + f"repetition_penalty={self.repetition_penalty}, " + f"temperature={self.temperature}, " + f"top_p={self.top_p}, " + f"top_k={self.top_k}, " + f"min_p={self.min_p}, " + f"seed={self.seed}, " + f"stop={self.stop}, " + f"stop_token_ids={self.stop_token_ids}, " + f"bad_words={self.bad_words}, " + f"include_stop_str_in_output={self.include_stop_str_in_output}, " + f"ignore_eos={self.ignore_eos}, " + f"max_tokens={self.max_tokens}, " + f"min_tokens={self.min_tokens}, " + f"logprobs={self.logprobs}, " + f"prompt_logprobs={self.prompt_logprobs}, " + f"skip_special_tokens={self.skip_special_tokens}, " + "spaces_between_special_tokens=" + f"{self.spaces_between_special_tokens}, " + f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " + f"guided_decoding={self.guided_decoding}, " + f"extra_args={self.extra_args})") + + +class BeamSearchParams( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + # required for @cached_property. + dict=True): # type: ignore[call-arg] + """Beam search parameters for text generation.""" + beam_width: int + max_tokens: int + ignore_eos: bool = False + temperature: float = 0.0 + length_penalty: float = 1.0 + include_stop_str_in_output: bool = False ``` ## 崩溃日志片段