vllm.entrypoints.serve.disagg.protocol ¶

GenerateRequest ¶

Bases: BaseModel

Source code in vllm/entrypoints/serve/disagg/protocol.py

class GenerateRequest(BaseModel):
    # Request schema carrying pre-tokenized input (`token_ids`) together with
    # sampling, streaming, caching, scheduling and KV-transfer options.
    # NOTE: no class docstring is used on purpose here; pydantic would surface
    # it as the schema description.

    # Caller-supplied identifier; a fresh random_uuid() is used when omitted.
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )

    # Required. Checked by `validate_token_ids` (no negative entries).
    token_ids: list[int]
    """The token ids to generate text from."""

    # Optional multimodal metadata; None when not provided.
    features: MultiModalFeatures | None = None
    """Multimodal hashes and placeholder positions (populated for MM inputs)."""

    # Required.
    sampling_params: SamplingParams
    """The sampling parameters for the model."""

    # Optional model name; defaults to None.
    model: str | None = None

    # Streaming controls: `stream` defaults to False, options default to None.
    stream: bool | None = False
    stream_options: StreamOptions | None = None

    # Optional salt for the prefix cache; defaults to None.
    cache_salt: str | None = Field(
        default=None,
        description=(
            "If specified, the prefix cache will be salted with the provided "
            "string to prevent an attacker to guess prompts in multi-user "
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit)."
        ),
    )

    # Scheduling priority, constrained to the signed 64-bit integer range.
    priority: int = Field(
        default=0,
        ge=-(2**63),
        le=2**63 - 1,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )

    # Free-form KV-transfer settings; defaults to None.
    kv_transfer_params: dict[str, Any] | None = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )

    @field_validator("token_ids")
    @classmethod
    def validate_token_ids(cls, v: list[int]) -> list[int]:
        """Return `v` unchanged after checking that no token id is negative.

        Raises:
            ValueError: if any entry of `v` is below zero.
        """
        for token_id in v:
            if token_id < 0:
                raise ValueError("token_ids must not contain negative values")
        return v

    def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
        """Build the tokenization parameters for this request.

        The result always has `max_total_tokens=None` and
        `max_output_tokens=0`. `model_config` is accepted but not read here.
        """
        tok_params = TokenizeParams(
            max_total_tokens=None,
            max_output_tokens=0,
        )
        return tok_params

features `class-attribute` `instance-attribute` ¶

features: MultiModalFeatures | None = None

Multimodal hashes and placeholder positions (populated for MM inputs).

sampling_params `instance-attribute` ¶

sampling_params: SamplingParams

The sampling parameters for the model.

token_ids `instance-attribute` ¶

token_ids: list[int]

The token ids to generate text from.

MultiModalFeatures ¶

Bases: BaseModel

Lightweight multimodal metadata produced by the render step.

Carries hashes (for cache lookup / identification) and placeholder positions so the downstream /generate service knows where in the token sequence each multimodal item lives.

Source code in vllm/entrypoints/serve/disagg/protocol.py

class MultiModalFeatures(BaseModel):
    """Lightweight multimodal metadata produced by the render step.

    Carries hashes (for cache lookup / identification) and placeholder
    positions so the downstream `/generate` service knows *where* in
    the token sequence each multimodal item lives.
    """

    # Required. Maps a modality name to the hashes of its items.
    mm_hashes: dict[str, list[str]]
    """Per-modality item hashes, e.g. `{"image": ["abc", "def"]}`."""

    # Required. Maps a modality name to a list of placeholder ranges.
    # NOTE(review): presumably one range per item, in the same order as
    # `mm_hashes[modality]` -- confirm against the producer.
    mm_placeholders: dict[str, list[PlaceholderRangeInfo]]
    """Per-modality placeholder ranges in the token sequence."""

    # Optional; defaults to None.
    # NOTE(review): the parallel-list relationship with `mm_hashes` described
    # below is a documented convention only; this class defines no validator
    # that checks it.
    kwargs_data: dict[str, list[str | None]] | None = None
    """Per-modality serialized tensor data.

    Each value is a list parallel to ``mm_hashes[modality]``.  A ``str``
    entry is a base64-encoded ``MultiModalKwargsItem``; ``None`` means
    the item should be resolved from cache.  The entire field is
    ``None`` for metadata-only (cache-hit) responses.
    """

kwargs_data `class-attribute` `instance-attribute` ¶

kwargs_data: dict[str, list[str | None]] | None = None

Per-modality serialized tensor data.

Each value is a list parallel to mm_hashes[modality]. A str entry is a base64-encoded MultiModalKwargsItem; None means the item should be resolved from cache. The entire field is None for metadata-only (cache-hit) responses.

mm_hashes `instance-attribute` ¶

mm_hashes: dict[str, list[str]]

Per-modality item hashes, e.g. {"image": ["abc", "def"]}.

mm_placeholders `instance-attribute` ¶

mm_placeholders: dict[str, list[PlaceholderRangeInfo]]

Per-modality placeholder ranges in the token sequence.

PlaceholderRangeInfo ¶

Bases: BaseModel

Serializable placeholder location for a single multi-modal item.

Source code in vllm/entrypoints/serve/disagg/protocol.py

class PlaceholderRangeInfo(BaseModel):
    """Serializable placeholder location for a single multi-modal item."""

    # Both fields are required and must be non-negative. They describe a
    # position and a count within the token sequence and arrive in a request
    # body, so a negative value is always malformed input. Rejecting it at
    # validation time is consistent with how negative `token_ids` entries are
    # rejected on the request model.

    offset: int = Field(ge=0)
    """Start index of the placeholder tokens in the prompt."""

    length: int = Field(ge=0)
    """Number of placeholder tokens."""

length `instance-attribute` ¶

length: int

Number of placeholder tokens.

offset `instance-attribute` ¶

offset: int

Start index of the placeholder tokens in the prompt.

vllm.entrypoints.serve.disagg.protocol ¶

GenerateRequest ¶

features class-attribute instance-attribute ¶

sampling_params instance-attribute ¶

token_ids instance-attribute ¶

MultiModalFeatures ¶

kwargs_data class-attribute instance-attribute ¶

mm_hashes instance-attribute ¶

mm_placeholders instance-attribute ¶

PlaceholderRangeInfo ¶

length instance-attribute ¶

offset instance-attribute ¶

features `class-attribute` `instance-attribute` ¶

sampling_params `instance-attribute` ¶

token_ids `instance-attribute` ¶

kwargs_data `class-attribute` `instance-attribute` ¶

mm_hashes `instance-attribute` ¶

mm_placeholders `instance-attribute` ¶

length `instance-attribute` ¶

offset `instance-attribute` ¶