11 Commits

Author SHA1 Message Date
  Laith Sakka f569c654e1
enable unbacked with aot_compile (#30462) 2 days ago
  Micah Williamson 97f2f160fd
[ROCm][CI] Add "Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy Test" Back Into AMD CI (#30590) 2 days ago
  Kayvan Mivehnejad 29f7d97715
Improve parse_raw_prompt test cases for invalid input .v2 (#30512) 2 days ago
  Qier Li dc7fb5bebe
[Bug][KVConnector][Metrics] Remove a vacuous assertion breaking external-launcher (#30577) 2 days ago
  Qidong Su 24429d5924
[Doc] Add instructions for building docker image on GB300 with CUDA13 (#30414) 2 days ago
  Wentao Ye 6e78ed6ba7
[Logs] Optimize startup logs 4 (#29903) 2 days ago
  Isotr0py 7c16f3fbcc
[Doc] Add documents for multi-node distributed serving with MP backend (#30509) 2 days ago
  lif ddbfbe5278
[Docs] Clarify Expert Parallel behavior for attention and MoE layers (#30615) 2 days ago
  Laith Sakka 763963aa73
set assume_32bit_indexing and pass unbacked hints (#30459) 2 days ago
  Cyrus Leung 39cefbdf17
[Refactor] `TokenizerRegistry` only uses lazy imports (#30609) 2 days ago
  Chen Zhang ace34e3783
[Bugfix] Qwen3-next with --hf-overrides \{\"num_hidden_layers\":8\} (#30433) 2 days ago
31 changed files with 424 additions and 227 deletions
Split View
  1. +74
    -0
      .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
  2. +0
    -1
      .buildkite/test-amd.yaml
  3. +20
    -1
      docs/deployment/docker.md
  4. +2
    -2
      docs/serving/data_parallel_deployment.md
  5. +21
    -1
      docs/serving/expert_parallel_deployment.md
  6. +23
    -1
      docs/serving/parallelism_scaling.md
  7. +8
    -2
      tests/compile/test_dynamic_shapes_compilation.py
  8. +9
    -2
      tests/test_inputs.py
  9. +24
    -23
      tests/tokenizers_/test_basic.py
  10. +21
    -2
      tests/tokenizers_/test_registry.py
  11. +22
    -11
      vllm/compilation/decorators.py
  12. +0
    -3
      vllm/distributed/eplb/rebalance_execute.py
  13. +0
    -3
      vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
  14. +3
    -2
      vllm/entrypoints/chat_utils.py
  15. +17
    -8
      vllm/inputs/parse.py
  16. +5
    -6
      vllm/model_executor/layers/fused_moe/fused_moe.py
  17. +3
    -1
      vllm/model_executor/layers/fused_moe/layer.py
  18. +7
    -0
      vllm/model_executor/models/qwen3_next.py
  19. +3
    -2
      vllm/platforms/cuda.py
  20. +8
    -5
      vllm/profiler/wrapper.py
  21. +0
    -6
      vllm/tokenizers/__init__.py
  22. +33
    -14
      vllm/tokenizers/deepseekv32.py
  23. +7
    -12
      vllm/tokenizers/hf.py
  24. +2
    -5
      vllm/tokenizers/mistral.py
  25. +1
    -1
      vllm/tokenizers/protocol.py
  26. +100
    -100
      vllm/tokenizers/registry.py
  27. +3
    -3
      vllm/transformers_utils/tokenizer.py
  28. +2
    -2
      vllm/v1/engine/async_llm.py
  29. +2
    -2
      vllm/v1/engine/llm_engine.py
  30. +2
    -4
      vllm/v1/executor/multiproc_executor.py
  31. +2
    -2
      vllm/v1/structured_output/__init__.py

+ 74
- 0
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh View File

@@ -0,0 +1,74 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}

MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT

TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

+ 0
- 1
.buildkite/test-amd.yaml View File

@@ -1629,7 +1629,6 @@ steps:
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"


+ 20
- 1
docs/deployment/docker.md View File

@@ -82,7 +82,7 @@ DOCKER_BUILDKIT=1 docker build . \

## Building for Arm64/aarch64

A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper and Grace-Blackwell. Using the flag `--platform "linux/arm64"` will build for arm64.

!!! note
Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
@@ -104,6 +104,25 @@ A docker container can be built for aarch64 systems such as the Nvidia Grace-Hop
--build-arg RUN_WHEEL_CHECK=false
```

For (G)B300, we recommend using CUDA 13, as shown in the following command.

??? console "Command"

```bash
DOCKER_BUILDKIT=1 docker build \
--build-arg CUDA_VERSION=13.0.1 \
--build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 \
--build-arg max_jobs=256 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list='9.0 10.0+PTX' \
--platform "linux/arm64" \
--tag vllm/vllm-gb300-openai:latest \
--target vllm-openai \
-f docker/Dockerfile \
.
```

!!! note
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.



+ 2
- 2
docs/serving/data_parallel_deployment.md View File

@@ -8,11 +8,11 @@ For MoE models, particularly those like DeepSeek that employ MLA (Multi-head Lat

In these cases, the data parallel ranks are not completely independent. Forward passes must be aligned, and expert layers across all ranks are required to synchronize during every forward pass, even when there are fewer requests to be processed than DP ranks.

The expert layers will by default form a (DP x TP) sized tensor parallel group. To enable expert parallelism, include the `--enable-expert-parallel` CLI arg (on all nodes in the multi-node case).
By default, expert layers form a tensor parallel group of size `DP × TP`. To use expert parallelism instead, include the `--enable-expert-parallel` CLI arg (on all nodes in the multi-node case). See [Expert Parallel Deployment](expert_parallel_deployment.md) for details on how attention and expert layers behave differently with EP enabled.

In vLLM, each DP rank is deployed as a separate "core engine" process that communicates with front-end process(es) via ZMQ sockets. Data Parallel attention can be combined with Tensor Parallel attention, in which case each DP engine owns a number of per-GPU worker processes equal to the configured TP size.

For MoE models, when any requests are in progress in any rank, we must ensure that empty "dummy" forward passes are performed in all ranks that don't currently have any requests scheduled. This is handled via a separate DP Coordinator process that communicates with all ranks, and a collective operation performed every N steps to determine when all ranks become idle and can be paused. When TP is used in conjunction with DP, expert layers form an EP or TP group of size (DP x TP).
For MoE models, when any requests are in progress in any rank, we must ensure that empty "dummy" forward passes are performed in all ranks that don't currently have any requests scheduled. This is handled via a separate DP Coordinator process that communicates with all ranks, and a collective operation performed every N steps to determine when all ranks become idle and can be paused. When TP is used in conjunction with DP, expert layers form a group of size `DP × TP` (using either tensor parallelism by default, or expert parallelism if `--enable-expert-parallel` is set).

In all cases, it is beneficial to load-balance requests between DP ranks. For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently.



+ 21
- 1
docs/serving/expert_parallel_deployment.md View File

@@ -44,7 +44,27 @@ Where:
- `DP_SIZE`: Data parallel size
- `EP_SIZE`: Expert parallel size (computed automatically)

When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`.
### Layer Behavior with EP Enabled

When EP is enabled, different layers in MoE models behave differently:

| Layer Type | Behavior | Parallelism Used |
|------------|----------|------------------|
| **Expert (MoE) Layers** | Sharded across all EP ranks | Expert Parallel (EP) of size `TP × DP` |
| **Attention Layers** | Behavior depends on TP size | See below |

**Attention layer parallelism:**

- **When `TP = 1`**: Attention weights are **replicated** across all DP ranks (data parallelism)
- **When `TP > 1`**: Attention weights are **sharded** using tensor parallelism across TP ranks within each DP group

For example, with `TP=2, DP=4` (8 GPUs total):

- Expert layers form an EP group of size 8, with experts distributed across all GPUs
- Attention layers use TP=2 within each of the 4 DP groups

!!! note "Key Difference from Data Parallel Deployment"
Without `--enable-expert-parallel`, MoE layers would use tensor parallelism (forming a TP group of size `TP × DP`), similar to dense models. With EP enabled, expert layers switch to expert parallelism, which can provide better efficiency and locality for MoE models.

### Example Command



+ 23
- 1
docs/serving/parallelism_scaling.md View File

@@ -62,7 +62,7 @@ If a single node lacks sufficient GPUs to hold the model, deploy vLLM across mul

### What is Ray?

Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments require Ray as the runtime engine.
Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments can use Ray as the runtime engine.

vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens.

@@ -130,6 +130,28 @@ vllm serve /path/to/the/model/in/the/container \
--distributed-executor-backend ray
```

### Running vLLM with MultiProcessing

Besides Ray, Multi-node vLLM deployments can also use `multiprocessing` as the runtime engine. Here's an example to deploy model across 2 nodes (8 GPUs per node) with `tp_size=8` and `pp_size=2`.

Choose one node as the head node and run:

```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 8 --pipeline-parallel-size 2 \
--nnodes 2 --node-rank 0 \
--master-addr <HEAD_NODE_IP>
```

On the other worker node, run:

```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 8 --pipeline-parallel-size 2 \
--nnodes 2 --node-rank 1 \
--master-addr <HEAD_NODE_IP> --headless
```

## Optimizing network communication for tensor parallelism

Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.


+ 8
- 2
tests/compile/test_dynamic_shapes_compilation.py View File

@@ -36,7 +36,7 @@ def get_test_models():
DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
],
)
@pytest.mark.parametrize("use_aot_compile", ["0"])
@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.parametrize("evaluate_guards", [False, True])
@pytest.mark.skipif(
@@ -54,6 +54,12 @@ def test_dynamic_shapes_compilation(
if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")

if evaluate_guards and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("unbacked dynamic shapes do not add guards")

if evaluate_guards and use_aot_compile:
pytest.skip("evaluate_guards requires use_aot_compile=0")

monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")

@@ -120,7 +126,7 @@ def test_model_specialization_with_evaluate_guards(
and dynamic_shapes_type == DynamicShapesType.BACKED
and evaluate_guards
):
pytest.skip("evaluate_guards for backed does not work with aot_compile =1")
pytest.skip("evaluate_guards for backed does not work with aot_compile=1")

@support_torch_compile
class ModelWithSizeCheck(torch.nn.Module):


+ 9
- 2
tests/test_inputs.py View File

@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config

pytestmark = pytest.mark.cpu_test

@@ -34,6 +34,13 @@ INPUTS_SLICES = [
]


# Test that a nested mixed-type list of lists raises a TypeError.
@pytest.mark.parametrize("invalid_input", [[[1, 2], ["foo", "bar"]]])
def test_invalid_input_raise_type_error(invalid_input):
with pytest.raises(TypeError):
parse_raw_prompts(invalid_input)


def test_parse_raw_single_batch_empty():
with pytest.raises(ValueError, match="at least one prompt"):
parse_raw_prompts([])
@@ -108,7 +115,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_config(model_config)
tokenizer = cached_tokenizer_from_config(model_config)
input_preprocessor = InputPreprocessor(model_config, tokenizer)

# HF processor adds sep token


+ 24
- 23
tests/tokenizers_/test_basic.py View File

@@ -3,38 +3,39 @@
from typing import _get_protocol_attrs # type: ignore

import pytest
from transformers import PreTrainedTokenizerBase
from transformers import (
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)

from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer


def _get_missing_attrs(obj: object, target: type):
return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)]


def _assert_tokenizer_like(tokenizer: object):
missing_attrs = _get_missing_attrs(tokenizer, TokenizerLike)
assert not missing_attrs, f"Missing attrs: {missing_attrs}"


def test_tokenizer_like_protocol():
assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer("gpt2", use_fast=False),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"

assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer("gpt2", use_fast=True),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"

assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"
tokenizer = get_tokenizer("gpt2", use_fast=False)
assert isinstance(tokenizer, PreTrainedTokenizer)
_assert_tokenizer_like(tokenizer)

tokenizer = get_tokenizer("gpt2", use_fast=True)
assert isinstance(tokenizer, PreTrainedTokenizerFast)
_assert_tokenizer_like(tokenizer)

tokenizer = get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
)
assert isinstance(tokenizer, MistralTokenizer)
_assert_tokenizer_like(tokenizer)


@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])


+ 21
- 2
tests/tokenizers_/test_registry.py View File

@@ -2,7 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer
import pytest

from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.registry import (
TokenizerRegistry,
get_tokenizer,
resolve_tokenizer_args,
)


class TestTokenizer(TokenizerLike):
@@ -40,10 +47,22 @@ class TestTokenizer(TokenizerLike):
return True


@pytest.mark.parametrize("runner_type", ["generate", "pooling"])
def test_resolve_tokenizer_args_idempotent(runner_type):
tokenizer_mode, tokenizer_name, args, kwargs = resolve_tokenizer_args(
"facebook/opt-125m",
runner_type=runner_type,
)

assert (tokenizer_mode, tokenizer_name, args, kwargs) == resolve_tokenizer_args(
tokenizer_name, *args, **kwargs
)


def test_customized_tokenizer():
TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)

tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
tokenizer = TokenizerRegistry.load_tokenizer("test_tokenizer", "abc")
assert isinstance(tokenizer, TestTokenizer)
assert tokenizer.path_or_repo_id == "abc"
assert tokenizer.bos_token_id == 0


+ 22
- 11
vllm/compilation/decorators.py View File

@@ -28,7 +28,7 @@ from vllm.config.compilation import DynamicShapesType
from vllm.logger import init_logger
from vllm.sequence import IntermediateTensors
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.torch_utils import supports_dynamo
from vllm.utils.torch_utils import is_torch_equal_or_newer, supports_dynamo

from .monitor import start_monitoring_torch_compile

@@ -316,7 +316,13 @@ def _support_torch_compile(
def _mark_dynamic_inputs(mod, type, *args, **kwargs):
def mark_dynamic(arg, dims):
if type == DynamicShapesType.UNBACKED:
torch._dynamo.decorators.mark_unbacked(arg, dims)
if is_torch_equal_or_newer("2.10.0.dev"):
for dim in dims:
torch._dynamo.decorators.mark_unbacked(
arg, dim, hint_override=arg.size()[dim]
)
else:
torch._dynamo.decorators.mark_unbacked(arg, dims)
else:
torch._dynamo.mark_dynamic(arg, dims)

@@ -350,7 +356,13 @@ def _support_torch_compile(
if isinstance(arg, torch.Tensor):
# In case dims is specified with negative indexing
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
torch._dynamo.decorators.mark_unbacked(arg, dims)
if is_torch_equal_or_newer("2.10.0.dev"):
for dim in dims:
torch._dynamo.decorators.mark_unbacked(
arg, dim, hint_override=arg.size()[dim]
)
else:
torch._dynamo.decorators.mark_unbacked(arg, dims)

def __call__(self, *args, **kwargs):
# torch.compiler.is_compiling() means we are inside the compilation
@@ -378,14 +390,6 @@ def _support_torch_compile(
serialized backend artifacts), then we need to generate a new AOT
compile artifact from scratch.
"""
# Validate that AOT compile is not used with unbacked dynamic
# shapes. aot_compile re-allocates backed symbols post dynamo!
if ds_type == DynamicShapesType.UNBACKED:
raise ValueError(
"AOT compilation is not compatible with UNBACKED dynamic shapes. "
"Please use BACKED or BACKED_SIZE_OBLIVIOUS dynamic shapes type "
"when VLLM_USE_AOT_COMPILE is enabled."
)
from .caching import compilation_config_hash_factors

factors: list[str] = compilation_config_hash_factors(self.vllm_config)
@@ -488,6 +492,12 @@ def _support_torch_compile(
if ds_type == DynamicShapesType.BACKED_SIZE_OBLIVIOUS:
fx_config_patches["backed_size_oblivious"] = True

# Prepare inductor config patches
# assume_32bit_indexing is only available in torch 2.10.0.dev+
inductor_config_patches = {}
if is_torch_equal_or_newer("2.10.0.dev"):
inductor_config_patches["assume_32bit_indexing"] = True

with (
patch.object(
InliningInstructionTranslator, "inline_call_", patched_inline_call
@@ -496,6 +506,7 @@ def _support_torch_compile(
maybe_use_cudagraph_partition_wrapper(self.vllm_config),
torch.fx.experimental._config.patch(**fx_config_patches),
_torch27_patch_tensor_subclasses(),
torch._inductor.config.patch(**inductor_config_patches),
):
if envs.VLLM_USE_AOT_COMPILE:
self.aot_compiled_fn = self.aot_compile(*args, **kwargs)


+ 0
- 3
vllm/distributed/eplb/rebalance_execute.py View File

@@ -322,9 +322,6 @@ async def transfer_layer(
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
assert num_physical_experts == ep_size * num_local_physical_experts
# A buffer to hold the expert weights in one layer during the exchange.
# NOTE: Currently we assume the same weights across different layers
# have the same shape.

is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
num_local_experts=num_local_physical_experts,


+ 0
- 3
vllm/distributed/kv_transfer/kv_connector/v1/metrics.py View File

@@ -7,7 +7,6 @@ from prometheus_client import Counter, Gauge, Histogram

from vllm.config import KVTransferConfig, VllmConfig
from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
from vllm.distributed.kv_transfer.kv_transfer_state import has_kv_transfer_group
from vllm.logger import init_logger

PromMetric: TypeAlias = Gauge | Counter | Histogram
@@ -53,8 +52,6 @@ class KVConnectorStats:

class KVConnectorLogging:
def __init__(self, kv_transfer_config: KVTransferConfig | None):
# This should be called on frontend process.
assert not has_kv_transfer_group()
# Instantiate the connector's stats class.
if kv_transfer_config and kv_transfer_config.kv_connector:
self.connector_cls = KVConnectorFactory.get_connector_class(


+ 3
- 2
vllm/entrypoints/chat_utils.py View File

@@ -50,7 +50,6 @@ from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
from vllm.transformers_utils.processor import cached_get_processor
from vllm.utils import random_uuid
@@ -60,6 +59,8 @@ from vllm.utils.import_utils import LazyLoader

if TYPE_CHECKING:
import torch

from vllm.tokenizers.mistral import MistralTokenizer
else:
torch = LazyLoader("torch", globals(), "torch")

@@ -1832,7 +1833,7 @@ def apply_hf_chat_template(


def apply_mistral_chat_template(
tokenizer: MistralTokenizer,
tokenizer: "MistralTokenizer",
messages: list[ChatCompletionMessageParam],
chat_template: str | None,
tools: list[dict[str, Any]] | None,


+ 17
- 8
vllm/inputs/parse.py View File

@@ -33,22 +33,31 @@ def parse_raw_prompts(
if len(prompt) == 0:
raise ValueError("please provide at least one prompt")

# case 2: array of strings
if is_list_of(prompt, str):
# case 2: array of strings
prompt = cast(list[str], prompt)
return [TextPrompt(prompt=elem) for elem in prompt]

# case 3: array of tokens
if is_list_of(prompt, int):
# case 3: array of tokens
prompt = cast(list[int], prompt)
return [TokensPrompt(prompt_token_ids=prompt)]

# case 4: array of token arrays
if is_list_of(prompt, list):
prompt = cast(list[list[int]], prompt)
if len(prompt[0]) == 0:
raise ValueError("please provide at least one prompt")
first = prompt[0]
if not isinstance(first, list):
raise ValueError("prompt expected to be a list of lists")

if is_list_of(prompt[0], int):
# case 4: array of token arrays
return [TokensPrompt(prompt_token_ids=elem) for elem in prompt]
if len(first) == 0:
raise ValueError("Please provide at least one prompt")

# strict validation: every nested list must be list[int]
if not all(is_list_of(elem, int) for elem in prompt):
raise TypeError("Nested lists must contain only integers")

prompt = cast(list[list[int]], prompt)
return [TokensPrompt(prompt_token_ids=elem) for elem in prompt]

raise TypeError(
"prompt must be a string, array of strings, "


+ 5
- 6
vllm/model_executor/layers/fused_moe/fused_moe.py View File

@@ -885,12 +885,11 @@ def get_moe_configs(

# If no optimized configuration is available, we will use the default
# configuration
logger.warning(
(
"Using default MoE config. Performance might be sub-optimal! "
"Config file not found at %s"
),
config_file_paths,
logger.warning_once(
"Using default MoE config. Performance might be sub-optimal! "
"Config file not found at %s",
", ".join(config_file_paths),
scope="local",
)
return None



+ 3
- 1
vllm/model_executor/layers/fused_moe/layer.py View File

@@ -369,7 +369,9 @@ class FusedMoE(CustomOp):
# aux_stream() returns None on non-cuda-alike platforms.
self.shared_experts_stream = aux_stream()
if self.shared_experts_stream is not None:
logger.info_once("Enabled separate cuda stream for MoE shared_experts")
logger.info_once(
"Enabled separate cuda stream for MoE shared_experts", scope="local"
)

if params_dtype is None:
params_dtype = torch.get_default_dtype()


+ 7
- 0
vllm/model_executor/models/qwen3_next.py View File

@@ -1092,6 +1092,8 @@ class Qwen3NextModel(nn.Module):
name.endswith(".bias") or name.endswith("_bias")
) and name not in params_dict:
continue
if name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(
@@ -1108,6 +1110,11 @@ class Qwen3NextModel(nn.Module):
continue
if is_pp_missing_parameter(name, self):
continue
if name not in params_dict:
logger.warning_once(
f"Parameter {name} not found in params_dict, skip loading"
)
continue
param = params_dict[name]
weight_loader = getattr(
param, "weight_loader", default_weight_loader


+ 3
- 2
vllm/platforms/cuda.py View File

@@ -409,10 +409,11 @@ class CudaPlatformBase(Platform):
)
selected_index = sorted_indices[0]
selected_backend = valid_backends_priorities[selected_index][0]
logger.info(
logger.info_once(
"Using %s attention backend out of potential backends: %s",
selected_backend.name,
[b[0].name for b in valid_backends_priorities],
tuple(b[0].name for b in valid_backends_priorities),
scope="local",
)

return selected_backend.get_path()


+ 8
- 5
vllm/profiler/wrapper.py View File

@@ -61,7 +61,7 @@ class WorkerProfiler(ABC):
"""Call _stop with error handling but no safeguards."""
try:
self._stop()
logger.info("Profiler stopped successfully.")
logger.info_once("Profiler stopped successfully.", scope="local")
except Exception as e:
logger.warning("Failed to stop profiler: %s", e)
self._running = False # Always mark as not running, assume stop worked
@@ -91,7 +91,7 @@ class WorkerProfiler(ABC):
and self._delay_iters > 0
and self._active_iteration_count == self._delay_iters
):
logger.info("Starting profiler after delay...")
logger.info_once("Starting profiler after delay...", scope="local")
self._call_start()

if self._running:
@@ -105,7 +105,9 @@ class WorkerProfiler(ABC):
# Automatically stop the profiler after max iters
# will be marked as not running, but leave as active so that stop
# can clean up properly
logger.info("Max profiling iterations reached. Stopping profiler...")
logger.info_once(
"Max profiling iterations reached. Stopping profiler...", scope="local"
)
self._call_stop()
return

@@ -125,7 +127,7 @@ class WorkerProfiler(ABC):

def shutdown(self) -> None:
"""Ensure profiler is stopped when shutting down."""
logger.info_once("Shutting down profiler")
logger.info_once("Shutting down profiler", scope="local")
if self._running:
self.stop()

@@ -156,9 +158,10 @@ class TorchProfilerWrapper(WorkerProfiler):
self.profiler_config = profiler_config
torch_profiler_trace_dir = profiler_config.torch_profiler_dir
if local_rank in (None, 0):
logger.info(
logger.info_once(
"Torch profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir,
scope="local",
)
logger.debug(
"Profiler config: record_shapes=%s,"


+ 0
- 6
vllm/tokenizers/__init__.py View File

@@ -1,9 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from .deepseekv32 import DeepseekV32Tokenizer
from .hf import HfTokenizer
from .mistral import MistralTokenizer
from .protocol import TokenizerLike
from .registry import (
TokenizerRegistry,
@@ -15,12 +12,9 @@ from .registry import (

__all__ = [
"TokenizerLike",
"HfTokenizer",
"MistralTokenizer",
"TokenizerRegistry",
"cached_get_tokenizer",
"get_tokenizer",
"cached_tokenizer_from_config",
"init_tokenizer_from_config",
"DeepseekV32Tokenizer",
]

+ 33
- 14
vllm/tokenizers/deepseekv32.py View File

@@ -2,24 +2,18 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from pathlib import Path
from typing import Any

from transformers import BatchEncoding

from .deepseek_v32_encoding import encode_messages
from .hf import HfTokenizer, TokenizerLike
from .registry import TokenizerRegistry
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam

from .deepseek_v32_encoding import encode_messages
from .hf import CachedHfTokenizer
from .protocol import TokenizerLike

@TokenizerRegistry.register("deepseek_v32")
class DeepseekV32Tokenizer(HfTokenizer):
def __init__(self, tokenizer: TokenizerLike):
self.tokenizer = tokenizer
self.name_or_path = (
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
)
self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)

class DeepseekV32Tokenizer(CachedHfTokenizer):
@classmethod
def from_pretrained(
cls,
@@ -40,7 +34,21 @@ class DeepseekV32Tokenizer(HfTokenizer):
)
return DeepseekV32Tokenizer(tokenizer)

def apply_chat_template(self, messages, tools=None, **kwargs):
def __init__(self, tokenizer: TokenizerLike) -> None:
super().__init__()

self.tokenizer = tokenizer
self.name_or_path = getattr(tokenizer, "name_or_path", "")

self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)

def apply_chat_template(
self,
messages: list["ChatCompletionMessageParam"],
tools: list[dict[str, Any]] | None = None,
**kwargs,
) -> str | list[int]:
thinking = kwargs.get("thinking", False)
thinking_mode = "thinking"
if not thinking:
@@ -49,13 +57,24 @@ class DeepseekV32Tokenizer(HfTokenizer):
messages = conversation.copy()
if tools is not None and len(tools) > 0:
messages.insert(0, {"role": "system"})
messages[0]["tools"] = tools
messages[0]["tools"] = tools # type: ignore[typeddict-unknown-key]

# Historical reasoning content is dropped when a new user message is introduced
drop_thinking = messages[-1]["role"] == "user"

encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking)
prompt_str = encode_messages(messages, **encode_config) # type: ignore

if kwargs.get("tokenize", True):
tokenizer_kwargs = {
k: kwargs[k] for k in ("truncation", "max_length") if k in kwargs
}
return self.encode(
prompt_str,
add_special_tokens=False,
**tokenizer_kwargs,
)

return prompt_str

def num_special_tokens_to_add(self) -> int:


+ 7
- 12
vllm/tokenizers/hf.py View File

@@ -3,22 +3,18 @@
import contextlib
import copy
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TypeAlias

from transformers import AutoTokenizer
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config

from .protocol import TokenizerLike
from .registry import TokenizerRegistry

if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
HfTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast


def get_cached_tokenizer(
tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast",
) -> TokenizerLike:
def get_cached_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
"""
By default, transformers will recompute multiple tokenizer properties
each time they are called, leading to a significant slowdown.
@@ -65,11 +61,10 @@ def get_cached_tokenizer(
CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"

cached_tokenizer.__class__ = CachedTokenizer
return cached_tokenizer # type: ignore
return cached_tokenizer


@TokenizerRegistry.register("hf")
class HfTokenizer(TokenizerLike):
class CachedHfTokenizer(TokenizerLike):
@classmethod
def from_pretrained(
cls,
@@ -79,7 +74,7 @@ class HfTokenizer(TokenizerLike):
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> "TokenizerLike":
) -> HfTokenizer:
try:
tokenizer = AutoTokenizer.from_pretrained(
path_or_repo_id,


+ 2
- 5
vllm/tokenizers/mistral.py View File

@@ -3,10 +3,11 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast

from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.logger import init_logger

from .protocol import TokenizerLike
from .registry import TokenizerRegistry

if TYPE_CHECKING:
from mistral_common.protocol.instruct.request import (
@@ -15,9 +16,6 @@ if TYPE_CHECKING:
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from transformers import BatchEncoding

from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

try:
# Transformers v5
from transformers.tokenization_mistral_common import MistralCommonBackend
@@ -201,7 +199,6 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
return tokenizer.unk_id


@TokenizerRegistry.register("mistral")
class MistralTokenizer(TokenizerLike):
@classmethod
def from_pretrained(


+ 1
- 1
vllm/tokenizers/protocol.py View File

@@ -97,7 +97,7 @@ class TokenizerLike(Protocol):
messages: list["ChatCompletionMessageParam"],
tools: list[dict[str, Any]] | None = None,
**kwargs,
) -> list[int]:
) -> str | list[int]:
raise NotImplementedError

def convert_tokens_to_string(self, tokens: list[str]) -> str:


+ 100
- 100
vllm/tokenizers/registry.py View File

@@ -1,13 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
from collections.abc import Callable
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, TypeVar, overload
from typing import TYPE_CHECKING

import huggingface_hub
from typing_extensions import assert_never
from typing_extensions import TypeVar, assert_never, deprecated

import vllm.envs as envs
from vllm.logger import init_logger
@@ -24,46 +24,25 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike

if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.config.model import ModelConfig, RunnerType

logger = init_logger(__name__)

_T = TypeVar("_T", bound=type[TokenizerLike])

_VLLM_TOKENIZERS = {
"deepseekv32": ("deepseekv32", "DeepseekV32Tokenizer"),
"hf": ("hf", "CachedHfTokenizer"),
"mistral": ("mistral", "MistralTokenizer"),
}

class TokenizerRegistry:
# Tokenizer name -> tokenizer_cls or (tokenizer module, tokenizer class)
REGISTRY: dict[str, type[TokenizerLike] | tuple[str, str]] = {}

# In-tree tokenizers
@staticmethod
@overload
def register(tokenizer_mode: str) -> Callable[[_T], _T]: ...
@dataclass
class _TokenizerRegistry:
# Tokenizer mode -> (tokenizer module, tokenizer class)
tokenizers: dict[str, tuple[str, str]] = field(default_factory=dict)

# OOT tokenizers
@staticmethod
@overload
def register(tokenizer_mode: str, module: str, class_name: str) -> None: ...

@staticmethod
def register(
tokenizer_mode: str,
module: str | None = None,
class_name: str | None = None,
) -> Callable[[_T], _T] | None:
# In-tree tokenizers
if module is None or class_name is None:

def wrapper(tokenizer_cls: _T) -> _T:
assert tokenizer_mode not in TokenizerRegistry.REGISTRY
TokenizerRegistry.REGISTRY[tokenizer_mode] = tokenizer_cls

return tokenizer_cls

return wrapper

# OOT tokenizers
if tokenizer_mode in TokenizerRegistry.REGISTRY:
def register(self, tokenizer_mode: str, module: str, class_name: str) -> None:
if tokenizer_mode in self.tokenizers:
logger.warning(
"%s.%s is already registered for tokenizer_mode=%r. "
"It is overwritten by the new one.",
@@ -72,36 +51,42 @@ class TokenizerRegistry:
tokenizer_mode,
)

TokenizerRegistry.REGISTRY[tokenizer_mode] = (module, class_name)
self.tokenizers[tokenizer_mode] = (module, class_name)

return None

@staticmethod
def get_tokenizer(tokenizer_mode: str, *args, **kwargs) -> "TokenizerLike":
if tokenizer_mode not in TokenizerRegistry.REGISTRY:
def load_tokenizer_cls(self, tokenizer_mode: str) -> type[TokenizerLike]:
if tokenizer_mode not in self.tokenizers:
raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")

item = TokenizerRegistry.REGISTRY[tokenizer_mode]
if isinstance(item, type):
return item.from_pretrained(*args, **kwargs)

module, class_name = item
module, class_name = self.tokenizers[tokenizer_mode]
logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")

class_ = resolve_obj_by_qualname(f"{module}.{class_name}")
return class_.from_pretrained(*args, **kwargs)
return resolve_obj_by_qualname(f"{module}.{class_name}")

def load_tokenizer(self, tokenizer_mode: str, *args, **kwargs) -> TokenizerLike:
tokenizer_cls = self.load_tokenizer_cls(tokenizer_mode)
return tokenizer_cls.from_pretrained(*args, **kwargs)

def get_tokenizer(

TokenizerRegistry = _TokenizerRegistry(
{
mode: (f"vllm.tokenizers.{mod_relname}", cls_name)
for mode, (mod_relname, cls_name) in _VLLM_TOKENIZERS.items()
}
)


def resolve_tokenizer_args(
tokenizer_name: str | Path,
*args,
runner_type: "RunnerType" = "generate",
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> TokenizerLike:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
):
revision: str | None = kwargs.get("revision")
download_dir: str | None = kwargs.get("download_dir")

if envs.VLLM_USE_MODELSCOPE:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
@@ -125,16 +110,6 @@ def get_tokenizer(
)
tokenizer_name = tokenizer_path

if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")

tokenizer_mode = "hf"
kwargs["use_fast"] = False

if "truncation_side" not in kwargs:
kwargs["truncation_side"] = "left"

# Separate model folder from file path for GGUF models
if is_gguf(tokenizer_name):
if check_gguf_file(tokenizer_name):
@@ -150,6 +125,21 @@ def get_tokenizer(
)
kwargs["gguf_file"] = gguf_file

if "truncation_side" not in kwargs:
if runner_type == "generate" or runner_type == "draft":
kwargs["truncation_side"] = "left"
elif runner_type == "pooling":
kwargs["truncation_side"] = "right"
else:
assert_never(runner_type)

if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")

tokenizer_mode = "hf"
kwargs["use_fast"] = False

# Try to use official Mistral tokenizer if possible
if tokenizer_mode == "auto" and importlib.util.find_spec("mistral_common"):
allow_patterns = ["tekken.json", "tokenizer.model.v*"]
@@ -165,49 +155,70 @@ def get_tokenizer(
if tokenizer_mode == "auto":
tokenizer_mode = "hf"

tokenizer_args = (tokenizer_name, *args)
tokenizer_kwargs = dict(
return tokenizer_mode, tokenizer_name, args, kwargs


cached_resolve_tokenizer_args = lru_cache(resolve_tokenizer_args)


def tokenizer_args_from_config(config: "ModelConfig", **kwargs):
return cached_resolve_tokenizer_args(
config.tokenizer,
runner_type=config.runner_type,
tokenizer_mode=config.tokenizer_mode,
revision=config.tokenizer_revision,
trust_remote_code=config.trust_remote_code,
**kwargs,
)


_T = TypeVar("_T", bound=TokenizerLike, default=TokenizerLike)


def get_tokenizer(
tokenizer_name: str | Path,
*args,
tokenizer_cls: type[_T] = TokenizerLike, # type: ignore[assignment]
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> _T:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
tokenizer_mode, tokenizer_name, args, kwargs = cached_resolve_tokenizer_args(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
download_dir=download_dir,
**kwargs,
)

if tokenizer_mode == "custom":
logger.warning_once(
"TokenizerRegistry now uses `tokenizer_mode` as the registry key "
"instead of `tokenizer_name`. "
"Please update the definition of `.from_pretrained` in "
"your custom tokenizer to accept `args=%s`, `kwargs=%s`. "
"Then, you can pass `tokenizer_mode=%r` instead of "
"`tokenizer_mode='custom'` when initializing vLLM.",
tokenizer_args,
str(tokenizer_kwargs),
tokenizer_name,
)

tokenizer_mode = str(tokenizer_name)
if tokenizer_cls == TokenizerLike:
tokenizer_cls_ = TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
else:
tokenizer_cls_ = tokenizer_cls

tokenizer = TokenizerRegistry.get_tokenizer(
tokenizer_mode,
*tokenizer_args,
**tokenizer_kwargs,
)
tokenizer = tokenizer_cls_.from_pretrained(tokenizer_name, *args, **kwargs)
if not tokenizer.is_fast:
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)

return tokenizer
return tokenizer # type: ignore


cached_get_tokenizer = lru_cache(get_tokenizer)


def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
if model_config.skip_tokenizer_init:
return None

return cached_get_tokenizer(
model_config.tokenizer,
runner_type=model_config.runner_type,
tokenizer_mode=model_config.tokenizer_mode,
revision=model_config.tokenizer_revision,
trust_remote_code=model_config.trust_remote_code,
@@ -215,19 +226,8 @@ def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
)


@deprecated(
"Renamed to `cached_tokenizer_from_config`. The old name will be removed in v0.14."
)
def init_tokenizer_from_config(model_config: "ModelConfig"):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
elif runner_type == "pooling":
truncation_side = "right"
else:
assert_never(runner_type)

return get_tokenizer(
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision,
truncation_side=truncation_side,
)
return cached_tokenizer_from_config(model_config)

+ 3
- 3
vllm/transformers_utils/tokenizer.py View File

@@ -60,17 +60,17 @@ def __getattr__(name: str):

return cached_tokenizer_from_config
if name == "init_tokenizer_from_configs":
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config

warnings.warn(
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
"has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)

return init_tokenizer_from_config
return cached_tokenizer_from_config

raise AttributeError(f"module {__name__!r} has no attribute {name!r}")



+ 2
- 2
vllm/v1/engine/async_llm.py View File

@@ -26,7 +26,7 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tracing import init_tracer
from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
from vllm.usage.usage_lib import UsageContext
@@ -111,7 +111,7 @@ class AsyncLLM(EngineClient):
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.model_config)
tokenizer = cached_tokenizer_from_config(self.model_config)

self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(


+ 2
- 2
vllm/v1/engine/llm_engine.py View File

@@ -23,7 +23,7 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tracing import init_tracer
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest
@@ -86,7 +86,7 @@ class LLMEngine:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.model_config)
tokenizer = cached_tokenizer_from_config(self.model_config)

self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(


+ 2
- 4
vllm/v1/executor/multiproc_executor.py View File

@@ -124,9 +124,7 @@ class MultiprocExecutor(Executor):
# Set multiprocessing envs
set_multiprocessing_worker_envs()

# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
# get_loopback_ip() for communication.
# use the loopback address get_loopback_ip() for communication.
distributed_init_method = get_distributed_init_method(
get_loopback_ip(), get_open_port()
)
@@ -708,7 +706,7 @@ class WorkerProc:
death_pipe.recv()
except EOFError:
# Parent process has exited, terminate this worker
logger.info("Parent process exited, terminating worker")
logger.info_once("Parent process exited, terminating worker")
# Send signal to self to trigger clean shutdown
shutdown_event.set()
except Exception as e:


+ 2
- 2
vllm/v1/structured_output/__init__.py View File

@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
from vllm.v1.structured_output.backend_types import (
@@ -71,7 +71,7 @@ class StructuredOutputManager:
# of CPUs.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.tokenizer = init_tokenizer_from_config(
self.tokenizer = cached_tokenizer_from_config(
model_config=self.vllm_config.model_config
)
reasoning_parser = (


Loading…
Cancel
Save
Baidu
map