28 Commits

Author SHA1 Message Date
  Robert Shaw 968934e77d
Merge branch 'main' into wentao-small-refactor 2 days ago
  Laith Sakka 763963aa73
set assume_32bit_indexing and pass unbacked hints (#30459) 2 days ago
  Cyrus Leung 39cefbdf17
[Refactor] `TokenizerRegistry` only uses lazy imports (#30609) 2 days ago
  Chen Zhang ace34e3783
[Bugfix] Qwen3-next with --hf-overrides {"num_hidden_layers":8} (#30433) 2 days ago
  Isotr0py e5db3e2774
[CI/Build] Fix broken mm processor test Mistral-3-large (#30597) 2 days ago
  Cyrus Leung 64251f48df
[Chore] Adjust tokenizer import to avoid circular imports (#30601) 2 days ago
  Nick Hill 1cec5b7ea9
[Scheduer] Simplify stop checking for pooling models (#30591) 2 days ago
  Cyrus Leung b09806e28f
[Bugfix] Dictionary MM embeddings for online chat (#30507) 3 days ago
  Tsukasa OI fdc135d768
[Misc][Quantization] Clarify the intent of GGUF `FusedMoE` weight materialization (#30310) 3 days ago
  Roberto L. Castro 4fa7ce46f3
[Feature] Add SM103 (Blackwell Ultra) Support to vLLM (#30484) 3 days ago
  Nicolò Lucchesi 57e9bf1864
[CI] Whisper logprobs tests (#30504) 3 days ago
  Michael Goin 2f32a68d75
[CI] Update several models in registry that are available online now (#30514) 3 days ago
  Matthew Bonanni f5dfbbd8e9
[Docs] Remove references to `VLLM_ATTENTION_BACKEND` (#30564) 3 days ago
  Michael Goin fc0119425c
Add IBM and Red Hat to compute resources sponsors (#30581) 3 days ago
  Matthew Bonanni 86a3261525
[Bugfix] Pass FA version in `MultiHeadAttention` (#30575) 3 days ago
  rasmith 08f8a5627e
[CI/Build][Kernel][BugFix][AMD] Fix per_token_group_quant_fp8 to use correct fp8 min/max values and update atol/rtol in test_quantfp8_group_functionality (#30292) 3 days ago
  Kevin H. Luu b4039c08b5
[ci] Mark PrimeRL integration test as soft fail (#30578) 3 days ago
  Wentao Ye 1e6b115300
[Refactor] Reduce duplicate code in `per_token_group_quant` cuda kernels (#30496) 3 days ago
  danielafrimi 13618626df
[MoE-FP8-modelopt] Add FlashInfer alignment padding for intermediate dimensions (#29748) 3 days ago
  danielafrimi 6ec0d8dbe4
[Fix]Load kv-cache dtype from hf_quant_config.json automatically (#29980) 3 days ago
  Li, Jiang 9693dd0fe3
[CI/Build] Add x86 CPU wheel release pipeline (#28848) 3 days ago
  Xin Yang 1f19d8f899
[Perf] Set split_k to 1 for triton_kernels (#30528) 3 days ago
  shivampr cd7740ac5c
[ROCm] Enable Triton ScaledMM fallback + kernel selection fix (#26668) 3 days ago
  Wentao Ye 02a5880394
[CI] Fix mypy for vllm/v1/executor (#30517) 3 days ago
  realliujiaxu d2c919dcc2
[bugfix] fix bug when top_logprobs=0 with spec decoding (#30059) 3 days ago
  Benjamin Bartels f3237f3f6b
[Frontend] Fixes anthropic streaming message_start usage nesting (#30266) 3 days ago
  jvlunteren 9c0ee995a8
[Kernel] Support CUDA Graphs in 3D Triton Attention Kernel (#28306) 3 days ago
  Michael Goin 09ad3b76b3
[Bug] Fix attention_backend arg string parsing (#30534) 3 days ago
98 changed files with 1272 additions and 674 deletions
Split View
  1. +14
    -0
      .buildkite/release-pipeline.yaml
  2. +5
    -2
      .buildkite/test-pipeline.yaml
  3. +2
    -0
      README.md
  4. +1
    -1
      benchmarks/backend_request_func.py
  5. +83
    -98
      csrc/quantization/w8a8/fp8/per_token_group_quant.cu
  6. +2
    -0
      docs/community/sponsors.md
  7. +16
    -6
      docs/getting_started/quickstart.md
  8. +1
    -1
      tests/compile/distributed/test_fusions_e2e.py
  9. +7
    -1
      tests/conftest.py
  10. +6
    -3
      tests/entrypoints/openai/test_messages.py
  11. +1
    -1
      tests/entrypoints/openai/test_serving_engine.py
  12. +107
    -6
      tests/entrypoints/test_chat_utils.py
  13. +2
    -2
      tests/kernels/attention/test_cutlass_mla_decode.py
  14. +2
    -2
      tests/kernels/attention/test_flashinfer_trtllm_attention.py
  15. +27
    -0
      tests/kernels/attention/test_triton_unified_attention.py
  16. +2
    -2
      tests/kernels/moe/test_ocp_mx_moe.py
  17. +2
    -2
      tests/kernels/quantization/test_fp8_quant_group.py
  18. +91
    -0
      tests/kernels/quantization/test_scaled_mm_kernel_selection.py
  19. +1
    -1
      tests/models/language/generation/test_mistral.py
  20. +1
    -1
      tests/models/multimodal/generation/test_voxtral.py
  21. +123
    -111
      tests/models/multimodal/generation/test_whisper.py
  22. +2
    -5
      tests/models/multimodal/processing/test_common.py
  23. +8
    -0
      tests/models/multimodal/processing/test_tensor_schema.py
  24. +8
    -7
      tests/models/registry.py
  25. +2
    -2
      tests/quantization/test_blackwell_moe.py
  26. +1
    -1
      tests/reasoning/test_mistral_reasoning_parser.py
  27. +1
    -1
      tests/reasoning/utils.py
  28. +2
    -2
      tests/test_inputs.py
  29. +24
    -23
      tests/tokenizers_/test_basic.py
  30. +1
    -1
      tests/tokenizers_/test_detokenize.py
  31. +21
    -2
      tests/tokenizers_/test_registry.py
  32. +2
    -5
      tests/tool_use/test_mistral_tool_parser.py
  33. +3
    -1
      tests/v1/sample/test_logprobs.py
  34. +1
    -1
      tests/v1/sample/test_rejection_sampler.py
  35. +1
    -1
      tools/pre_commit/mypy.py
  36. +10
    -0
      vllm/attention/layer.py
  37. +30
    -39
      vllm/attention/ops/triton_unified_attention.py
  38. +22
    -3
      vllm/compilation/decorators.py
  39. +7
    -1
      vllm/engine/arg_utils.py
  40. +6
    -6
      vllm/entrypoints/anthropic/serving_messages.py
  41. +72
    -31
      vllm/entrypoints/chat_utils.py
  42. +2
    -1
      vllm/entrypoints/llm.py
  43. +3
    -1
      vllm/entrypoints/openai/serving_engine.py
  44. +2
    -1
      vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
  45. +2
    -1
      vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
  46. +4
    -1
      vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
  47. +2
    -1
      vllm/entrypoints/pooling/score/serving.py
  48. +1
    -1
      vllm/entrypoints/utils.py
  49. +1
    -1
      vllm/model_executor/layers/batch_invariant.py
  50. +4
    -1
      vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
  51. +6
    -2
      vllm/model_executor/layers/fused_moe/layer.py
  52. +3
    -3
      vllm/model_executor/layers/quantization/fp8.py
  53. +4
    -1
      vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
  54. +12
    -28
      vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
  55. +15
    -7
      vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
  56. +6
    -5
      vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
  57. +12
    -5
      vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
  58. +46
    -17
      vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
  59. +6
    -5
      vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
  60. +48
    -0
      vllm/model_executor/layers/quantization/modelopt.py
  61. +4
    -4
      vllm/model_executor/layers/quantization/mxfp4.py
  62. +1
    -1
      vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
  63. +1
    -1
      vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  64. +6
    -3
      vllm/model_executor/layers/quantization/utils/fp8_utils.py
  65. +12
    -6
      vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
  66. +1
    -1
      vllm/model_executor/models/config.py
  67. +2
    -1
      vllm/model_executor/models/pixtral.py
  68. +7
    -0
      vllm/model_executor/models/qwen3_next.py
  69. +2
    -1
      vllm/model_executor/models/voxtral.py
  70. +1
    -1
      vllm/platforms/cuda.py
  71. +15
    -0
      vllm/platforms/interface.py
  72. +1
    -1
      vllm/reasoning/mistral_reasoning_parser.py
  73. +0
    -6
      vllm/tokenizers/__init__.py
  74. +33
    -14
      vllm/tokenizers/deepseekv32.py
  75. +7
    -12
      vllm/tokenizers/hf.py
  76. +2
    -5
      vllm/tokenizers/mistral.py
  77. +1
    -1
      vllm/tokenizers/protocol.py
  78. +100
    -100
      vllm/tokenizers/registry.py
  79. +3
    -3
      vllm/transformers_utils/tokenizer.py
  80. +2
    -2
      vllm/utils/deep_gemm.py
  81. +3
    -1
      vllm/utils/flashinfer.py
  82. +23
    -2
      vllm/utils/torch_utils.py
  83. +1
    -1
      vllm/v1/attention/backends/flashinfer.py
  84. +3
    -3
      vllm/v1/attention/backends/mla/common.py
  85. +2
    -2
      vllm/v1/attention/backends/mla/flashmla_sparse.py
  86. +83
    -1
      vllm/v1/attention/backends/triton_attn.py
  87. +5
    -6
      vllm/v1/core/sched/scheduler.py
  88. +2
    -10
      vllm/v1/core/sched/utils.py
  89. +2
    -2
      vllm/v1/engine/async_llm.py
  90. +22
    -11
      vllm/v1/engine/input_processor.py
  91. +2
    -2
      vllm/v1/engine/llm_engine.py
  92. +1
    -1
      vllm/v1/executor/abstract.py
  93. +7
    -3
      vllm/v1/executor/multiproc_executor.py
  94. +3
    -3
      vllm/v1/executor/ray_executor.py
  95. +8
    -5
      vllm/v1/executor/uniproc_executor.py
  96. +1
    -1
      vllm/v1/sample/rejection_sampler.py
  97. +2
    -2
      vllm/v1/structured_output/__init__.py
  98. +2
    -1
      vllm/v1/structured_output/backend_xgrammar.py

+ 14
- 0
.buildkite/release-pipeline.yaml View File

@@ -71,6 +71,20 @@ steps:
env:
DOCKER_BUILDKIT: "1"

# x86 CPU wheel build
- label: "Build x86 CPU wheel"
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"

# Build release images (12.9)
- label: "Build release image (x86)"
depends_on: ~


+ 5
- 2
.buildkite/test-pipeline.yaml View File

@@ -692,6 +692,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
- tests/models/registry.py
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -704,6 +705,7 @@ steps:
- vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py
- tests/models/registry.py
commands:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
@@ -836,7 +838,7 @@ steps:
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Processor Test
@@ -1346,6 +1348,7 @@ steps:
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
soft_fail: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
@@ -1379,4 +1382,4 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

+ 2
- 0
README.md View File

@@ -143,11 +143,13 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
- IBM
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
- Red Hat
- Replicate
- Roblox
- RunPod


+ 1
- 1
benchmarks/backend_request_func.py View File

@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"


+ 83
- 98
csrc/quantization/w8a8/fp8/per_token_group_quant.cu View File

@@ -22,6 +22,62 @@ __device__ __forceinline__ float GroupReduceMax(float val) {
return val;
}

template <typename T, bool SCALE_UE8M0>
__device__ __forceinline__ float ComputeGroupScale(
const T* __restrict__ group_input, T* __restrict__ smem_group,
const int group_size, const int lane_id, const int threads_per_group,
const float eps, const float max_8bit) {
float local_absmax = eps;

constexpr int vec_size = 16 / sizeof(T);

// copy global -> shared & compute absmax
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
float abs_v = fabsf(static_cast<float>(src));
local_absmax = fmaxf(local_absmax, abs_v);
dst = src;
};

vllm::vectorize_with_alignment<vec_size>(
group_input, // in
smem_group, // out (shared)
group_size, // elements per group
lane_id, // thread id
threads_per_group, // stride in group
scalar_op_cache); // scalar handler

local_absmax = GroupReduceMax(local_absmax);

float y_s = local_absmax / max_8bit;
if constexpr (SCALE_UE8M0) {
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
}

return y_s;
}

template <typename T, typename DST_DTYPE>
__device__ __forceinline__ void QuantizeGroup(
const T* __restrict__ smem_group, DST_DTYPE* __restrict__ group_output,
const int group_size, const int lane_id, const int threads_per_group,
const float y_s, const float min_8bit, const float max_8bit) {
constexpr int vec_size = 16 / sizeof(T);

// quantize shared -> global 8-bit
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
dst = DST_DTYPE(q);
};

vllm::vectorize_with_alignment<vec_size>(
smem_group, // in (shared)
group_output, // out (global quant tensor)
group_size, // elements
lane_id, // tid
threads_per_group, // stride
scalar_op_quant); // scalar handler
}

template <typename T, typename DST_DTYPE, bool IS_COLUMN_MAJOR = false,
bool SCALE_UE8M0 = false, typename scale_packed_t = float>
__global__ void per_token_group_quant_8bit_kernel(
@@ -38,8 +94,6 @@ __global__ void per_token_group_quant_8bit_kernel(
const int64_t global_group_id = block_group_id + local_group_id;
const int64_t block_group_offset = global_group_id * group_size;

float local_absmax = eps;

using scale_element_t = float;
static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0);

@@ -68,30 +122,9 @@ __global__ void per_token_group_quant_8bit_kernel(
T* smem = reinterpret_cast<T*>(smem_raw);
T* smem_group = smem + local_group_id * group_size;

constexpr int vec_size = 16 / sizeof(T);
using vec_t = vllm::vec_n_t<T, vec_size>;

// copy global -> shared & compute absmax
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
float abs_v = fabsf(static_cast<float>(src));
local_absmax = fmaxf(local_absmax, abs_v);
dst = src;
};

vllm::vectorize_with_alignment<vec_size>(
group_input, // in
smem_group, // out (shared)
group_size, // elements per group
lane_id, // thread id
threads_per_group, // stride in group
scalar_op_cache); // scalar handler

local_absmax = GroupReduceMax(local_absmax);

float y_s = local_absmax / max_8bit;
if constexpr (SCALE_UE8M0) {
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
}
const float y_s = ComputeGroupScale<T, SCALE_UE8M0>(
group_input, smem_group, group_size, lane_id, threads_per_group, eps,
max_8bit);

scale_element_t y_s_quant = y_s;

@@ -101,19 +134,24 @@ __global__ void per_token_group_quant_8bit_kernel(

__syncthreads();

// quantize shared -> global 8-bit
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
dst = DST_DTYPE(q);
};
QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
threads_per_group, y_s, min_8bit, max_8bit);
}

vllm::vectorize_with_alignment<vec_size>(
smem_group, // in (shared)
group_output, // out (global quant tensor)
group_size, // elements
lane_id, // tid
threads_per_group, // stride
scalar_op_quant); // scalar handler
inline int GetGroupsPerBlock(int64_t num_groups) {
if (num_groups % 16 == 0) {
return 16;
}
if (num_groups % 8 == 0) {
return 8;
}
if (num_groups % 4 == 0) {
return 4;
}
if (num_groups % 2 == 0) {
return 2;
}
return 1;
}

void per_token_group_quant_8bit(const torch::Tensor& input,
@@ -133,17 +171,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,

constexpr int THREADS_PER_GROUP = 16;

int groups_per_block = 1;

if (num_groups % 16 == 0) {
groups_per_block = 16;
} else if (num_groups % 8 == 0) {
groups_per_block = 8;
} else if (num_groups % 4 == 0) {
groups_per_block = 4;
} else if (num_groups % 2 == 0) {
groups_per_block = 2;
}
const int groups_per_block = GetGroupsPerBlock(num_groups);

auto dst_type = output_q.scalar_type();
const int num_blocks = num_groups / groups_per_block;
@@ -225,8 +253,6 @@ __global__ void per_token_group_quant_8bit_packed_kernel(

const int64_t block_group_offset = global_group_id * group_size;

float local_absmax = eps;

const T* group_input = input + block_group_offset;
DST_DTYPE* group_output =
static_cast<DST_DTYPE*>(output_q) + block_group_offset;
@@ -235,29 +261,9 @@ __global__ void per_token_group_quant_8bit_packed_kernel(
extern __shared__ __align__(16) char smem_raw[];
T* smem = reinterpret_cast<T*>(smem_raw);
T* smem_group = smem + local_group_id * group_size;

constexpr int vec_size = 16 / sizeof(T);
using vec_t = vllm::vec_n_t<T, vec_size>;

// copy global -> shared & compute absmax
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
float abs_v = fabsf(static_cast<float>(src));
local_absmax = fmaxf(local_absmax, abs_v);
dst = src;
};

vllm::vectorize_with_alignment<vec_size>(
group_input, // in
smem_group, // out (shared)
group_size, // elements per group
lane_id, // thread id
threads_per_group, // stride in group
scalar_op_cache); // scalar handler

local_absmax = GroupReduceMax(local_absmax);

float y_s = local_absmax / max_8bit;
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
const float y_s =
ComputeGroupScale<T, true>(group_input, smem_group, group_size, lane_id,
threads_per_group, eps, max_8bit);

// pack 4 scales into a uint32
if (lane_id == 0) {
@@ -284,19 +290,8 @@ __global__ void per_token_group_quant_8bit_packed_kernel(

__syncthreads();

// quantize shared -> global 8-bit
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
dst = DST_DTYPE(q);
};

vllm::vectorize_with_alignment<vec_size>(
smem_group, // in (shared)
group_output, // out (global quant tensor)
group_size, // elements
lane_id, // tid
threads_per_group, // stride
scalar_op_quant); // scalar handler
QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
threads_per_group, y_s, min_8bit, max_8bit);
}

void per_token_group_quant_8bit_packed(const torch::Tensor& input,
@@ -337,17 +332,7 @@ void per_token_group_quant_8bit_packed(const torch::Tensor& input,

constexpr int THREADS_PER_GROUP = 16;

int groups_per_block = 1;

if (num_groups % 16 == 0) {
groups_per_block = 16;
} else if (num_groups % 8 == 0) {
groups_per_block = 8;
} else if (num_groups % 4 == 0) {
groups_per_block = 4;
} else if (num_groups % 2 == 0) {
groups_per_block = 2;
}
const int groups_per_block = GetGroupsPerBlock(num_groups);

auto dst_type = output_q.scalar_type();
const int num_blocks = num_groups / groups_per_block;


+ 2
- 0
docs/community/sponsors.md View File

@@ -24,11 +24,13 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
- IBM
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
- Red Hat
- Replicate
- Roblox
- RunPod


+ 16
- 6
docs/getting_started/quickstart.md View File

@@ -281,17 +281,27 @@ Alternatively, you can use the `openai` Python package:

Currently, vLLM supports multiple backends for efficient Attention computation across different platforms and accelerator architectures. It automatically selects the most performant backend compatible with your system and model specifications.

If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
If desired, you can also manually set the backend of your choice using the `--attention-backend` CLI argument:

```bash
# For online serving
vllm serve Qwen/Qwen2.5-1.5B-Instruct --attention-backend FLASH_ATTN

# For offline inference
python script.py --attention-backend FLASHINFER
```

Some of the available backend options include:

- On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`.
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.

For AMD ROCm, you can further control the specific Attention implementation using the following variables:
For AMD ROCm, you can further control the specific Attention implementation using the following options:

- Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
- AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
- Triton Prefill-Decode Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0`
- AITER Multi-head Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=1`
- Triton Unified Attention: Set the environment variables `VLLM_ROCM_USE_AITER=0 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument.
- AITER Unified Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument.
- Triton Prefill-Decode Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=true` as a CLI argument.
- AITER Multi-head Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_MHA=1` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument.

!!! warning
There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [docker/Dockerfile](../../docker/Dockerfile) for instructions on how to install it.

+ 1
- 1
tests/compile/distributed/test_fusions_e2e.py View File

@@ -20,7 +20,7 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer

from ...utils import flat_product, multi_gpu_test

is_blackwell = lambda: current_platform.is_device_capability(100)
is_blackwell = lambda: current_platform.is_device_capability_family(100)
"""Are we running on Blackwell, a lot of tests depend on it"""




+ 7
- 1
tests/conftest.py View File

@@ -702,10 +702,16 @@ class HfRunner:
**kwargs,
)

# Encoder-decoder models return decoder_hidden_states instead of
# hidden_states
hidden_states = (
getattr(output, "hidden_states", None) or output.decoder_hidden_states
)

(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs)
) = self._hidden_states_to_logprobs(hidden_states, num_logprobs)

all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]


+ 6
- 3
tests/entrypoints/openai/test_messages.py View File

@@ -79,9 +79,12 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):

assert chunk_count > 0
assert first_chunk is not None, "message_start chunk was never observed"
assert first_chunk.usage is not None, "first chunk should include usage stats"
assert first_chunk.usage["output_tokens"] == 0
assert first_chunk.usage["input_tokens"] > 5
assert first_chunk.message is not None, "first chunk should include message"
assert first_chunk.message.usage is not None, (
"first chunk should include usage stats"
)
assert first_chunk.message.usage.output_tokens == 0
assert first_chunk.message.usage.input_tokens > 5


@pytest.mark.asyncio


+ 1
- 1
tests/entrypoints/openai/test_serving_engine.py View File

@@ -10,7 +10,7 @@ import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer


@pytest.fixture()


+ 107
- 6
tests/entrypoints/test_chat_utils.py View File

@@ -29,7 +29,8 @@ from vllm.multimodal.utils import (
encode_image_base64,
encode_video_base64,
)
from vllm.tokenizers import MistralTokenizer, get_tokenizer
from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.serial_utils import tensor2base64

from ..models.registry import HF_EXAMPLE_MODELS
@@ -796,9 +797,13 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
"content": "<|image_1|>\nWhat's in this image?",
}
]

assert mm_data is not None
assert "image" in mm_data
assert mm_data["image"] is None
assert isinstance(mm_data["image"], list)
assert len(mm_data["image"]) == 1
assert mm_data["image"][0] is None

_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


@@ -825,10 +830,11 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
# Should have audio in mm_data as None (UUID provided)
assert mm_data is not None
assert "audio" in mm_data
assert mm_data["audio"] is None
assert isinstance(mm_data["audio"], list)
assert len(mm_data["audio"]) == 1
assert mm_data["audio"][0] is None

# UUID should be recorded
assert mm_uuids is not None
assert "audio" in mm_uuids
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[uuid])


@@ -1121,10 +1127,105 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
mm_data = await mm_future
assert mm_data is not None
assert "image" in mm_data
assert mm_data["image"] is None
assert isinstance(mm_data["image"], list)
assert len(mm_data["image"]) == 1
assert mm_data["image"][0] is None

_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


def test_parse_chat_messages_empty_dict_image_embeds(
phi3v_model_config_image_embeds,
):
"""Test that empty dictionary for image_embeds is handled without errors."""
conversation, mm_data, mm_uuids = parse_chat_messages(
[
{
"role": "user",
"content": [
{"type": "image_embeds", "image_embeds": {}},
{"type": "text", "text": "What's in this image?"},
],
}
],
phi3v_model_config_image_embeds,
content_format="string",
)

# Verify conversation structure
assert conversation == [
{
"role": "user",
"content": "<|image_1|>\nWhat's in this image?",
}
]

# Verify mm_data contains an empty dictionary of embeddings
assert mm_data is not None
assert "image" in mm_data
assert isinstance(mm_data["image"], dict)
assert len(mm_data["image"]) == 0

# Verify UUIDs (None since we didn't provide any)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_multiple_dict_image_embeds(
phi3v_model_config_image_embeds,
):
"""Test that multiple dictionaries for image_embeds is handled without errors."""
# Create two sample image embedding tensors
batch_size = 2
image_embedding_1 = torch.randn(batch_size, 256, 1024)
image_embedding_2 = torch.randn(batch_size, 3)

conversation, mm_data, mm_uuids = parse_chat_messages(
[
{
"role": "user",
"content": [
{
"type": "image_embeds",
"image_embeds": {
"image_embedding_1": tensor2base64(p),
"image_embedding_2": tensor2base64(i),
},
}
for p, i in zip(image_embedding_1, image_embedding_2)
]
+ [
{"type": "text", "text": "Describe these two images."},
],
}
],
phi3v_model_config_image_embeds,
content_format="string",
)

# Verify conversation structure
assert conversation == [
{
"role": "user",
"content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
}
]

# Verify mm_data contains a dictionary of multi-embeddings
assert mm_data is not None
assert "image" in mm_data
assert isinstance(mm_data["image"], dict)
assert len(mm_data["image"]) == batch_size

# Verify each embedding has the correct shape
assert isinstance(mm_data["image"]["image_embedding_1"], torch.Tensor)
assert mm_data["image"]["image_embedding_1"].shape == image_embedding_1.shape
assert isinstance(mm_data["image"]["image_embedding_2"], torch.Tensor)
assert mm_data["image"]["image_embedding_2"].shape == image_embedding_2.shape

# Verify UUIDs (None since we didn't provide any)
_assert_mm_uuids(mm_uuids, batch_size, expected_uuids=[None, None])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
phi3v_model_config,


+ 2
- 2
tests/kernels/attention/test_cutlass_mla_decode.py View File

@@ -32,8 +32,8 @@ def cal_diff(


CUTLASS_MLA_UNSUPPORTED_REASON = (
"Cutlass MLA Requires compute capability of 10 or above."
if not current_platform.is_device_capability(100)
"Cutlass MLA Requires compute capability of 100 or above."
if not current_platform.is_device_capability_family(100)
else "Cutlass MLA is supported"
)



+ 2
- 2
tests/kernels/attention/test_flashinfer_trtllm_attention.py View File

@@ -11,7 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up

if not current_platform.is_device_capability(100):
if not current_platform.is_device_capability_family(100):
pytest.skip(
"This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True
)
@@ -443,7 +443,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])

if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
rtol, atol = 1e-1, 2e-1
rtol, atol = 3e-1, 4e-1
elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
rtol, atol = 4e-2, 6e-2
elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:


+ 27
- 0
tests/kernels/attention/test_triton_unified_attention.py View File

@@ -7,6 +7,7 @@ import torch

from vllm.attention.ops.triton_unified_attention import unified_attention
from vllm.platforms import current_platform
from vllm.utils.math_utils import next_power_of_2

NUM_HEADS = [(4, 4), (8, 2)]
HEAD_SIZES = [128, 256]
@@ -22,6 +23,10 @@ QDTYPES = (
# one value small enough to test the schema op check
NUM_BLOCKS = [32768, 2048]

# 0: use 2D kernel for decode
# 8: use 3D kernel for decode
SEQ_THRESHOLD_3D_VALUES = [0, 8]


def ref_paged_attn(
query: torch.Tensor,
@@ -92,6 +97,7 @@ def ref_paged_attn(
@pytest.mark.parametrize("soft_cap", [None, 50.0])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("q_dtype", QDTYPES)
@pytest.mark.parametrize("seq_threshold_3D", SEQ_THRESHOLD_3D_VALUES)
@torch.inference_mode()
def test_triton_unified_attn(
seq_lens: list[tuple[int, int]],
@@ -103,6 +109,7 @@ def test_triton_unified_attn(
soft_cap: float | None,
num_blocks: int,
q_dtype: torch.dtype | None,
seq_threshold_3D: int,
) -> None:
torch.set_default_device("cuda")

@@ -152,6 +159,21 @@ def test_triton_unified_attn(
k_descale = torch.rand(scale_shape, dtype=torch.float32)
v_descale = torch.rand(scale_shape, dtype=torch.float32)

num_par_softmax_segments = 16
head_size_padded = next_power_of_2(head_size)
softmax_segm_output = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments, head_size_padded),
dtype=torch.float32,
)
softmax_segm_max = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments),
dtype=torch.float32,
)
softmax_segm_expsum = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments),
dtype=torch.float32,
)

unified_attention(
q=maybe_quantized_query,
k=maybe_quantized_key_cache,
@@ -169,6 +191,11 @@ def test_triton_unified_attn(
q_descale=q_descale,
k_descale=k_descale,
v_descale=v_descale,
seq_threshold_3D=seq_threshold_3D,
num_par_softmax_segments=num_par_softmax_segments,
softmax_segm_output=softmax_segm_output,
softmax_segm_max=softmax_segm_max,
softmax_segm_expsum=softmax_segm_expsum,
)

ref_output = ref_paged_attn(


+ 2
- 2
tests/kernels/moe/test_ocp_mx_moe.py View File

@@ -17,7 +17,7 @@ QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
) >= version.parse("0.8.99")

TRTLLM_GEN_MXFP4_AVAILABLE = (
current_platform.is_cuda() and current_platform.is_device_capability(100)
current_platform.is_cuda() and current_platform.is_device_capability_family(100)
)

HOPPER_MXFP4_BF16_AVAILABLE = (
@@ -799,7 +799,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
@pytest.mark.skipif(
not (
current_platform.is_cuda()
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
and has_flashinfer()
),
reason="NVIDIA GPU sm100 and flashinfer are required for this test",


+ 2
- 2
tests/kernels/quantization/test_fp8_quant_group.py View File

@@ -62,7 +62,7 @@ def test_quantfp8_group_functionality(
assert scales_col.stride(1) == batch_size

# Test column-major scales consistency
assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
torch.testing.assert_close(scales_col, scales_native, rtol=1e-9, atol=1e-8)

# 3. Test CUDA implementation (only for divisible dimensions)
if is_divisible:
@@ -71,7 +71,7 @@ def test_quantfp8_group_functionality(
assert scales_cuda.shape == (batch_size, expected_num_groups)

# Verify CUDA/native consistency
assert torch.allclose(scales_cuda, scales_native, rtol=1e-9, atol=1e-8)
torch.testing.assert_close(scales_cuda, scales_native, rtol=2e-7, atol=2e-8)

# Quantized values should mostly match
diff_count = (x_quant_cuda != x_quant_native).sum().item()


+ 91
- 0
tests/kernels/quantization/test_scaled_mm_kernel_selection.py View File

@@ -0,0 +1,91 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for ScaledMM kernel selection logic (CPU-only)

Run `pytest tests/kernels/quantization/test_scaled_mm_kernel_selection.py`.
"""

import inspect
from abc import ABC

import pytest

from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
ScaledMMLinearLayerConfig,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import (
AiterScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import (
CPUScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
ScaledMMLinearKernel,
)

pytestmark = pytest.mark.cpu_test


def test_is_supported_is_abstract():
"""Test that is_supported() is properly defined as abstract."""
assert issubclass(ScaledMMLinearKernel, ABC)
assert hasattr(ScaledMMLinearKernel, "is_supported")


def test_cpu_kernel_implements_is_supported():
"""Test that CPUScaledMMLinearKernel implements is_supported() method."""
assert hasattr(CPUScaledMMLinearKernel, "is_supported"), (
"CPUScaledMMLinearKernel missing is_supported() method"
)
# Verify it's a classmethod by checking if it can be called with the class
# and by checking the method type
assert inspect.ismethod(CPUScaledMMLinearKernel.is_supported) or inspect.isfunction(
CPUScaledMMLinearKernel.is_supported
), "CPUScaledMMLinearKernel.is_supported() should be a classmethod"
# Verify it can be called as a classmethod
result, reason = CPUScaledMMLinearKernel.is_supported()
assert isinstance(result, bool), "is_supported() should return a bool"
assert reason is None or isinstance(reason, str), "reason should be str or None"


def test_aiter_kernel_implements_is_supported():
"""Test that AiterScaledMMLinearKernel implements is_supported() method."""
assert hasattr(AiterScaledMMLinearKernel, "is_supported"), (
"AiterScaledMMLinearKernel missing is_supported() method"
)
# Verify it's a classmethod by checking if it can be called with the class
# and by checking the method type
assert inspect.ismethod(
AiterScaledMMLinearKernel.is_supported
) or inspect.isfunction(AiterScaledMMLinearKernel.is_supported), (
"AiterScaledMMLinearKernel.is_supported() should be a classmethod"
)
# Verify it can be called as a classmethod
# (will return False on CPU, which is expected)
result, reason = AiterScaledMMLinearKernel.is_supported()
assert isinstance(result, bool), "is_supported() should return a bool"
assert reason is None or isinstance(reason, str), "reason should be str or None"
# On CPU, it should return False with a reason about requiring ROCm
# This validates the method works correctly even on non-ROCm platforms


def test_cpu_kernel_accepts_all_configs():
"""Test that CPUScaledMMLinearKernel accepts all config combinations."""
configs = [
ScaledMMLinearLayerConfig(
is_channelwise=False,
is_static_input_scheme=True,
input_symmetric=True,
),
ScaledMMLinearLayerConfig(
is_channelwise=True,
is_static_input_scheme=False,
input_symmetric=False,
),
]

for config in configs:
can_impl, reason = CPUScaledMMLinearKernel.can_implement(config)
assert can_impl, (
f"CPUScaledMMLinearKernel should accept config {config}: {reason}"
)

+ 1
- 1
tests/models/language/generation/test_mistral.py View File

@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

from ...utils import check_logprobs_close



+ 1
- 1
tests/models/multimodal/generation/test_voxtral.py View File

@@ -9,7 +9,7 @@ from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage

from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer


+ 123
- 111
tests/models/multimodal/generation/test_whisper.py View File

@@ -1,150 +1,146 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Any

import librosa
import pytest
from transformers import AutoModelForSpeechSeq2Seq

from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.platforms import current_platform

from ....conftest import VllmRunner
from ....conftest import HfRunner, PromptAudioInput, VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close

VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
HF_PROMPT = ""
# Whisper expects 16kHz audio
WHISPER_SAMPLE_RATE = 16000

PROMPTS = [
{
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
},
{ # Test explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "",
"multi_modal_data": {
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
},
]

EXPECTED = {
"openai/whisper-tiny": [
" He has birth words I spoke in the original corner of that. And a"
" little piece of black coat poetry. Mary had a little sandwich,"
" sweet, with white and snow. And everyone had it very went the last"
" would sure to go.",
" >> And the old one, fit John the way to Edgar Martinez. >> One more"
" to line down the field line for our base camp. Here comes joy. Here"
" is June and the third base. They're going to wave him in. The throw"
" to the plate will be late. The Mariners are going to play for the"
" American League Championship. I don't believe it. It just continues"
" by all five.",
],
"openai/whisper-small": [
" The first words I spoke in the original pornograph. A little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite a"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the old one pitch on the way to Edgar Martinez one month. Here"
" comes joy. Here is Junior to third base. They're gonna wave him"
" in. The throw to the plate will be late. The Mariners are going to"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my.",
],
"openai/whisper-medium": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite as"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez swung on the line"
" down the left field line for Obeyshev. Here comes Joy. Here is"
" Jorgen at third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh"
" my.",
],
"openai/whisper-large-v3": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its feet were quite as"
" slow, and everywhere that Mary went, the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
" Now the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my.",
],
"openai/whisper-large-v3-turbo": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its streets were quite"
" as slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
" down the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my.",
],
}

@pytest.fixture(autouse=True)
def use_spawn_for_whisper(monkeypatch):
"""Whisper has issues with forked workers, use spawn instead."""
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")


def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
model: str,
*,
max_model_len: int,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
dtype: str = "half",
enforce_eager: bool = True,
) -> None:
prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10
"""Inference result should be the same between hf and vllm.

All the audio fixtures for the test are from AudioAsset.
For huggingface runner, we provide the audio as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
"""
with vllm_runner(
model,
dtype=dtype,
max_model_len=448,
max_model_len=max_model_len,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
# TODO (NickLucche) figure out output differences with non-eager and re-enable
enforce_eager=True,
limit_mm_per_prompt={"audio": 2},
enforce_eager=enforce_eager,
disable_custom_all_reduce=True,
) as vllm_model:
llm = vllm_model.llm

sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
max_tokens=200,
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(
vllm_prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
)
for vllm_prompts, _, audios in inputs
]

with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(
hf_prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
)
for _, hf_prompts, audios in inputs
]

for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)

outputs = llm.generate(prompt_list, sampling_params)

for output, expected in zip(outputs, expected_list):
print(output.outputs[0].text)
assert output.outputs[0].text == expected
@pytest.fixture
def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
inputs = []
for asset in audio_assets:
audio, orig_sr = asset.audio_and_sample_rate
# Resample to Whisper's expected sample rate (16kHz)
if orig_sr != WHISPER_SAMPLE_RATE:
audio = librosa.resample(
audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
)
# vLLM prompts, HF prompts, audio inputs
inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
return inputs


@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
def test_models(vllm_runner, model, dtype) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=1,
dtype=dtype,
)
def check_model_available(model: str) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")


@pytest.mark.core_model
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
def test_models_cpu(vllm_runner, model, dtype) -> None:
# @create_new_process_for_each_test() does not work for some runners
# TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("enforce_eager", [True, False])
@create_new_process_for_each_test("spawn")
def test_models(
hf_runner,
vllm_runner,
model: str,
dtype: str,
num_logprobs: int,
input_audios,
enforce_eager: bool,
) -> None:
check_model_available(model)
if current_platform.is_cpu() and not enforce_eager:
pytest.skip("Skipping test for CPU with non-eager mode")
run_test(
hf_runner,
vllm_runner,
input_audios,
model,
tensor_parallel_size=1,
dtype=dtype,
max_model_len=448,
max_tokens=200,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
enforce_eager=enforce_eager,
)


@@ -152,15 +148,31 @@ def test_models_cpu(vllm_runner, model, dtype) -> None:
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@create_new_process_for_each_test()
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models_distributed(
hf_runner,
vllm_runner,
model,
distributed_executor_backend,
model: str,
distributed_executor_backend: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
input_audios,
) -> None:
check_model_available(model)
run_test(
hf_runner,
vllm_runner,
input_audios,
model,
dtype=dtype,
max_model_len=448,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=False,
)

+ 2
- 5
tests/models/multimodal/processing/test_common.py View File

@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
cached_tokenizer_from_config,
)
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tokenizers.mistral import MistralTokenizer

from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import (


+ 8
- 0
tests/models/multimodal/processing/test_tensor_schema.py View File

@@ -8,6 +8,7 @@ from typing import Any, TypeAlias

import numpy as np
import pytest
import torch
import torch.nn as nn
from PIL import Image

@@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype

from ....utils import create_new_process_for_each_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import dummy_hf_overrides
from .test_common import get_model_ids_to_test, get_text_token_prompts
@@ -136,6 +138,7 @@ def create_batched_mm_kwargs(
)


# TODO(Isotr0py): Don't initalize model during test
@contextmanager
def initialize_dummy_model(
model_cls: type[nn.Module],
@@ -150,16 +153,21 @@ def initialize_dummy_model(
backend="nccl",
)
initialize_model_parallel(tensor_model_parallel_size=1)

current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config)
with set_current_vllm_config(vllm_config=vllm_config):
with set_default_torch_dtype(model_config.dtype):
torch.set_default_device(current_platform.device_type)
model = model_cls(vllm_config=vllm_config)
torch.set_default_device(current_device)
yield model

del model
cleanup_dist_env_and_memory()


@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
def test_model_tensor_schema(model_id: str):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)


+ 8
- 7
tests/models/registry.py View File

@@ -356,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
"MistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4"
),
"MixtralForCausalLM": _HfExamplesInfo(
"mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -635,7 +635,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"HunYuanVLForConditionalGeneration": _HfExamplesInfo(
"tencent/HunyuanOCR",
is_available_online=False,
hf_overrides={"num_experts": 0},
),
"Idefics3ForConditionalGeneration": _HfExamplesInfo(
"HuggingFaceM4/Idefics3-8B-Llama3",
@@ -674,8 +674,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
"lightonai/LightOnOCR-1B",
is_available_online=False,
"lightonai/LightOnOCR-1B-1025"
),
"Llama4ForConditionalGeneration": _HfExamplesInfo(
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
@@ -779,8 +778,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"ministral-3": "mistralai/Ministral-3-3B-Instruct-2512",
},
tokenizer_mode="mistral",
# TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
is_available_online=False,
),
"QwenVLForConditionalGeneration": _HfExamplesInfo(
"Qwen/Qwen-VL",
@@ -843,7 +840,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online=False,
),
# [Encoder-decoder]
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),
"WhisperForConditionalGeneration": _HfExamplesInfo(
"openai/whisper-large-v3-turbo",
extras={"v3": "openai/whisper-large-v3"},
),
# [Cross-encoder]
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),
}
@@ -886,6 +886,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EagleMistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512",
speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle",
# TODO: revert once figuring out OOM in CI
is_available_online=False,
),
"LlamaForCausalLMEagle3": _HfExamplesInfo(


+ 2
- 2
tests/quantization/test_blackwell_moe.py View File

@@ -10,9 +10,9 @@ import pytest
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform

if not current_platform.is_device_capability(100):
if not current_platform.is_device_capability_family(100):
pytest.skip(
"This test only runs on Blackwell GPUs (SM100).", allow_module_level=True
"This test only runs on Blackwell GPUs (SM10x).", allow_module_level=True
)




+ 1
- 1
tests/reasoning/test_mistral_reasoning_parser.py View File

@@ -5,7 +5,7 @@ import pytest

from tests.reasoning.utils import run_reasoning_extraction_mistral
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

parser_name = "mistral"



+ 1
- 1
tests/reasoning/utils.py View File

@@ -4,7 +4,7 @@

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.reasoning import ReasoningParser
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer


class StreamingReasoningReconstructor:


+ 2
- 2
tests/test_inputs.py View File

@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config

pytestmark = pytest.mark.cpu_test

@@ -108,7 +108,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_config(model_config)
tokenizer = cached_tokenizer_from_config(model_config)
input_preprocessor = InputPreprocessor(model_config, tokenizer)

# HF processor adds sep token


+ 24
- 23
tests/tokenizers_/test_basic.py View File

@@ -3,38 +3,39 @@
from typing import _get_protocol_attrs # type: ignore

import pytest
from transformers import PreTrainedTokenizerBase
from transformers import (
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)

from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer


def _get_missing_attrs(obj: object, target: type):
return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)]


def _assert_tokenizer_like(tokenizer: object):
missing_attrs = _get_missing_attrs(tokenizer, TokenizerLike)
assert not missing_attrs, f"Missing attrs: {missing_attrs}"


def test_tokenizer_like_protocol():
assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer("gpt2", use_fast=False),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"

assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer("gpt2", use_fast=True),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"

assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"
tokenizer = get_tokenizer("gpt2", use_fast=False)
assert isinstance(tokenizer, PreTrainedTokenizer)
_assert_tokenizer_like(tokenizer)

tokenizer = get_tokenizer("gpt2", use_fast=True)
assert isinstance(tokenizer, PreTrainedTokenizerFast)
_assert_tokenizer_like(tokenizer)

tokenizer = get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
)
assert isinstance(tokenizer, MistralTokenizer)
_assert_tokenizer_like(tokenizer)


@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])


+ 1
- 1
tests/tokenizers_/test_detokenize.py View File

@@ -8,7 +8,7 @@ import pytest
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import (
FastIncrementalDetokenizer,


+ 21
- 2
tests/tokenizers_/test_registry.py View File

@@ -2,7 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer
import pytest

from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.registry import (
TokenizerRegistry,
get_tokenizer,
resolve_tokenizer_args,
)


class TestTokenizer(TokenizerLike):
@@ -40,10 +47,22 @@ class TestTokenizer(TokenizerLike):
return True


@pytest.mark.parametrize("runner_type", ["generate", "pooling"])
def test_resolve_tokenizer_args_idempotent(runner_type):
tokenizer_mode, tokenizer_name, args, kwargs = resolve_tokenizer_args(
"facebook/opt-125m",
runner_type=runner_type,
)

assert (tokenizer_mode, tokenizer_name, args, kwargs) == resolve_tokenizer_args(
tokenizer_name, *args, **kwargs
)


def test_customized_tokenizer():
TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)

tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
tokenizer = TokenizerRegistry.load_tokenizer("test_tokenizer", "abc")
assert isinstance(tokenizer, TestTokenizer)
assert tokenizer.path_or_repo_id == "abc"
assert tokenizer.bos_token_id == 0


+ 2
- 5
tests/tool_use/test_mistral_tool_parser.py View File

@@ -13,12 +13,9 @@ from partial_json_parser.core.options import Allow

from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolParser
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
get_tokenizer,
)
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
from vllm.tokenizers.mistral import MistralTokenizer


@pytest.fixture(scope="module")


+ 3
- 1
tests/v1/sample/test_logprobs.py View File

@@ -528,9 +528,11 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
),
],
)
@pytest.mark.parametrize("top_logprobs", [0, 3])
def test_spec_decode_logprobs(
logprobs_mode: LogprobsMode,
model_setup: tuple[str, str, str],
top_logprobs: int,
):
"""Spec decode logprobs should match those of the base model.

@@ -543,7 +545,7 @@ def test_spec_decode_logprobs(

prompt = "Hello world " * 50
sampling_params = SamplingParams(
temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False
)
method, model_name, spec_model_name = model_setup
max_model_len = 256


+ 1
- 1
tests/v1/sample/test_rejection_sampler.py View File

@@ -111,7 +111,7 @@ def create_sampling_metadata(
top_p=top_p,
top_k=top_k,
generators=generators,
max_num_logprobs=0,
max_num_logprobs=None,
no_penalties=no_penalties,
prompt_token_ids=prompt_token_ids,
frequency_penalties=frequency_penalties,


+ 1
- 1
tools/pre_commit/mypy.py View File

@@ -43,6 +43,7 @@ FILES = [
"vllm/worker",
"vllm/v1/core",
"vllm/v1/engine",
"vllm/v1/executor",
"vllm/v1/metrics",
"vllm/v1/pool",
"vllm/v1/sample",
@@ -60,7 +61,6 @@ SEPARATE_GROUPS = [
"vllm/model_executor",
# v1 related
"vllm/v1/attention",
"vllm/v1/executor",
"vllm/v1/kv_offload",
"vllm/v1/spec_decode",
"vllm/v1/structured_output",


+ 10
- 0
vllm/attention/layer.py View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer."""

import functools
from collections.abc import Callable
from typing import cast

@@ -17,6 +18,7 @@ from vllm.attention.backends.abstract import (
)
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import get_attn_backend
from vllm.attention.utils.fa_utils import get_flash_attn_version
from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer
from vllm.config import CacheConfig, get_current_vllm_config
@@ -524,6 +526,14 @@ class MultiHeadAttention(nn.Module):
AttentionBackendEnum.ROCM_AITER_FA,
}

self.fa_version = None
if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
self.fa_version = get_flash_attn_version()
assert self._flash_attn_varlen_func is not None
self._flash_attn_varlen_func = functools.partial(
self._flash_attn_varlen_func, fa_version=self.fa_version
)

logger.info_once(
f"Using {self.attn_backend} for MultiHeadAttention in multimodal encoder."
)


+ 30
- 39
vllm/attention/ops/triton_unified_attention.py View File

@@ -355,7 +355,7 @@ def kernel_unified_attention_2d(
@triton.jit
def kernel_unified_attention_3d(
segm_output_ptr,
# [num_tokens, num_query_heads, num_segments, head_size]
# [num_tokens, num_query_heads, num_segments, head_size_padded]
segm_max_ptr, # [num_tokens, num_query_heads, num_segments]
segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments]
query_ptr, # [num_tokens, num_query_heads, head_size]
@@ -749,6 +749,11 @@ def unified_attention(
q_descale,
k_descale,
v_descale,
seq_threshold_3D=None,
num_par_softmax_segments=None,
softmax_segm_output=None,
softmax_segm_max=None,
softmax_segm_expsum=None,
alibi_slopes=None,
output_scale=None,
qq_bias=None,
@@ -793,8 +798,19 @@ def unified_attention(
TILE_SIZE_PREFILL = 32
TILE_SIZE_DECODE = 16 if q.element_size() >= 2 else 32

# if batch contains a prefill
if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128:
# Launch the 2D kernel if
# 1. No intermediate tiled softmax buffers for the 3D kernel have been allocated, or
# 2. The batch includes at least one prefill request, or
# 3. The number of sequences exceeds the configured threshold
if (
seq_threshold_3D is None
or num_par_softmax_segments is None
or softmax_segm_output is None
or softmax_segm_max is None
or softmax_segm_expsum is None
or max_seqlen_q > 1
or num_seqs > seq_threshold_3D
):
kernel_unified_attention_2d[
(
total_num_q_blocks,
@@ -847,37 +863,12 @@ def unified_attention(
USE_FP8=output_scale is not None,
)
else:
# for initial version, NUM_SEGMENTS = 16 is chosen as a default
# value that showed good performance in tests
NUM_SEGMENTS = 16

segm_output = torch.empty(
q.shape[0],
num_query_heads,
NUM_SEGMENTS,
triton.next_power_of_2(head_size),
dtype=torch.float32,
device=q.device,
)
segm_max = torch.empty(
q.shape[0],
num_query_heads,
NUM_SEGMENTS,
dtype=torch.float32,
device=q.device,
)
segm_expsum = torch.empty(
q.shape[0],
num_query_heads,
NUM_SEGMENTS,
dtype=torch.float32,
device=q.device,
)

kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)](
segm_output_ptr=segm_output,
segm_max_ptr=segm_max,
segm_expsum_ptr=segm_expsum,
kernel_unified_attention_3d[
(total_num_q_blocks, num_kv_heads, num_par_softmax_segments)
](
segm_output_ptr=softmax_segm_output,
segm_max_ptr=softmax_segm_max,
segm_expsum_ptr=softmax_segm_expsum,
query_ptr=q,
key_cache_ptr=k,
value_cache_ptr=v,
@@ -917,13 +908,13 @@ def unified_attention(
BLOCK_Q=BLOCK_Q,
num_seqs=num_seqs,
BLOCK_M=BLOCK_M,
NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
NUM_SEGMENTS_PER_SEQ=num_par_softmax_segments,
)
reduce_segments[(q.shape[0], num_query_heads)](
output_ptr=out,
segm_output_ptr=segm_output,
segm_max_ptr=segm_max,
segm_expsum_ptr=segm_expsum,
segm_output_ptr=softmax_segm_output,
segm_max_ptr=softmax_segm_max,
segm_expsum_ptr=softmax_segm_expsum,
seq_lens_ptr=seqused_k,
num_seqs=num_seqs,
num_query_heads=num_query_heads,
@@ -936,6 +927,6 @@ def unified_attention(
HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
query_start_len_ptr=cu_seqlens_q,
BLOCK_Q=BLOCK_Q,
NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
NUM_SEGMENTS_PER_SEQ=num_par_softmax_segments,
USE_FP8=output_scale is not None,
)

+ 22
- 3
vllm/compilation/decorators.py View File

@@ -28,7 +28,7 @@ from vllm.config.compilation import DynamicShapesType
from vllm.logger import init_logger
from vllm.sequence import IntermediateTensors
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.torch_utils import supports_dynamo
from vllm.utils.torch_utils import is_torch_equal_or_newer, supports_dynamo

from .monitor import start_monitoring_torch_compile

@@ -316,7 +316,13 @@ def _support_torch_compile(
def _mark_dynamic_inputs(mod, type, *args, **kwargs):
def mark_dynamic(arg, dims):
if type == DynamicShapesType.UNBACKED:
torch._dynamo.decorators.mark_unbacked(arg, dims)
if is_torch_equal_or_newer("2.10.0.dev"):
for dim in dims:
torch._dynamo.decorators.mark_unbacked(
arg, dim, hint_override=arg.size()[dim]
)
else:
torch._dynamo.decorators.mark_unbacked(arg, dims)
else:
torch._dynamo.mark_dynamic(arg, dims)

@@ -350,7 +356,13 @@ def _support_torch_compile(
if isinstance(arg, torch.Tensor):
# In case dims is specified with negative indexing
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
torch._dynamo.decorators.mark_unbacked(arg, dims)
if is_torch_equal_or_newer("2.10.0.dev"):
for dim in dims:
torch._dynamo.decorators.mark_unbacked(
arg, dim, hint_override=arg.size()[dim]
)
else:
torch._dynamo.decorators.mark_unbacked(arg, dims)

def __call__(self, *args, **kwargs):
# torch.compiler.is_compiling() means we are inside the compilation
@@ -488,6 +500,12 @@ def _support_torch_compile(
if ds_type == DynamicShapesType.BACKED_SIZE_OBLIVIOUS:
fx_config_patches["backed_size_oblivious"] = True

# Prepare inductor config patches
# assume_32bit_indexing is only available in torch 2.10.0.dev+
inductor_config_patches = {}
if is_torch_equal_or_newer("2.10.0.dev"):
inductor_config_patches["assume_32bit_indexing"] = True

with (
patch.object(
InliningInstructionTranslator, "inline_call_", patched_inline_call
@@ -496,6 +514,7 @@ def _support_torch_compile(
maybe_use_cudagraph_partition_wrapper(self.vllm_config),
torch.fx.experimental._config.patch(**fx_config_patches),
_torch27_patch_tensor_subclasses(),
torch._inductor.config.patch(**inductor_config_patches),
):
if envs.VLLM_USE_AOT_COMPILE:
self.aot_compiled_fn = self.aot_compile(*args, **kwargs)


+ 7
- 1
vllm/engine/arg_utils.py View File

@@ -1649,7 +1649,13 @@ class EngineArgs:
"attention_backend and attention_config.backend "
"are mutually exclusive"
)
attention_config.backend = self.attention_backend
# Convert string to enum if needed (CLI parsing returns a string)
if isinstance(self.attention_backend, str):
attention_config.backend = AttentionBackendEnum[
self.attention_backend.upper()
]
else:
attention_config.backend = self.attention_backend

load_config = self.create_load_config()



+ 6
- 6
vllm/entrypoints/anthropic/serving_messages.py View File

@@ -324,12 +324,12 @@ class AnthropicServingMessages(OpenAIServingChat):
id=origin_chunk.id,
content=[],
model=origin_chunk.model,
),
usage=AnthropicUsage(
input_tokens=origin_chunk.usage.prompt_tokens
if origin_chunk.usage
else 0,
output_tokens=0,
usage=AnthropicUsage(
input_tokens=origin_chunk.usage.prompt_tokens
if origin_chunk.usage
else 0,
output_tokens=0,
),
),
)
first_item = False


+ 72
- 31
vllm/entrypoints/chat_utils.py View File

@@ -9,7 +9,7 @@ from collections import Counter, defaultdict, deque
from collections.abc import Awaitable, Callable, Iterable
from functools import cached_property, lru_cache, partial
from pathlib import Path
from typing import Any, Generic, Literal, TypeAlias, TypeVar, cast
from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast

import jinja2
import jinja2.ext
@@ -49,11 +49,20 @@ from vllm.logger import init_logger
from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
from vllm.transformers_utils.processor import cached_get_processor
from vllm.utils import random_uuid
from vllm.utils.collection_utils import is_list_of
from vllm.utils.func_utils import supports_kw
from vllm.utils.import_utils import LazyLoader

if TYPE_CHECKING:
import torch

from vllm.tokenizers.mistral import MistralTokenizer
else:
torch = LazyLoader("torch", globals(), "torch")

logger = init_logger(__name__)

@@ -620,6 +629,44 @@ ModalityStr = Literal["image", "audio", "video", "image_embeds", "audio_embeds"]
_T = TypeVar("_T")


def _extract_embeds(tensors: list[torch.Tensor]):
if len(tensors) == 0:
return tensors

if len(tensors) == 1:
tensors[0]._is_single_item = True # type: ignore
return tensors[0] # To keep backwards compatibility for single item input

first_shape = tensors[0].shape
if all(t.shape == first_shape for t in tensors):
return torch.stack(tensors)

return tensors


def _get_embeds_data(items_by_modality: dict[str, list[Any]], modality: str):
embeds_key = f"{modality}_embeds"
embeds = items_by_modality[embeds_key]

if len(embeds) == 0:
return embeds
if is_list_of(embeds, torch.Tensor):
return _extract_embeds(embeds)
if is_list_of(embeds, dict):
if not embeds:
return {}

first_keys = set(embeds[0].keys())
if any(set(item.keys()) != first_keys for item in embeds[1:]):
raise ValueError(
"All dictionaries in the list of embeddings must have the same keys."
)

return {k: _extract_embeds([item[k] for item in embeds]) for k in first_keys}

return embeds


class BaseMultiModalItemTracker(ABC, Generic[_T]):
"""
Tracks multi-modal items in a given request and ensures that the number
@@ -688,11 +735,14 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
def all_mm_uuids(self) -> MultiModalUUIDDict | None:
if not self._items_by_modality:
return None
mm_uuids = {}
uuids_by_modality = dict(self._uuids_by_modality)
if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality:
raise ValueError("Mixing raw image and embedding inputs is not allowed")
if "audio" in uuids_by_modality and "audio_embeds" in uuids_by_modality:
raise ValueError("Mixing raw audio and embedding inputs is not allowed")

mm_uuids = {}
if "image_embeds" in uuids_by_modality:
mm_uuids["image"] = uuids_by_modality["image_embeds"]
if "image" in uuids_by_modality:
@@ -703,6 +753,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios
if "video" in uuids_by_modality:
mm_uuids["video"] = uuids_by_modality["video"] # UUIDs of videos

return mm_uuids

@abstractmethod
@@ -714,29 +765,25 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
def all_mm_data(self) -> MultiModalDataDict | None:
if not self._items_by_modality:
return None
mm_inputs = {}
items_by_modality = dict(self._items_by_modality)
if "image" in items_by_modality and "image_embeds" in items_by_modality:
raise ValueError("Mixing raw image and embedding inputs is not allowed")
if "audio" in items_by_modality and "audio_embeds" in items_by_modality:
raise ValueError("Mixing raw audio and embedding inputs is not allowed")

mm_inputs = {}
if "image_embeds" in items_by_modality:
image_embeds_lst = items_by_modality["image_embeds"]
mm_inputs["image"] = (
image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0]
)
mm_inputs["image"] = _get_embeds_data(items_by_modality, "image")
if "image" in items_by_modality:
mm_inputs["image"] = items_by_modality["image"] # A list of images
if "audio_embeds" in items_by_modality:
audio_embeds_lst = items_by_modality["audio_embeds"]
mm_inputs["audio"] = (
audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0]
)
mm_inputs["audio"] = _get_embeds_data(items_by_modality, "audio")
if "audio" in items_by_modality:
mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
if "video" in items_by_modality:
mm_inputs["video"] = items_by_modality["video"] # A list of videos

return mm_inputs

def create_parser(self) -> "BaseMultiModalContentParser":
@@ -747,38 +794,32 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
async def all_mm_data(self) -> MultiModalDataDict | None:
if not self._items_by_modality:
return None
mm_inputs = {}
items_by_modality = {}
for modality, items in self._items_by_modality.items():
coros = []
for item in items:
if item is not None:
coros.append(item)
else:
coros.append(asyncio.sleep(0))
items_by_modality[modality] = await asyncio.gather(*coros)

coros_by_modality = {
modality: [item or asyncio.sleep(0) for item in items]
for modality, items in self._items_by_modality.items()
}
items_by_modality: dict[str, list[object | None]] = {
modality: await asyncio.gather(*coros)
for modality, coros in coros_by_modality.items()
}
if "image" in items_by_modality and "image_embeds" in items_by_modality:
raise ValueError("Mixing raw image and embedding inputs is not allowed")
if "audio" in items_by_modality and "audio_embeds" in items_by_modality:
raise ValueError("Mixing raw audio and embedding inputs is not allowed")

mm_inputs = {}
if "image_embeds" in items_by_modality:
image_embeds_lst = items_by_modality["image_embeds"]
mm_inputs["image"] = (
image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0]
)
mm_inputs["image"] = _get_embeds_data(items_by_modality, "image")
if "image" in items_by_modality:
mm_inputs["image"] = items_by_modality["image"] # A list of images
if "audio_embeds" in items_by_modality:
audio_embeds_lst = items_by_modality["audio_embeds"]
mm_inputs["audio"] = (
audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0]
)
mm_inputs["audio"] = _get_embeds_data(items_by_modality, "audio")
if "audio" in items_by_modality:
mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
if "video" in items_by_modality:
mm_inputs["video"] = items_by_modality["video"] # A list of videos

return mm_inputs

def create_parser(self) -> "BaseMultiModalContentParser":
@@ -1792,7 +1833,7 @@ def apply_hf_chat_template(


def apply_mistral_chat_template(
tokenizer: MistralTokenizer,
tokenizer: "MistralTokenizer",
messages: list[ChatCompletionMessageParam],
chat_template: str | None,
tools: list[dict[str, Any]] | None,


+ 2
- 1
vllm/entrypoints/llm.py View File

@@ -72,7 +72,8 @@ from vllm.platforms import current_platform
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams
from vllm.tasks import PoolingTask
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.usage.usage_lib import UsageContext
from vllm.utils.collection_utils import as_iter, is_list_of
from vllm.utils.counter import Counter


+ 3
- 1
vllm/entrypoints/openai/serving_engine.py View File

@@ -117,7 +117,9 @@ from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.deepseekv32 import DeepseekV32Tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tracing import (
contains_trace_headers,
extract_trace_headers,


+ 2
- 1
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py View File

@@ -22,7 +22,8 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer

logger = init_logger(__name__)



+ 2
- 1
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py View File

@@ -21,7 +21,8 @@ from vllm.entrypoints.openai.protocol import (
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer

logger = init_logger(__name__)



+ 4
- 1
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py View File

@@ -6,6 +6,7 @@ from collections.abc import Sequence
from enum import Enum, auto
from random import choices
from string import ascii_letters, digits
from typing import Any

import ijson
import regex as re
@@ -24,7 +25,8 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer

logger = init_logger(__name__)

@@ -84,6 +86,7 @@ class MistralToolParser(ToolParser):

# initialize properties used for state when parsing tool calls in
# streaming mode
self.prev_tool_call_arr: list[dict[str, Any]] = []
self.current_tool_id: int = -1
self.streaming_state: StreamingState = StreamingState.WAITING_FOR_TOOL_START



+ 2
- 1
vllm/entrypoints/pooling/score/serving.py View File

@@ -38,7 +38,8 @@ from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.async_utils import make_async, merge_async_iterators

logger = init_logger(__name__)


+ 1
- 1
vllm/entrypoints/utils.py View File

@@ -30,7 +30,7 @@ from vllm.entrypoints.openai.protocol import (
from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser

logger = init_logger(__name__)


+ 1
- 1
vllm/model_executor/layers/batch_invariant.py View File

@@ -936,7 +936,7 @@ def enable_batch_invariant_mode():
# Batch invariant matmuls are no longer needed after cublas overrides
if not is_torch_equal_or_newer("2.10.0.dev"):
if (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
or current_platform.is_device_capability(80)
or current_platform.is_device_capability(89)
):


+ 4
- 1
vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py View File

@@ -287,7 +287,10 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
"""
DeepGemm supports packed ue8m0 activation scales format in devices == sm100
"""
return is_deep_gemm_e8m0_used() and current_platform.is_device_capability(100)
return (
is_deep_gemm_e8m0_used()
and current_platform.is_device_capability_family(100)
)

def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
# Let PrepareAndFinalize::finalize() decide the impl.


+ 6
- 2
vllm/model_executor/layers/fused_moe/layer.py View File

@@ -1200,10 +1200,14 @@ class FusedMoE(CustomOp):
if full_load:
shard_dim += 1

# Materialize GGUF UninitializedParameter
# Materialize GGUF UninitializedParameter accounting merged weights
if is_gguf_weight and isinstance(param, UninitializedParameter):
# To materialize a tensor, we must have full shape including
# number of experts, making this portion to require `full_load`.
assert full_load
final_shape = list(loaded_weight.shape)
if shard_id in ["w1", "w3"]:
# w1 and w3 are merged per expert.
if shard_id in {"w1", "w3"}:
final_shape[1] *= 2
final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
param.materialize(final_shape, dtype=loaded_weight.dtype)


+ 3
- 3
vllm/model_executor/layers/quantization/fp8.py View File

@@ -137,7 +137,7 @@ def get_fp8_moe_backend(
if (
current_platform.is_cuda()
and (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
or current_platform.is_device_capability(90)
)
and envs.VLLM_USE_FLASHINFER_MOE_FP8
@@ -148,7 +148,7 @@ def get_fp8_moe_backend(
logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100")
return Fp8MoeBackend.FLASHINFER_TRTLLM
else:
if block_quant and current_platform.is_device_capability(100):
if block_quant and current_platform.is_device_capability_family(100):
raise ValueError(
"FlashInfer FP8 MoE throughput backend does not "
"support block quantization. Please use "
@@ -193,7 +193,7 @@ def get_fp8_moe_backend(
# CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
if (
current_platform.is_cuda()
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
and block_quant
):
logger.info_once(


+ 4
- 1
vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py View File

@@ -17,7 +17,9 @@ class ScaledMMLinearLayerConfig:
class ScaledMMLinearKernel(ABC):
@classmethod
@abstractmethod
def get_min_capability(cls) -> int:
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
raise NotImplementedError

@classmethod
@@ -35,6 +37,7 @@ class ScaledMMLinearKernel(ABC):
azp_adj_param_name: str,
) -> None:
assert self.can_implement(c)
assert self.is_supported()
self.config = c
self.w_q_name = w_q_param_name
self.w_s_name = w_s_param_name


+ 12
- 28
vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py View File

@@ -27,7 +27,7 @@ from vllm.platforms import PlatformEnum, current_platform
# in priority/performance order (when available)
_POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = {
PlatformEnum.CPU: [CPUScaledMMLinearKernel],
PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
PlatformEnum.CUDA: [CutlassScaledMMLinearKernel, TritonScaledMMLinearKernel],
PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel],
PlatformEnum.TPU: [XLAScaledMMLinearKernel],
}
@@ -55,41 +55,25 @@ def choose_scaled_mm_linear_kernel(
type[ScaledMMLinearKernel]: Chosen kernel.
"""

if compute_capability is None:
_cc = current_platform.get_device_capability()
if _cc is not None:
compute_capability = _cc[0] * 10 + _cc[1]

failure_reasons = []
for kernel in _POSSIBLE_KERNELS[current_platform._enum]:
if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "").split(","):
failure_reasons.append(
f" {kernel.__name__} disabled by environment variable"
)
failure_reasons.append(f"{kernel.__name__}: disabled by env var")
continue

# If the current platform uses compute_capability,
# make sure the kernel supports the compute cability.
if compute_capability is not None:
kernel_min_capability = kernel.get_min_capability()
if (
kernel_min_capability is not None
and kernel_min_capability > compute_capability
):
failure_reasons.append(
f"{kernel.__name__} requires capability "
f"{kernel_min_capability}, current compute capability "
f"is {compute_capability}"
)
continue
is_supported, reason = kernel.is_supported(compute_capability)
if not is_supported:
failure_reasons.append(f"{kernel.__name__}: {reason}")
continue

can_implement, reason = kernel.can_implement(config)
if not can_implement:
failure_reasons.append(f"{kernel.__name__}: {reason}")
continue

can_implement, failure_reason = kernel.can_implement(config)
if can_implement:
return kernel
else:
failure_reasons.append(
f" {kernel.__name__} cannot implement due to: {failure_reason}"
)
return kernel

raise ValueError(
"Failed to find a kernel that can implement the "


+ 15
- 7
vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py View File

@@ -14,17 +14,21 @@ from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig

class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
return 90

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if not current_platform.is_rocm():
return (
False,
"AiterScaledMMLinearKernel requires `aiter` which is not "
+ "currently supported on non-ROCm platform.",
)
if compute_capability is None:
_cc = current_platform.get_device_capability()
if _cc is not None:
compute_capability = _cc.major * 10 + _cc.minor
if compute_capability is not None and compute_capability < 90:
return False, f"requires capability 90, got {compute_capability}"

try:
import aiter # noqa: F401 # deliberately attempt to import aiter
@@ -34,8 +38,8 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
"AiterScaledMMLinearKernel requires `aiter` which is not "
+ "installed on ROCm.",
)
# Check if rocm_aiter_gemm_w8a8_scaled_mm is enabled
if not (rocm_aiter_ops.is_linear_enabled()):
if not rocm_aiter_ops.is_linear_enabled():
return (
False,
"AiterScaledMMLinearKernel is disabled. "
@@ -44,6 +48,10 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
+ "`VLLM_ROCM_USE_AITER_LINEAR` default is True.",
)

return True, None

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
if not c.input_symmetric:
return (
False,


+ 6
- 5
vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py View File

@@ -19,14 +19,15 @@ from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfi

class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
return 75
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if not current_platform.is_cpu():
return False, "Requires CPU."
return True, None

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
if not current_platform.is_cpu():
return False, "CPUScaledMM requires running on CPU."

return True, None

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:


+ 12
- 5
vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py View File

@@ -16,14 +16,21 @@ from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfi

class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
return 75
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if not current_platform.is_cuda():
return False, "Requires CUDA."
if compute_capability is None:
_cc = current_platform.get_device_capability()
if _cc is not None:
compute_capability = _cc.major * 10 + _cc.minor
if compute_capability is not None and compute_capability < 75:
return False, f"requires capability 75, got {compute_capability}"
return True, None

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
if not current_platform.is_cuda():
return False, "CutlassScaledMM requires running on CUDA."

return True, None

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:


+ 46
- 17
vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py View File

@@ -4,34 +4,53 @@

import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import ( # noqa: E501
triton_scaled_mm,
)
from vllm.model_executor.layers.quantization.utils import replace_parameter
from vllm.platforms import current_platform

from .cutlass import CutlassScaledMMLinearKernel
from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig


class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
class TritonScaledMMLinearKernel(ScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
return 75
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if current_platform.is_cuda_alike():
return True, None
return False, "Requires ROCm or CUDA."

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
if current_platform.is_cpu():
return (
False,
"TritonScaledMMLinearKernel requires Triton which is not "
+ "currently supported on CPU.",
)
if not c.input_symmetric:
return (
False,
"TritonScaledMMLinearKernel only supports symmetric " + "quantization.",
)
return False, "Only symmetric input is supported."
return True, None

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
super().process_weights_after_loading(layer)
weight = getattr(layer, self.w_q_name)
replace_parameter(
layer,
self.w_q_name,
torch.nn.Parameter(weight.t().data, requires_grad=False),
)

# INPUT SCALE
if self.config.is_static_input_scheme:
input_scale = getattr(layer, self.i_s_name)
replace_parameter(
layer,
self.i_s_name,
torch.nn.Parameter(input_scale.max(), requires_grad=False),
)
setattr(layer, self.i_zp_name, None)
else:
setattr(layer, self.i_s_name, None)
setattr(layer, self.i_zp_name, None)

setattr(layer, self.azp_adj_name, None)

def apply_weights(
self,
@@ -39,4 +58,14 @@ class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
x: torch.Tensor,
bias: torch.Tensor | None = None,
) -> torch.Tensor:
return super().apply_weights(layer, x, bias)
w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)

x_q, x_s, x_zp = ops.scaled_int8_quant(
x.contiguous(), i_s, i_zp, symmetric=True
)

assert x_zp is None, "Triton kernel only supports symmetric quantization"

return triton_scaled_mm(
x_q, w_q, scale_a=x_s, scale_b=w_s, out_dtype=x.dtype, bias=bias
)

+ 6
- 5
vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py View File

@@ -17,11 +17,12 @@ from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfi

class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
raise NotImplementedError(
"TPU platform does have a concept of compute capability, "
"this method should not be called."
)
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if not current_platform.is_tpu():
return False, "Requires TPU."
return True, None

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:


+ 48
- 0
vllm/model_executor/layers/quantization/modelopt.py View File

@@ -81,6 +81,7 @@ from vllm.utils.flashinfer import (
has_flashinfer,
has_flashinfer_moe,
)
from vllm.utils.math_utils import round_up

if TYPE_CHECKING:
from vllm.model_executor.models.utils import WeightsMapper
@@ -607,6 +608,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
Only supports pre-quantized checkpoints with FP8 weights and scales.
"""

if self.flashinfer_moe_backend is not None:
self._maybe_pad_intermediate_for_flashinfer(layer)

layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)

@@ -684,6 +688,50 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
register_moe_scaling_factors(layer)

def _maybe_pad_intermediate_for_flashinfer(self, layer: torch.nn.Module) -> None:
"""Pad intermediate size so FlashInfer kernels' alignment constraints hold.

Some FlashInfer FP8 MoE kernels require the (gated) intermediate size
used for GEMM to be divisible by a small alignment value. When this is
not satisfied (e.g. with certain tensor-parallel sizes), we pad the
gate/up and down projection weights along the intermediate dim.
"""
if not hasattr(layer, "w13_weight") or not hasattr(layer, "w2_weight"):
return

# Current local intermediate size (per partition) is the K dimension of
# the down projection.
num_experts, hidden_size, intermediate = layer.w2_weight.shape

min_alignment = 16
padded_intermediate = round_up(intermediate, min_alignment)

if padded_intermediate == intermediate:
return

logger.info(
"Padding intermediate size from %d to %d for up/down projection weights.",
intermediate,
padded_intermediate,
)

up_mult = 2 if self.moe.is_act_and_mul else 1
padded_gate_up_dim = up_mult * padded_intermediate

# Pad w13 and w12 along its intermediate dimension.
w13 = layer.w13_weight.data
padded_w13 = w13.new_zeros((num_experts, padded_gate_up_dim, hidden_size))
padded_w13[:, : w13.shape[1], :] = w13
layer.w13_weight.data = padded_w13

w2 = layer.w2_weight.data
padded_w2 = w2.new_zeros((num_experts, hidden_size, padded_intermediate))
padded_w2[:, :, :intermediate] = w2
layer.w2_weight.data = padded_w2

if hasattr(layer, "intermediate_size_per_partition"):
layer.intermediate_size_per_partition = padded_intermediate

def get_fused_moe_quant_config(
self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:


+ 4
- 4
vllm/model_executor/layers/quantization/mxfp4.py View File

@@ -118,19 +118,19 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
logger.info_once("Using FlashInfer MXFP4 BF16 backend for SM90")
return Mxfp4Backend.SM90_FI_MXFP4_BF16
elif (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
and has_flashinfer()
and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
):
logger.info_once("Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100")
return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
elif (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
and has_flashinfer()
and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
):
return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
elif current_platform.is_device_capability(100) and has_flashinfer():
elif current_platform.is_device_capability_family(100) and has_flashinfer():
logger.info_once(
"Using FlashInfer MXFP4 BF16 backend for SM100, "
"For faster performance on SM100, consider setting "
@@ -139,7 +139,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
)
return Mxfp4Backend.SM100_FI_MXFP4_BF16
elif (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
or current_platform.is_device_capability(90)
) and not has_flashinfer():
logger.warning_once(


+ 1
- 1
vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py View File

@@ -50,7 +50,7 @@ def is_flashinfer_fp4_cutedsl_moe_available() -> bool:
envs.VLLM_USE_FLASHINFER_MOE_FP4
and has_flashinfer_cutedsl_grouped_gemm_nt_masked()
and current_platform.is_cuda()
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
)




+ 1
- 1
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py View File

@@ -290,7 +290,7 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
if flashinfer_moe_backend in backend_map:
if (
flashinfer_moe_backend == "latency"
and not current_platform.has_device_capability(100)
and not current_platform.is_device_capability_family(100)
):
logger.info_once(
"Flashinfer TRTLLM MOE backend is only supported on "


+ 6
- 3
vllm/model_executor/layers/quantization/utils/fp8_utils.py View File

@@ -247,7 +247,7 @@ class W8A8BlockFp8LinearOp:
self.act_quant_group_shape = act_quant_group_shape
self.is_deep_gemm_supported = is_deep_gemm_supported()
self.is_hopper = current_platform.is_device_capability(90)
self.is_blackwell = current_platform.is_device_capability(100)
self.is_blackwell = current_platform.is_device_capability_family(100)
self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()

# Get the correct blockscale mul and input quant operations.
@@ -762,9 +762,12 @@ def per_token_group_quant_fp8(
)
assert x.stride(-1) == 1, "`x` groups must be contiguous"

# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm
# platforms that use the torch.float8_e4mefnuz dtype.
finfo = torch.finfo(dtype)
fp8_min = finfo.min
fp8_max = finfo.max
fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min
fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max

assert out_q is None or out_q.shape == x.shape
x_q = out_q


+ 12
- 6
vllm/model_executor/layers/quantization/utils/mxfp4_utils.py View File

@@ -57,12 +57,18 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
mx_axis=1, num_warps=num_warps
)
)
if current_platform.is_cuda() and current_platform.is_device_capability(100):
constraints = {
"is_persistent": True,
"epilogue_subtile": 1,
}
opt_flags.update_opt_flags_constraints(constraints)
if current_platform.is_cuda():
if current_platform.is_device_capability(90):
constraints = {
"split_k": 1,
}
opt_flags.update_opt_flags_constraints(constraints)
elif current_platform.is_device_capability_family(100):
constraints = {
"is_persistent": True,
"epilogue_subtile": 1,
}
opt_flags.update_opt_flags_constraints(constraints)
# transpose the tensor so that the quantization axis is on dim1
quant_tensor = quant_tensor.transpose(-2, -1)
scale = scale.transpose(-2, -1)


+ 1
- 1
vllm/model_executor/models/config.py View File

@@ -363,7 +363,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
else:
kernel_block_alignment_size = 16
if (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
and model_config.get_head_size() == 256
and (
attention_config.backend is None


+ 2
- 1
vllm/model_executor/models/pixtral.py View File

@@ -59,7 +59,8 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.tensor_schema import TensorSchema, TensorShape

from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP


+ 7
- 0
vllm/model_executor/models/qwen3_next.py View File

@@ -1092,6 +1092,8 @@ class Qwen3NextModel(nn.Module):
name.endswith(".bias") or name.endswith("_bias")
) and name not in params_dict:
continue
if name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(
@@ -1108,6 +1110,11 @@ class Qwen3NextModel(nn.Module):
continue
if is_pp_missing_parameter(name, self):
continue
if name not in params_dict:
logger.warning_once(
f"Parameter {name} not found in params_dict, skip loading"
)
continue
param = params_dict[name]
weight_loader = getattr(
param, "weight_loader", default_weight_loader


+ 2
- 1
vllm/model_executor/models/voxtral.py View File

@@ -51,7 +51,8 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.tokenizers.mistral import MistralTokenizer

from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
from .utils import init_vllm_registered_model, maybe_prefix


+ 1
- 1
vllm/platforms/cuda.py View File

@@ -182,7 +182,7 @@ class CudaPlatformBase(Platform):

if vllm_config.attention_config.backend is None:
# Default case
if cls.is_device_capability(100) and not use_sparse:
if cls.is_device_capability_family(100) and not use_sparse:
# Blackwell => Force CutlassMLA (unless sparse, i.e. DSv3.2).
use_cutlass_mla = True
# Set the backend in AttentionConfig so it's used during


+ 15
- 0
vllm/platforms/interface.py View File

@@ -301,6 +301,21 @@ class Platform:

return current_capability.to_int() == capability

@classmethod
def is_device_capability_family(
cls,
capability: int,
device_id: int = 0,
) -> bool:
"""
Returns True if the device capability is any <major>.x.
Mirrors CUDA 13 'family' architecture semantics (e.g. 10.x, 11.x, 12.x).
"""
current_capability = cls.get_device_capability(device_id=device_id)
if current_capability is None:
return False
return (current_capability.to_int() // 10) == (capability // 10)

@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
"""Get the name of a device."""


+ 1
- 1
vllm/reasoning/mistral_reasoning_parser.py View File

@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.protocol import (
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

logger = init_logger(__name__)



+ 0
- 6
vllm/tokenizers/__init__.py View File

@@ -1,9 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from .deepseekv32 import DeepseekV32Tokenizer
from .hf import HfTokenizer
from .mistral import MistralTokenizer
from .protocol import TokenizerLike
from .registry import (
TokenizerRegistry,
@@ -15,12 +12,9 @@ from .registry import (

__all__ = [
"TokenizerLike",
"HfTokenizer",
"MistralTokenizer",
"TokenizerRegistry",
"cached_get_tokenizer",
"get_tokenizer",
"cached_tokenizer_from_config",
"init_tokenizer_from_config",
"DeepseekV32Tokenizer",
]

+ 33
- 14
vllm/tokenizers/deepseekv32.py View File

@@ -2,24 +2,18 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from pathlib import Path
from typing import Any

from transformers import BatchEncoding

from .deepseek_v32_encoding import encode_messages
from .hf import HfTokenizer, TokenizerLike
from .registry import TokenizerRegistry
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam

from .deepseek_v32_encoding import encode_messages
from .hf import CachedHfTokenizer
from .protocol import TokenizerLike

@TokenizerRegistry.register("deepseek_v32")
class DeepseekV32Tokenizer(HfTokenizer):
def __init__(self, tokenizer: TokenizerLike):
self.tokenizer = tokenizer
self.name_or_path = (
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
)
self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)

class DeepseekV32Tokenizer(CachedHfTokenizer):
@classmethod
def from_pretrained(
cls,
@@ -40,7 +34,21 @@ class DeepseekV32Tokenizer(HfTokenizer):
)
return DeepseekV32Tokenizer(tokenizer)

def apply_chat_template(self, messages, tools=None, **kwargs):
def __init__(self, tokenizer: TokenizerLike) -> None:
super().__init__()

self.tokenizer = tokenizer
self.name_or_path = getattr(tokenizer, "name_or_path", "")

self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)

def apply_chat_template(
self,
messages: list["ChatCompletionMessageParam"],
tools: list[dict[str, Any]] | None = None,
**kwargs,
) -> str | list[int]:
thinking = kwargs.get("thinking", False)
thinking_mode = "thinking"
if not thinking:
@@ -49,13 +57,24 @@ class DeepseekV32Tokenizer(HfTokenizer):
messages = conversation.copy()
if tools is not None and len(tools) > 0:
messages.insert(0, {"role": "system"})
messages[0]["tools"] = tools
messages[0]["tools"] = tools # type: ignore[typeddict-unknown-key]

# Historical reasoning content is dropped when a new user message is introduced
drop_thinking = messages[-1]["role"] == "user"

encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking)
prompt_str = encode_messages(messages, **encode_config) # type: ignore

if kwargs.get("tokenize", True):
tokenizer_kwargs = {
k: kwargs[k] for k in ("truncation", "max_length") if k in kwargs
}
return self.encode(
prompt_str,
add_special_tokens=False,
**tokenizer_kwargs,
)

return prompt_str

def num_special_tokens_to_add(self) -> int:


+ 7
- 12
vllm/tokenizers/hf.py View File

@@ -3,22 +3,18 @@
import contextlib
import copy
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TypeAlias

from transformers import AutoTokenizer
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config

from .protocol import TokenizerLike
from .registry import TokenizerRegistry

if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
HfTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast


def get_cached_tokenizer(
tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast",
) -> TokenizerLike:
def get_cached_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
"""
By default, transformers will recompute multiple tokenizer properties
each time they are called, leading to a significant slowdown.
@@ -65,11 +61,10 @@ def get_cached_tokenizer(
CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"

cached_tokenizer.__class__ = CachedTokenizer
return cached_tokenizer # type: ignore
return cached_tokenizer


@TokenizerRegistry.register("hf")
class HfTokenizer(TokenizerLike):
class CachedHfTokenizer(TokenizerLike):
@classmethod
def from_pretrained(
cls,
@@ -79,7 +74,7 @@ class HfTokenizer(TokenizerLike):
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> "TokenizerLike":
) -> HfTokenizer:
try:
tokenizer = AutoTokenizer.from_pretrained(
path_or_repo_id,


+ 2
- 5
vllm/tokenizers/mistral.py View File

@@ -3,10 +3,11 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast

from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.logger import init_logger

from .protocol import TokenizerLike
from .registry import TokenizerRegistry

if TYPE_CHECKING:
from mistral_common.protocol.instruct.request import (
@@ -15,9 +16,6 @@ if TYPE_CHECKING:
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from transformers import BatchEncoding

from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

try:
# Transformers v5
from transformers.tokenization_mistral_common import MistralCommonBackend
@@ -201,7 +199,6 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
return tokenizer.unk_id


@TokenizerRegistry.register("mistral")
class MistralTokenizer(TokenizerLike):
@classmethod
def from_pretrained(


+ 1
- 1
vllm/tokenizers/protocol.py View File

@@ -97,7 +97,7 @@ class TokenizerLike(Protocol):
messages: list["ChatCompletionMessageParam"],
tools: list[dict[str, Any]] | None = None,
**kwargs,
) -> list[int]:
) -> str | list[int]:
raise NotImplementedError

def convert_tokens_to_string(self, tokens: list[str]) -> str:


+ 100
- 100
vllm/tokenizers/registry.py View File

@@ -1,13 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
from collections.abc import Callable
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, TypeVar, overload
from typing import TYPE_CHECKING

import huggingface_hub
from typing_extensions import assert_never
from typing_extensions import TypeVar, assert_never, deprecated

import vllm.envs as envs
from vllm.logger import init_logger
@@ -24,46 +24,25 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike

if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.config.model import ModelConfig, RunnerType

logger = init_logger(__name__)

_T = TypeVar("_T", bound=type[TokenizerLike])

_VLLM_TOKENIZERS = {
"deepseekv32": ("deepseekv32", "DeepseekV32Tokenizer"),
"hf": ("hf", "CachedHfTokenizer"),
"mistral": ("mistral", "MistralTokenizer"),
}

class TokenizerRegistry:
# Tokenizer name -> tokenizer_cls or (tokenizer module, tokenizer class)
REGISTRY: dict[str, type[TokenizerLike] | tuple[str, str]] = {}

# In-tree tokenizers
@staticmethod
@overload
def register(tokenizer_mode: str) -> Callable[[_T], _T]: ...
@dataclass
class _TokenizerRegistry:
# Tokenizer mode -> (tokenizer module, tokenizer class)
tokenizers: dict[str, tuple[str, str]] = field(default_factory=dict)

# OOT tokenizers
@staticmethod
@overload
def register(tokenizer_mode: str, module: str, class_name: str) -> None: ...

@staticmethod
def register(
tokenizer_mode: str,
module: str | None = None,
class_name: str | None = None,
) -> Callable[[_T], _T] | None:
# In-tree tokenizers
if module is None or class_name is None:

def wrapper(tokenizer_cls: _T) -> _T:
assert tokenizer_mode not in TokenizerRegistry.REGISTRY
TokenizerRegistry.REGISTRY[tokenizer_mode] = tokenizer_cls

return tokenizer_cls

return wrapper

# OOT tokenizers
if tokenizer_mode in TokenizerRegistry.REGISTRY:
def register(self, tokenizer_mode: str, module: str, class_name: str) -> None:
if tokenizer_mode in self.tokenizers:
logger.warning(
"%s.%s is already registered for tokenizer_mode=%r. "
"It is overwritten by the new one.",
@@ -72,36 +51,42 @@ class TokenizerRegistry:
tokenizer_mode,
)

TokenizerRegistry.REGISTRY[tokenizer_mode] = (module, class_name)
self.tokenizers[tokenizer_mode] = (module, class_name)

return None

@staticmethod
def get_tokenizer(tokenizer_mode: str, *args, **kwargs) -> "TokenizerLike":
if tokenizer_mode not in TokenizerRegistry.REGISTRY:
def load_tokenizer_cls(self, tokenizer_mode: str) -> type[TokenizerLike]:
if tokenizer_mode not in self.tokenizers:
raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")

item = TokenizerRegistry.REGISTRY[tokenizer_mode]
if isinstance(item, type):
return item.from_pretrained(*args, **kwargs)

module, class_name = item
module, class_name = self.tokenizers[tokenizer_mode]
logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")

class_ = resolve_obj_by_qualname(f"{module}.{class_name}")
return class_.from_pretrained(*args, **kwargs)
return resolve_obj_by_qualname(f"{module}.{class_name}")

def load_tokenizer(self, tokenizer_mode: str, *args, **kwargs) -> TokenizerLike:
tokenizer_cls = self.load_tokenizer_cls(tokenizer_mode)
return tokenizer_cls.from_pretrained(*args, **kwargs)

def get_tokenizer(

TokenizerRegistry = _TokenizerRegistry(
{
mode: (f"vllm.tokenizers.{mod_relname}", cls_name)
for mode, (mod_relname, cls_name) in _VLLM_TOKENIZERS.items()
}
)


def resolve_tokenizer_args(
tokenizer_name: str | Path,
*args,
runner_type: "RunnerType" = "generate",
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> TokenizerLike:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
):
revision: str | None = kwargs.get("revision")
download_dir: str | None = kwargs.get("download_dir")

if envs.VLLM_USE_MODELSCOPE:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
@@ -125,16 +110,6 @@ def get_tokenizer(
)
tokenizer_name = tokenizer_path

if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")

tokenizer_mode = "hf"
kwargs["use_fast"] = False

if "truncation_side" not in kwargs:
kwargs["truncation_side"] = "left"

# Separate model folder from file path for GGUF models
if is_gguf(tokenizer_name):
if check_gguf_file(tokenizer_name):
@@ -150,6 +125,21 @@ def get_tokenizer(
)
kwargs["gguf_file"] = gguf_file

if "truncation_side" not in kwargs:
if runner_type == "generate" or runner_type == "draft":
kwargs["truncation_side"] = "left"
elif runner_type == "pooling":
kwargs["truncation_side"] = "right"
else:
assert_never(runner_type)

if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")

tokenizer_mode = "hf"
kwargs["use_fast"] = False

# Try to use official Mistral tokenizer if possible
if tokenizer_mode == "auto" and importlib.util.find_spec("mistral_common"):
allow_patterns = ["tekken.json", "tokenizer.model.v*"]
@@ -165,49 +155,70 @@ def get_tokenizer(
if tokenizer_mode == "auto":
tokenizer_mode = "hf"

tokenizer_args = (tokenizer_name, *args)
tokenizer_kwargs = dict(
return tokenizer_mode, tokenizer_name, args, kwargs


cached_resolve_tokenizer_args = lru_cache(resolve_tokenizer_args)


def tokenizer_args_from_config(config: "ModelConfig", **kwargs):
return cached_resolve_tokenizer_args(
config.tokenizer,
runner_type=config.runner_type,
tokenizer_mode=config.tokenizer_mode,
revision=config.tokenizer_revision,
trust_remote_code=config.trust_remote_code,
**kwargs,
)


_T = TypeVar("_T", bound=TokenizerLike, default=TokenizerLike)


def get_tokenizer(
tokenizer_name: str | Path,
*args,
tokenizer_cls: type[_T] = TokenizerLike, # type: ignore[assignment]
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> _T:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
tokenizer_mode, tokenizer_name, args, kwargs = cached_resolve_tokenizer_args(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
download_dir=download_dir,
**kwargs,
)

if tokenizer_mode == "custom":
logger.warning_once(
"TokenizerRegistry now uses `tokenizer_mode` as the registry key "
"instead of `tokenizer_name`. "
"Please update the definition of `.from_pretrained` in "
"your custom tokenizer to accept `args=%s`, `kwargs=%s`. "
"Then, you can pass `tokenizer_mode=%r` instead of "
"`tokenizer_mode='custom'` when initializing vLLM.",
tokenizer_args,
str(tokenizer_kwargs),
tokenizer_name,
)

tokenizer_mode = str(tokenizer_name)
if tokenizer_cls == TokenizerLike:
tokenizer_cls_ = TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
else:
tokenizer_cls_ = tokenizer_cls

tokenizer = TokenizerRegistry.get_tokenizer(
tokenizer_mode,
*tokenizer_args,
**tokenizer_kwargs,
)
tokenizer = tokenizer_cls_.from_pretrained(tokenizer_name, *args, **kwargs)
if not tokenizer.is_fast:
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)

return tokenizer
return tokenizer # type: ignore


cached_get_tokenizer = lru_cache(get_tokenizer)


def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
if model_config.skip_tokenizer_init:
return None

return cached_get_tokenizer(
model_config.tokenizer,
runner_type=model_config.runner_type,
tokenizer_mode=model_config.tokenizer_mode,
revision=model_config.tokenizer_revision,
trust_remote_code=model_config.trust_remote_code,
@@ -215,19 +226,8 @@ def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
)


@deprecated(
"Renamed to `cached_tokenizer_from_config`. The old name will be removed in v0.14."
)
def init_tokenizer_from_config(model_config: "ModelConfig"):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
elif runner_type == "pooling":
truncation_side = "right"
else:
assert_never(runner_type)

return get_tokenizer(
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision,
truncation_side=truncation_side,
)
return cached_tokenizer_from_config(model_config)

+ 3
- 3
vllm/transformers_utils/tokenizer.py View File

@@ -60,17 +60,17 @@ def __getattr__(name: str):

return cached_tokenizer_from_config
if name == "init_tokenizer_from_configs":
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config

warnings.warn(
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
"has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)

return init_tokenizer_from_config
return cached_tokenizer_from_config

raise AttributeError(f"module {__name__!r} has no attribute {name!r}")



+ 2
- 2
vllm/utils/deep_gemm.py View File

@@ -38,7 +38,7 @@ class DeepGemmQuantScaleFMT(Enum):
return DeepGemmQuantScaleFMT.FLOAT32
return (
DeepGemmQuantScaleFMT.UE8M0
if current_platform.is_device_capability(100)
if current_platform.is_device_capability_family(100)
else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0
)

@@ -50,7 +50,7 @@ def is_deep_gemm_supported() -> bool:
"""
is_supported_arch = current_platform.is_cuda() and (
current_platform.is_device_capability(90)
or current_platform.is_device_capability(100)
or current_platform.is_device_capability_family(100)
)
return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch



+ 3
- 1
vllm/utils/flashinfer.py View File

@@ -264,7 +264,9 @@ def supports_trtllm_attention() -> bool:
return False

# Requires SM100 and NVIDIA artifactory to be accessible to download cubins
return current_platform.is_device_capability(100) and has_nvidia_artifactory()
return (
current_platform.is_device_capability_family(100) and has_nvidia_artifactory()
)


def force_use_trtllm_attention() -> bool | None:


+ 23
- 2
vllm/utils/torch_utils.py View File

@@ -194,12 +194,33 @@ def get_kv_cache_torch_dtype(
return torch_dtype


def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None:
quant_method = quant_cfg.get("quant_method", "")
if quant_method.startswith("modelopt"):
quantization_inner = quant_cfg.get("quantization", quant_cfg)
# Check if quant config is specified and use kv cache quant algo
kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get(
"kv_cache_quant_algo"
)
if isinstance(kv_algo, str):
return STR_DTYPE_TO_TORCH_DTYPE[kv_algo.lower()]
return None


def kv_cache_dtype_str_to_dtype(
kv_cache_dtype: str, model_config: ModelConfig
) -> torch.dtype:
# Model config may not be specified for unit tests, default to float16
dtype = model_config.dtype if model_config else torch.half
if kv_cache_dtype == "auto":
# Model config may not be specified for unit tests, default to float16
return model_config.dtype if model_config else torch.half
hf_cfg = getattr(model_config, "hf_config", None)
if hf_cfg is not None:
quant_cfg = getattr(hf_cfg, "quantization_config", None)
if quant_cfg is not None:
kv_algo_dtype = get_kv_cache_quant_algo_dtype(quant_cfg)
return kv_algo_dtype if kv_algo_dtype is not None else dtype
return dtype

return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]




+ 1
- 1
vllm/v1/attention/backends/flashinfer.py View File

@@ -564,7 +564,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
)
self.paged_kv_last_page_len_np = self.paged_kv_last_page_len_cpu.numpy()

if self.head_dim == 256 and current_platform.is_device_capability(100):
if self.head_dim == 256 and current_platform.is_device_capability_family(100):
# https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
# head size 256 and block size 16 is not supported on blackwell.
assert kv_cache_spec.block_size != 16, (


+ 3
- 3
vllm/v1/attention/backends/mla/common.py View File

@@ -446,7 +446,7 @@ def use_flashinfer_prefill() -> bool:
and flashinfer_available
and not vllm_config.attention_config.use_cudnn_prefill
and not vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
)


@@ -457,7 +457,7 @@ def use_cudnn_prefill() -> bool:
return (
flashinfer_available
and vllm_config.attention_config.use_cudnn_prefill
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
and has_nvidia_artifactory()
)

@@ -470,7 +470,7 @@ def use_trtllm_ragged_deepseek_prefill() -> bool:
return (
flashinfer_available
and vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
)




+ 2
- 2
vllm/v1/attention/backends/mla/flashmla_sparse.py View File

@@ -420,7 +420,7 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad
max_num_sm_parts = int(
max((sm_count // 2) / h_k // (cdiv(h_q // h_k, 2 * 64) * s_q), 1)
)
if current_platform.is_device_capability(100):
if current_platform.is_device_capability_family(100):
max_num_sm_parts *= 2
self.tile_scheduler_metadata_buffer = torch.empty(
# TileSchedulerMetaDataSize = 8
@@ -719,7 +719,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
self.softmax_scale = scale
assert indexer is not None
self.topk_indices_buffer = indexer.topk_indices_buffer
self.padding = 128 if current_platform.is_device_capability(100) else 64
self.padding = 128 if current_platform.is_device_capability_family(100) else 64

if kv_cache_dtype == "fp8_ds_mla":
# Reserve workspace during initialization


+ 83
- 1
vllm/v1/attention/backends/triton_attn.py View File

@@ -17,7 +17,7 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)
from vllm.attention.ops.triton_unified_attention import unified_attention
from vllm.config import VllmConfig
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.config.cache import CacheDType
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -26,6 +26,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
from vllm.platforms import current_platform
from vllm.platforms.interface import DeviceCapability
from vllm.utils.math_utils import next_power_of_2
from vllm.v1.attention.backends.utils import (
AttentionCGSupport,
AttentionMetadataBuilder,
@@ -36,6 +37,11 @@ from vllm.v1.kv_cache_interface import AttentionSpec
logger = init_logger(__name__)


# constants
MIN_LAUNCH_GRID_SIZE_2D = 128 # Minimum launch grid size of 2D kernel
NUM_PAR_SOFTMAX_SEGMENTS = 16 # Number of parallel tiled softmax segments


@dataclass
class TritonAttentionMetadata:
# NOTE(sang): Definition of context_len, query_len, and seq_len.
@@ -54,6 +60,12 @@ class TritonAttentionMetadata:
block_table: torch.Tensor
slot_mapping: torch.Tensor

seq_threshold_3D: int
num_par_softmax_segments: int
softmax_segm_output: torch.Tensor
softmax_segm_max: torch.Tensor
softmax_segm_expsum: torch.Tensor

# For cascade attention.
use_cascade: bool
common_prefix_len: int
@@ -87,6 +99,60 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet
self.num_heads_kv = model_config.get_num_kv_heads(vllm_config.parallel_config)
self.headdim = model_config.get_head_size()

# Check if CUDA Graphs are enabled for decode
self.decode_cudagraph_enabled = (
self.vllm_config.compilation_config.cudagraph_mode
in (
CUDAGraphMode.FULL_AND_PIECEWISE,
CUDAGraphMode.FULL_DECODE_ONLY,
CUDAGraphMode.FULL,
)
)

# The launch grid for the 2D kernel is defined as (num_q_blocks, num_heads_kv).
# A lower bound for num_q_blocks is the number of sequences.
# To ensure the minimum launch grid size is achieved, the number of sequences
# must be at least equal to the threshold below.
# If this threshold is not reached (i.e., the batch size is not large enough),
# the 3D kernel will be selected instead.
self.seq_threshold_3D = MIN_LAUNCH_GRID_SIZE_2D // self.num_heads_kv

# Modify the threshold if needed.
if self.decode_cudagraph_enabled:
capture_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes
assert capture_sizes, "CUDA Graphs enabled but no capture sizes specified."

# Select the CUDA Graph capture size closest to self.seq_threshold_3D
# as threshold. This ensures that each captured graph covers the
# correct execution path.
self.seq_threshold_3D = min(
capture_sizes,
key=lambda x: abs(x - self.seq_threshold_3D),
)

self.num_par_softmax_segments = NUM_PAR_SOFTMAX_SEGMENTS
headdim_padded = next_power_of_2(self.headdim)
self.softmax_segm_output = torch.empty(
(
self.seq_threshold_3D,
self.num_heads_q,
self.num_par_softmax_segments,
headdim_padded,
),
dtype=torch.float32,
device=device,
)
self.softmax_segm_max = torch.empty(
(self.seq_threshold_3D, self.num_heads_q, self.num_par_softmax_segments),
dtype=torch.float32,
device=device,
)
self.softmax_segm_expsum = torch.empty(
(self.seq_threshold_3D, self.num_heads_q, self.num_par_softmax_segments),
dtype=torch.float32,
device=device,
)

def build_for_cudagraph_capture(
self, common_attn_metadata: CommonAttentionMetadata
) -> TritonAttentionMetadata:
@@ -143,6 +209,11 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet
prefix_kv_lens=prefix_kv_lens,
suffix_kv_lens=suffix_kv_lens,
prefix_scheduler_metadata=prefix_scheduler_metadata,
seq_threshold_3D=self.seq_threshold_3D,
num_par_softmax_segments=self.num_par_softmax_segments,
softmax_segm_output=self.softmax_segm_output,
softmax_segm_max=self.softmax_segm_max,
softmax_segm_expsum=self.softmax_segm_expsum,
)
return attn_metadata

@@ -349,6 +420,12 @@ class TritonAttentionImpl(AttentionImpl):
max_seqlen_k = attn_metadata.max_seq_len
block_table = attn_metadata.block_table

seq_threshold_3D = attn_metadata.seq_threshold_3D
num_par_softmax_segments = attn_metadata.num_par_softmax_segments
softmax_segm_output = attn_metadata.softmax_segm_output
softmax_segm_max = attn_metadata.softmax_segm_max
softmax_segm_expsum = attn_metadata.softmax_segm_expsum

descale_shape = (cu_seqlens_q.shape[0] - 1, key_cache.shape[2])

unified_attention(
@@ -369,6 +446,11 @@ class TritonAttentionImpl(AttentionImpl):
q_descale=None, # Not supported
k_descale=layer._k_scale.expand(descale_shape),
v_descale=layer._v_scale.expand(descale_shape),
seq_threshold_3D=seq_threshold_3D,
num_par_softmax_segments=num_par_softmax_segments,
softmax_segm_output=softmax_segm_output,
softmax_segm_max=softmax_segm_max,
softmax_segm_expsum=softmax_segm_expsum,
sinks=self.sinks,
output_scale=output_scale,
)


+ 5
- 6
vllm/v1/core/sched/scheduler.py View File

@@ -1117,6 +1117,7 @@ class Scheduler(SchedulerInterface):
stopped = False
new_logprobs = None
new_token_ids = generated_token_ids
pooler_output = pooler_outputs[req_index] if pooler_outputs else None
kv_transfer_params = None
status_before_stop = request.status

@@ -1125,12 +1126,10 @@ class Scheduler(SchedulerInterface):
new_token_ids, stopped = self._update_request_with_output(
request, new_token_ids
)

# Stop checking for pooler models.
pooler_output = None
if pooler_outputs:
pooler_output = pooler_outputs[req_index]
stopped = check_stop(request, self.max_model_len, pooler_output)
elif request.pooling_params and pooler_output is not None:
# Pooling stops as soon as there is output.
request.status = RequestStatus.FINISHED_STOPPED
stopped = True

if stopped:
kv_transfer_params = self._free_request(request)


+ 2
- 10
vllm/v1/core/sched/utils.py View File

@@ -2,8 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib

import torch

from vllm.v1.request import Request, RequestStatus


@@ -39,14 +37,8 @@ def remove_all(lst: list, items_to_remove: set) -> list:
return [item for item in lst if item not in items_to_remove]


def check_stop(
request: Request, max_model_len: int, pooler_output: torch.Tensor | None = None
) -> bool:
if request.pooling_params:
if pooler_output is not None:
request.status = RequestStatus.FINISHED_STOPPED
return True
return False
def check_stop(request: Request, max_model_len: int) -> bool:
assert not request.pooling_params

sampling_params = request.sampling_params
assert sampling_params is not None


+ 2
- 2
vllm/v1/engine/async_llm.py View File

@@ -26,7 +26,7 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tracing import init_tracer
from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
from vllm.usage.usage_lib import UsageContext
@@ -111,7 +111,7 @@ class AsyncLLM(EngineClient):
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.model_config)
tokenizer = cached_tokenizer_from_config(self.model_config)

self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(


+ 22
- 11
vllm/v1/engine/input_processor.py View File

@@ -19,7 +19,8 @@ from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.metrics.stats import MultiModalCacheStats
@@ -188,29 +189,39 @@ class InputProcessor:
def _validate_single_prompt(single_prompt: dict | str) -> None:
if not isinstance(single_prompt, dict):
return

mm_data = single_prompt.get("multi_modal_data")
mm_uuids = single_prompt.get("multi_modal_uuids")
if not mm_data or not mm_uuids:
return

import torch

def _get_len(items: object):
if isinstance(items, dict): # Embedding inputs
return _get_len(next(iter(items.values()))) if items else 1

if isinstance(items, list):
return len(items)
if isinstance(items, torch.Tensor):
# To keep backwards compatibility for single item embedding input
return 1 if getattr(items, "_is_single_item", False) else len(items)

return 1

for modality, items in mm_data.items():
if modality in mm_uuids:
data_len = len(items) if isinstance(items, list) else 1
uuid_len = (
len(mm_uuids[modality])
if isinstance(mm_uuids[modality], list)
else 1
)
data_len = _get_len(items)
uuid_len = _get_len(mm_uuids[modality])
if uuid_len != data_len:
raise ValueError(
f"multi_modal_uuids for modality '{modality}' "
f"multi_modal_uuids for modality {modality!r} "
"must have same length as data: got "
f"{uuid_len} uuids vs "
f"{data_len} items."
f"{uuid_len} uuids vs {data_len} items."
)
else:
raise ValueError(
f"multi_modal_uuids for modality '{modality}' must "
f"multi_modal_uuids for modality {modality!r} must "
"be provided if multi_modal_data is provided."
)



+ 2
- 2
vllm/v1/engine/llm_engine.py View File

@@ -23,7 +23,7 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tracing import init_tracer
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest
@@ -86,7 +86,7 @@ class LLMEngine:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.model_config)
tokenizer = cached_tokenizer_from_config(self.model_config)

self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(


+ 1
- 1
vllm/v1/executor/abstract.py View File

@@ -219,7 +219,7 @@ class Executor(ABC):

def sample_tokens(
self, grammar_output: GrammarOutput | None, non_block: bool = False
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
output = self.collective_rpc( # type: ignore[call-overload]
"sample_tokens", args=(grammar_output,), non_block=non_block
)


+ 7
- 3
vllm/v1/executor/multiproc_executor.py View File

@@ -294,8 +294,8 @@ class MultiprocExecutor(Executor):
kwargs: dict | None = None,
non_block: bool = False,
unique_reply_rank: int | None = None,
kv_output_aggregator: KVOutputAggregator = None,
) -> Any | list[Any] | Future[Any | list[Any]]:
kv_output_aggregator: KVOutputAggregator | None = None,
) -> Any:
"""Returns single result if unique_reply_rank and/or kv_output_aggregator
is provided, otherwise list."""
assert self.rpc_broadcast_mq is not None, (
@@ -476,6 +476,8 @@ class WorkerProc:
"""Wrapper that runs one Worker in a separate process."""

READY_STR = "READY"
rpc_broadcast_mq: MessageQueue | None
worker_response_mq: MessageQueue | None

def _init_message_queues(
self, input_shm_handle: Handle, vllm_config: VllmConfig
@@ -487,7 +489,7 @@ class WorkerProc:
)

# Initializes a message queue for sending the model output
self.worker_response_mq: MessageQueue = MessageQueue(1, 1)
self.worker_response_mq = MessageQueue(1, 1)
self.peer_response_handles = []
else:
# Initialize remote MessageQueue for receiving SchedulerOutput across nodes
@@ -720,6 +722,7 @@ class WorkerProc:
try:
reader.close()
worker = WorkerProc(*args, **kwargs)
assert worker.worker_response_mq is not None

# Send READY once we know everything is loaded
ready_writer.send(
@@ -804,6 +807,7 @@ class WorkerProc:

def worker_busy_loop(self, cancel: threading.Event | None = None):
"""Main busy loop for Multiprocessing Workers"""
assert self.rpc_broadcast_mq is not None
while True:
method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
cancel=cancel, indefinite=True


+ 3
- 3
vllm/v1/executor/ray_executor.py View File

@@ -413,7 +413,7 @@ class RayDistributedExecutor(Executor):
self,
grammar_output: "GrammarOutput | None",
non_block: bool = False,
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
"""Execute the model on the Ray workers.

The scheduler output to use should have been provided in
@@ -428,7 +428,7 @@ class RayDistributedExecutor(Executor):
"""
scheduler_output = self.scheduler_output
if scheduler_output is None:
return COMPLETED_NONE_FUTURE if non_block else None # noqa
return COMPLETED_NONE_FUTURE if non_block else None

self.scheduler_output = None

@@ -439,7 +439,7 @@ class RayDistributedExecutor(Executor):
scheduler_output: SchedulerOutput,
grammar_output: "GrammarOutput | None",
non_block: bool = False,
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
# Build the compiled DAG for the first time.
if self.forward_dag is None: # type: ignore
self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)


+ 8
- 5
vllm/v1/executor/uniproc_executor.py View File

@@ -67,7 +67,7 @@ class UniProcExecutor(Executor):
kwargs: dict | None = None,
non_block: bool = False,
single_value: bool = False,
) -> Any | list[Any] | Future[Any | list[Any]]:
) -> Any:
if kwargs is None:
kwargs = {}

@@ -79,10 +79,13 @@ class UniProcExecutor(Executor):
result = run_method(self.driver_worker, method, args, kwargs)
if isinstance(result, AsyncModelRunnerOutput):
if (async_thread := self.async_output_thread) is not None:
get_output = result.get_output
if not single_value:
get_output = lambda go=result.get_output: [go()]
return async_thread.submit(get_output)
if single_value:
return async_thread.submit(result.get_output)

def get_output_list() -> list[Any]:
return [result.get_output()]

return async_thread.submit(get_output_list)
result = result.get_output()
future = Future[Any]()
future.set_result(result if single_value else [result])


+ 1
- 1
vllm/v1/sample/rejection_sampler.py View File

@@ -145,7 +145,7 @@ class RejectionSampler(nn.Module):
)

logprobs_tensors = None
if sampling_metadata.max_num_logprobs:
if sampling_metadata.max_num_logprobs is not None:
logprobs_tensors = self._get_logprobs_tensors(
sampling_metadata.max_num_logprobs,
metadata,


+ 2
- 2
vllm/v1/structured_output/__init__.py View File

@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
from vllm.v1.structured_output.backend_types import (
@@ -71,7 +71,7 @@ class StructuredOutputManager:
# of CPUs.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.tokenizer = init_tokenizer_from_config(
self.tokenizer = cached_tokenizer_from_config(
model_config=self.vllm_config.model_config
)
reasoning_parser = (


+ 2
- 1
vllm/v1/structured_output/backend_xgrammar.py View File

@@ -10,7 +10,8 @@ import torch
import vllm.envs
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer
from vllm.tokenizers.deepseekv32 import DeepseekV32Tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_types import (
StructuredOutputBackend,


Loading…
Cancel
Save
Baidu
map