28 Commits

Author SHA1 Message Date
  Robert Shaw 968934e77d
Merge branch 'main' into wentao-small-refactor 2 days ago
  Laith Sakka 763963aa73
set assume_32bit_indexing and pass unbacked hints (#30459) 2 days ago
  Cyrus Leung 39cefbdf17
[Refactor] `TokenizerRegistry` only uses lazy imports (#30609) 2 days ago
  Chen Zhang ace34e3783
[Bugfix] Qwen3-next with --hf-overrides {"num_hidden_layers":8} (#30433) 2 days ago
  Isotr0py e5db3e2774
[CI/Build] Fix broken mm processor test Mistral-3-large (#30597) 2 days ago
  Cyrus Leung 64251f48df
[Chore] Adjust tokenizer import to avoid circular imports (#30601) 2 days ago
  Nick Hill 1cec5b7ea9
[Scheduer] Simplify stop checking for pooling models (#30591) 2 days ago
  Cyrus Leung b09806e28f
[Bugfix] Dictionary MM embeddings for online chat (#30507) 3 days ago
  Tsukasa OI fdc135d768
[Misc][Quantization] Clarify the intent of GGUF `FusedMoE` weight materialization (#30310) 3 days ago
  Roberto L. Castro 4fa7ce46f3
[Feature] Add SM103 (Blackwell Ultra) Support to vLLM (#30484) 3 days ago
  Nicolò Lucchesi 57e9bf1864
[CI] Whisper logprobs tests (#30504) 3 days ago
  Michael Goin 2f32a68d75
[CI] Update several models in registry that are available online now (#30514) 3 days ago
  Matthew Bonanni f5dfbbd8e9
[Docs] Remove references to `VLLM_ATTENTION_BACKEND` (#30564) 3 days ago
  Michael Goin fc0119425c
Add IBM and Red Hat to compute resources sponsors (#30581) 3 days ago
  Matthew Bonanni 86a3261525
[Bugfix] Pass FA version in `MultiHeadAttention` (#30575) 3 days ago
  rasmith 08f8a5627e
[CI/Build][Kernel][BugFix][AMD] Fix per_token_group_quant_fp8 to use correct fp8 min/max values and update atol/rtol in test_quantfp8_group_functionality (#30292) 3 days ago
  Kevin H. Luu b4039c08b5
[ci] Mark PrimeRL integration test as soft fail (#30578) 3 days ago
  Wentao Ye 1e6b115300
[Refactor] Reduce duplicate code in `per_token_group_quant` cuda kernels (#30496) 3 days ago
  danielafrimi 13618626df
[MoE-FP8-modelopt] Add FlashInfer alignment padding for intermediate dimensions (#29748) 3 days ago
  danielafrimi 6ec0d8dbe4
[Fix]Load kv-cache dtype from hf_quant_config.json automatically (#29980) 3 days ago
  Li, Jiang 9693dd0fe3
[CI/Build] Add x86 CPU wheel release pipeline (#28848) 3 days ago
  Xin Yang 1f19d8f899
[Perf] Set split_k to 1 for triton_kernels (#30528) 3 days ago
  shivampr cd7740ac5c
[ROCm] Enable Triton ScaledMM fallback + kernel selection fix (#26668) 3 days ago
  Wentao Ye 02a5880394
[CI] Fix mypy for vllm/v1/executor (#30517) 3 days ago
  realliujiaxu d2c919dcc2
[bugfix] fix bug when top_logprobs=0 with spec decoding (#30059) 3 days ago
  Benjamin Bartels f3237f3f6b
[Frontend] Fixes anthropic streaming message_start usage nesting (#30266) 3 days ago
  jvlunteren 9c0ee995a8
[Kernel] Support CUDA Graphs in 3D Triton Attention Kernel (#28306) 3 days ago
  Michael Goin 09ad3b76b3
[Bug] Fix attention_backend arg string parsing (#30534) 3 days ago
98 changed files with 1272 additions and 674 deletions
Split View
  1. +14
    -0
      .buildkite/release-pipeline.yaml
  2. +5
    -2
      .buildkite/test-pipeline.yaml
  3. +2
    -0
      README.md
  4. +1
    -1
      benchmarks/backend_request_func.py
  5. +83
    -98
      csrc/quantization/w8a8/fp8/per_token_group_quant.cu
  6. +2
    -0
      docs/community/sponsors.md
  7. +16
    -6
      docs/getting_started/quickstart.md
  8. +1
    -1
      tests/compile/distributed/test_fusions_e2e.py
  9. +7
    -1
      tests/conftest.py
  10. +6
    -3
      tests/entrypoints/openai/test_messages.py
  11. +1
    -1
      tests/entrypoints/openai/test_serving_engine.py
  12. +107
    -6
      tests/entrypoints/test_chat_utils.py
  13. +2
    -2
      tests/kernels/attention/test_cutlass_mla_decode.py
  14. +2
    -2
      tests/kernels/attention/test_flashinfer_trtllm_attention.py
  15. +27
    -0
      tests/kernels/attention/test_triton_unified_attention.py
  16. +2
    -2
      tests/kernels/moe/test_ocp_mx_moe.py
  17. +2
    -2
      tests/kernels/quantization/test_fp8_quant_group.py
  18. +91
    -0
      tests/kernels/quantization/test_scaled_mm_kernel_selection.py
  19. +1
    -1
      tests/models/language/generation/test_mistral.py
  20. +1
    -1
      tests/models/multimodal/generation/test_voxtral.py
  21. +123
    -111
      tests/models/multimodal/generation/test_whisper.py
  22. +2
    -5
      tests/models/multimodal/processing/test_common.py
  23. +8
    -0
      tests/models/multimodal/processing/test_tensor_schema.py
  24. +8
    -7
      tests/models/registry.py
  25. +2
    -2
      tests/quantization/test_blackwell_moe.py
  26. +1
    -1
      tests/reasoning/test_mistral_reasoning_parser.py
  27. +1
    -1
      tests/reasoning/utils.py
  28. +2
    -2
      tests/test_inputs.py
  29. +24
    -23
      tests/tokenizers_/test_basic.py
  30. +1
    -1
      tests/tokenizers_/test_detokenize.py
  31. +21
    -2
      tests/tokenizers_/test_registry.py
  32. +2
    -5
      tests/tool_use/test_mistral_tool_parser.py
  33. +3
    -1
      tests/v1/sample/test_logprobs.py
  34. +1
    -1
      tests/v1/sample/test_rejection_sampler.py
  35. +1
    -1
      tools/pre_commit/mypy.py
  36. +10
    -0
      vllm/attention/layer.py
  37. +30
    -39
      vllm/attention/ops/triton_unified_attention.py
  38. +22
    -3
      vllm/compilation/decorators.py
  39. +7
    -1
      vllm/engine/arg_utils.py
  40. +6
    -6
      vllm/entrypoints/anthropic/serving_messages.py
  41. +72
    -31
      vllm/entrypoints/chat_utils.py
  42. +2
    -1
      vllm/entrypoints/llm.py
  43. +3
    -1
      vllm/entrypoints/openai/serving_engine.py
  44. +2
    -1
      vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
  45. +2
    -1
      vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
  46. +4
    -1
      vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
  47. +2
    -1
      vllm/entrypoints/pooling/score/serving.py
  48. +1
    -1
      vllm/entrypoints/utils.py
  49. +1
    -1
      vllm/model_executor/layers/batch_invariant.py
  50. +4
    -1
      vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
  51. +6
    -2
      vllm/model_executor/layers/fused_moe/layer.py
  52. +3
    -3
      vllm/model_executor/layers/quantization/fp8.py
  53. +4
    -1
      vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
  54. +12
    -28
      vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
  55. +15
    -7
      vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
  56. +6
    -5
      vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
  57. +12
    -5
      vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
  58. +46
    -17
      vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
  59. +6
    -5
      vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
  60. +48
    -0
      vllm/model_executor/layers/quantization/modelopt.py
  61. +4
    -4
      vllm/model_executor/layers/quantization/mxfp4.py
  62. +1
    -1
      vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
  63. +1
    -1
      vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  64. +6
    -3
      vllm/model_executor/layers/quantization/utils/fp8_utils.py
  65. +12
    -6
      vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
  66. +1
    -1
      vllm/model_executor/models/config.py
  67. +2
    -1
      vllm/model_executor/models/pixtral.py
  68. +7
    -0
      vllm/model_executor/models/qwen3_next.py
  69. +2
    -1
      vllm/model_executor/models/voxtral.py
  70. +1
    -1
      vllm/platforms/cuda.py
  71. +15
    -0
      vllm/platforms/interface.py
  72. +1
    -1
      vllm/reasoning/mistral_reasoning_parser.py
  73. +0
    -6
      vllm/tokenizers/__init__.py
  74. +33
    -14
      vllm/tokenizers/deepseekv32.py
  75. +7
    -12
      vllm/tokenizers/hf.py
  76. +2
    -5
      vllm/tokenizers/mistral.py
  77. +1
    -1
      vllm/tokenizers/protocol.py
  78. +100
    -100
      vllm/tokenizers/registry.py
  79. +3
    -3
      vllm/transformers_utils/tokenizer.py
  80. +2
    -2
      vllm/utils/deep_gemm.py
  81. +3
    -1
      vllm/utils/flashinfer.py
  82. +23
    -2
      vllm/utils/torch_utils.py
  83. +1
    -1
      vllm/v1/attention/backends/flashinfer.py
  84. +3
    -3
      vllm/v1/attention/backends/mla/common.py
  85. +2
    -2
      vllm/v1/attention/backends/mla/flashmla_sparse.py
  86. +83
    -1
      vllm/v1/attention/backends/triton_attn.py
  87. +5
    -6
      vllm/v1/core/sched/scheduler.py
  88. +2
    -10
      vllm/v1/core/sched/utils.py
  89. +2
    -2
      vllm/v1/engine/async_llm.py
  90. +22
    -11
      vllm/v1/engine/input_processor.py
  91. +2
    -2
      vllm/v1/engine/llm_engine.py
  92. +1
    -1
      vllm/v1/executor/abstract.py
  93. +7
    -3
      vllm/v1/executor/multiproc_executor.py
  94. +3
    -3
      vllm/v1/executor/ray_executor.py
  95. +8
    -5
      vllm/v1/executor/uniproc_executor.py
  96. +1
    -1
      vllm/v1/sample/rejection_sampler.py
  97. +2
    -2
      vllm/v1/structured_output/__init__.py
  98. +2
    -1
      vllm/v1/structured_output/backend_xgrammar.py

+ 14
- 0
.buildkite/release-pipeline.yaml View File

@@ -71,6 +71,20 @@ steps:
env:
DOCKER_BUILDKIT: "1"

# x86 CPU wheel build
- label: "Build x86 CPU wheel"
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"

# Build release images (12.9)
- label: "Build release image (x86)"
depends_on: ~


+ 5
- 2
.buildkite/test-pipeline.yaml View File

@@ -692,6 +692,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
- tests/models/registry.py
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -704,6 +705,7 @@ steps:
- vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py
- tests/models/registry.py
commands:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
@@ -836,7 +838,7 @@ steps:
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Processor Test
@@ -1346,6 +1348,7 @@ steps:
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
soft_fail: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
@@ -1379,4 +1382,4 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

+ 2
- 0
README.md View File

@@ -143,11 +143,13 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
- IBM
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
- Red Hat
- Replicate
- Roblox
- RunPod


+ 1
- 1
benchmarks/backend_request_func.py View File

@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"


+ 83
- 98
csrc/quantization/w8a8/fp8/per_token_group_quant.cu View File

@@ -22,6 +22,62 @@ __device__ __forceinline__ float GroupReduceMax(float val) {
return val;
}

template <typename T, bool SCALE_UE8M0>
__device__ __forceinline__ float ComputeGroupScale(
const T* __restrict__ group_input, T* __restrict__ smem_group,
const int group_size, const int lane_id, const int threads_per_group,
const float eps, const float max_8bit) {
float local_absmax = eps;

constexpr int vec_size = 16 / sizeof(T);

// copy global -> shared & compute absmax
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
float abs_v = fabsf(static_cast<float>(src));
local_absmax = fmaxf(local_absmax, abs_v);
dst = src;
};

vllm::vectorize_with_alignment<vec_size>(
group_input, // in
smem_group, // out (shared)
group_size, // elements per group
lane_id, // thread id
threads_per_group, // stride in group
scalar_op_cache); // scalar handler

local_absmax = GroupReduceMax(local_absmax);

float y_s = local_absmax / max_8bit;
if constexpr (SCALE_UE8M0) {
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
}

return y_s;
}

template <typename T, typename DST_DTYPE>
__device__ __forceinline__ void QuantizeGroup(
const T* __restrict__ smem_group, DST_DTYPE* __restrict__ group_output,
const int group_size, const int lane_id, const int threads_per_group,
const float y_s, const float min_8bit, const float max_8bit) {
constexpr int vec_size = 16 / sizeof(T);

// quantize shared -> global 8-bit
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
dst = DST_DTYPE(q);
};

vllm::vectorize_with_alignment<vec_size>(
smem_group, // in (shared)
group_output, // out (global quant tensor)
group_size, // elements
lane_id, // tid
threads_per_group, // stride
scalar_op_quant); // scalar handler
}

template <typename T, typename DST_DTYPE, bool IS_COLUMN_MAJOR = false,
bool SCALE_UE8M0 = false, typename scale_packed_t = float>
__global__ void per_token_group_quant_8bit_kernel(
@@ -38,8 +94,6 @@ __global__ void per_token_group_quant_8bit_kernel(
const int64_t global_group_id = block_group_id + local_group_id;
const int64_t block_group_offset = global_group_id * group_size;

float local_absmax = eps;

using scale_element_t = float;
static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0);

@@ -68,30 +122,9 @@ __global__ void per_token_group_quant_8bit_kernel(
T* smem = reinterpret_cast<T*>(smem_raw);
T* smem_group = smem + local_group_id * group_size;

constexpr int vec_size = 16 / sizeof(T);
using vec_t = vllm::vec_n_t<T, vec_size>;

// copy global -> shared & compute absmax
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
float abs_v = fabsf(static_cast<float>(src));
local_absmax = fmaxf(local_absmax, abs_v);
dst = src;
};

vllm::vectorize_with_alignment<vec_size>(
group_input, // in
smem_group, // out (shared)
group_size, // elements per group
lane_id, // thread id
threads_per_group, // stride in group
scalar_op_cache); // scalar handler

local_absmax = GroupReduceMax(local_absmax);

float y_s = local_absmax / max_8bit;
if constexpr (SCALE_UE8M0) {
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
}
const float y_s = ComputeGroupScale<T, SCALE_UE8M0>(
group_input, smem_group, group_size, lane_id, threads_per_group, eps,
max_8bit);

scale_element_t y_s_quant = y_s;

@@ -101,19 +134,24 @@ __global__ void per_token_group_quant_8bit_kernel(

__syncthreads();

// quantize shared -> global 8-bit
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
dst = DST_DTYPE(q);
};
QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
threads_per_group, y_s, min_8bit, max_8bit);
}

vllm::vectorize_with_alignment<vec_size>(
smem_group, // in (shared)
group_output, // out (global quant tensor)
group_size, // elements
lane_id, // tid
threads_per_group, // stride
scalar_op_quant); // scalar handler
inline int GetGroupsPerBlock(int64_t num_groups) {
if (num_groups % 16 == 0) {
return 16;
}
if (num_groups % 8 == 0) {
return 8;
}
if (num_groups % 4 == 0) {
return 4;
}
if (num_groups % 2 == 0) {
return 2;
}
return 1;
}

void per_token_group_quant_8bit(const torch::Tensor& input,
@@ -133,17 +171,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,

constexpr int THREADS_PER_GROUP = 16;

int groups_per_block = 1;

if (num_groups % 16 == 0) {
groups_per_block = 16;
} else if (num_groups % 8 == 0) {
groups_per_block = 8;
} else if (num_groups % 4 == 0) {
groups_per_block = 4;
} else if (num_groups % 2 == 0) {
groups_per_block = 2;
}
const int groups_per_block = GetGroupsPerBlock(num_groups);

auto dst_type = output_q.scalar_type();
const int num_blocks = num_groups / groups_per_block;
@@ -225,8 +253,6 @@ __global__ void per_token_group_quant_8bit_packed_kernel(

const int64_t block_group_offset = global_group_id * group_size;

float local_absmax = eps;

const T* group_input = input + block_group_offset;
DST_DTYPE* group_output =
static_cast<DST_DTYPE*>(output_q) + block_group_offset;
@@ -235,29 +261,9 @@ __global__ void per_token_group_quant_8bit_packed_kernel(
extern __shared__ __align__(16) char smem_raw[];
T* smem = reinterpret_cast<T*>(smem_raw);
T* smem_group = smem + local_group_id * group_size;

constexpr int vec_size = 16 / sizeof(T);
using vec_t = vllm::vec_n_t<T, vec_size>;

// copy global -> shared & compute absmax
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
float abs_v = fabsf(static_cast<float>(src));
local_absmax = fmaxf(local_absmax, abs_v);
dst = src;
};

vllm::vectorize_with_alignment<vec_size>(
group_input, // in
smem_group, // out (shared)
group_size, // elements per group
lane_id, // thread id
threads_per_group, // stride in group
scalar_op_cache); // scalar handler

local_absmax = GroupReduceMax(local_absmax);

float y_s = local_absmax / max_8bit;
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
const float y_s =
ComputeGroupScale<T, true>(group_input, smem_group, group_size, lane_id,
threads_per_group, eps, max_8bit);

// pack 4 scales into a uint32
if (lane_id == 0) {
@@ -284,19 +290,8 @@ __global__ void per_token_group_quant_8bit_packed_kernel(

__syncthreads();

// quantize shared -> global 8-bit
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
dst = DST_DTYPE(q);
};

vllm::vectorize_with_alignment<vec_size>(
smem_group, // in (shared)
group_output, // out (global quant tensor)
group_size, // elements
lane_id, // tid
threads_per_group, // stride
scalar_op_quant); // scalar handler
QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
threads_per_group, y_s, min_8bit, max_8bit);
}

void per_token_group_quant_8bit_packed(const torch::Tensor& input,
@@ -337,17 +332,7 @@ void per_token_group_quant_8bit_packed(const torch::Tensor& input,

constexpr int THREADS_PER_GROUP = 16;

int groups_per_block = 1;

if (num_groups % 16 == 0) {
groups_per_block = 16;
} else if (num_groups % 8 == 0) {
groups_per_block = 8;
} else if (num_groups % 4 == 0) {
groups_per_block = 4;
} else if (num_groups % 2 == 0) {
groups_per_block = 2;
}
const int groups_per_block = GetGroupsPerBlock(num_groups);

auto dst_type = output_q.scalar_type();
const int num_blocks = num_groups / groups_per_block;


+ 2
- 0
docs/community/sponsors.md View File

@@ -24,11 +24,13 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
- IBM
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
- Red Hat
- Replicate
- Roblox
- RunPod


+ 16
- 6
docs/getting_started/quickstart.md View File

@@ -281,17 +281,27 @@ Alternatively, you can use the `openai` Python package:

Currently, vLLM supports multiple backends for efficient Attention computation across different platforms and accelerator architectures. It automatically selects the most performant backend compatible with your system and model specifications.

If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
If desired, you can also manually set the backend of your choice using the `--attention-backend` CLI argument:

```bash
# For online serving
vllm serve Qwen/Qwen2.5-1.5B-Instruct --attention-backend FLASH_ATTN

# For offline inference
python script.py --attention-backend FLASHINFER
```

Some of the available backend options include:

- On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`.
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.

For AMD ROCm, you can further control the specific Attention implementation using the following variables:
For AMD ROCm, you can further control the specific Attention implementation using the following options:

- Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
- AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
- Triton Prefill-Decode Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0`
- AITER Multi-head Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=1`
- Triton Unified Attention: Set the environment variables `VLLM_ROCM_USE_AITER=0 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument.
- AITER Unified Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument.
- Triton Prefill-Decode Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=true` as a CLI argument.
- AITER Multi-head Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_MHA=1` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument.

!!! warning
There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [docker/Dockerfile](../../docker/Dockerfile) for instructions on how to install it.

+ 1
- 1
tests/compile/distributed/test_fusions_e2e.py View File

@@ -20,7 +20,7 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer

from ...utils import flat_product, multi_gpu_test

is_blackwell = lambda: current_platform.is_device_capability(100)
is_blackwell = lambda: current_platform.is_device_capability_family(100)
"""Are we running on Blackwell, a lot of tests depend on it"""




+ 7
- 1
tests/conftest.py View File

@@ -702,10 +702,16 @@ class HfRunner:
**kwargs,
)

# Encoder-decoder models return decoder_hidden_states instead of
# hidden_states
hidden_states = (
getattr(output, "hidden_states", None) or output.decoder_hidden_states
)

(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs)
) = self._hidden_states_to_logprobs(hidden_states, num_logprobs)

all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]


+ 6
- 3
tests/entrypoints/openai/test_messages.py View File

@@ -79,9 +79,12 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):

assert chunk_count > 0
assert first_chunk is not None, "message_start chunk was never observed"
assert first_chunk.usage is not None, "first chunk should include usage stats"
assert first_chunk.usage["output_tokens"] == 0
assert first_chunk.usage["input_tokens"] > 5
assert first_chunk.message is not None, "first chunk should include message"
assert first_chunk.message.usage is not None, (
"first chunk should include usage stats"
)
assert first_chunk.message.usage.output_tokens == 0
assert first_chunk.message.usage.input_tokens > 5


@pytest.mark.asyncio


+ 1
- 1
tests/entrypoints/openai/test_serving_engine.py View File

@@ -10,7 +10,7 @@ import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer


@pytest.fixture()


+ 107
- 6
tests/entrypoints/test_chat_utils.py View File

@@ -29,7 +29,8 @@ from vllm.multimodal.utils import (
encode_image_base64,
encode_video_base64,
)
from vllm.tokenizers import MistralTokenizer, get_tokenizer
from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.serial_utils import tensor2base64

from ..models.registry import HF_EXAMPLE_MODELS
@@ -796,9 +797,13 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
"content": "<|image_1|>\nWhat's in this image?",
}
]

assert mm_data is not None
assert "image" in mm_data
assert mm_data["image"] is None
assert isinstance(mm_data["image"], list)
assert len(mm_data["image"]) == 1
assert mm_data["image"][0] is None

_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


@@ -825,10 +830,11 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
# Should have audio in mm_data as None (UUID provided)
assert mm_data is not None
assert "audio" in mm_data
assert mm_data["audio"] is None
assert isinstance(mm_data["audio"], list)
assert len(mm_data["audio"]) == 1
assert mm_data["audio"][0] is None

# UUID should be recorded
assert mm_uuids is not None
assert "audio" in mm_uuids
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[uuid])


@@ -1121,10 +1127,105 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
mm_data = await mm_future
assert mm_data is not None
assert "image" in mm_data
assert mm_data["image"] is None
assert isinstance(mm_data["image"], list)
assert len(mm_data["image"]) == 1
assert mm_data["image"][0] is None

_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


def test_parse_chat_messages_empty_dict_image_embeds(
phi3v_model_config_image_embeds,
):
"""Test that empty dictionary for image_embeds is handled without errors."""
conversation, mm_data, mm_uuids = parse_chat_messages(
[
{
"role": "user",
"content": [
{"type": "image_embeds", "image_embeds": {}},
{"type": "text", "text": "What's in this image?"},
],
}
],
phi3v_model_config_image_embeds,
content_format="string",
)

# Verify conversation structure
assert conversation == [
{
"role": "user",
"content": "<|image_1|>\nWhat's in this image?",
}
]

# Verify mm_data contains an empty dictionary of embeddings
assert mm_data is not None
assert "image" in mm_data
assert isinstance(mm_data["image"], dict)
assert len(mm_data["image"]) == 0

# Verify UUIDs (None since we didn't provide any)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_multiple_dict_image_embeds(
phi3v_model_config_image_embeds,
):
"""Test that multiple dictionaries for image_embeds is handled without errors."""
# Create two sample image embedding tensors
batch_size = 2
image_embedding_1 = torch.randn(batch_size, 256, 1024)
image_embedding_2 = torch.randn(batch_size, 3)

conversation, mm_data, mm_uuids = parse_chat_messages(
[
{
"role": "user",
"content": [
{
"type": "image_embeds",
"image_embeds": {
"image_embedding_1": tensor2base64(p),
"image_embedding_2": tensor2base64(i),
},
}
for p, i in zip(image_embedding_1, image_embedding_2)
]
+ [
{"type": "text", "text": "Describe these two images."},
],
}
],
phi3v_model_config_image_embeds,
content_format="string",
)

# Verify conversation structure
assert conversation == [
{
"role": "user",
"content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
}
]

# Verify mm_data contains a dictionary of multi-embeddings
assert mm_data is not None
assert "image" in mm_data
assert isinstance(mm_data["image"], dict)
assert len(mm_data["image"]) == batch_size

# Verify each embedding has the correct shape
assert isinstance(mm_data["image"]["image_embedding_1"], torch.Tensor)
assert mm_data["image"]["image_embedding_1"].shape == image_embedding_1.shape
assert isinstance(mm_data["image"]["image_embedding_2"], torch.Tensor)
assert mm_data["image"]["image_embedding_2"].shape == image_embedding_2.shape

# Verify UUIDs (None since we didn't provide any)
_assert_mm_uuids(mm_uuids, batch_size, expected_uuids=[None, None])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
phi3v_model_config,


+ 2
- 2
tests/kernels/attention/test_cutlass_mla_decode.py View File

@@ -32,8 +32,8 @@ def cal_diff(


CUTLASS_MLA_UNSUPPORTED_REASON = (
"Cutlass MLA Requires compute capability of 10 or above."
if not current_platform.is_device_capability(100)
"Cutlass MLA Requires compute capability of 100 or above."
if not current_platform.is_device_capability_family(100)
else "Cutlass MLA is supported"
)



+ 2
- 2
tests/kernels/attention/test_flashinfer_trtllm_attention.py View File

@@ -11,7 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up

if not current_platform.is_device_capability(100):
if not current_platform.is_device_capability_family(100):
pytest.skip(
"This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True
)
@@ -443,7 +443,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])

if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
rtol, atol = 1e-1, 2e-1
rtol, atol = 3e-1, 4e-1
elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
rtol, atol = 4e-2, 6e-2
elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:


+ 27
- 0
tests/kernels/attention/test_triton_unified_attention.py View File

@@ -7,6 +7,7 @@ import torch

from vllm.attention.ops.triton_unified_attention import unified_attention
from vllm.platforms import current_platform
from vllm.utils.math_utils import next_power_of_2

NUM_HEADS = [(4, 4), (8, 2)]
HEAD_SIZES = [128, 256]
@@ -22,6 +23,10 @@ QDTYPES = (
# one value small enough to test the schema op check
NUM_BLOCKS = [32768, 2048]

# 0: use 2D kernel for decode
# 8: use 3D kernel for decode
SEQ_THRESHOLD_3D_VALUES = [0, 8]


def ref_paged_attn(
query: torch.Tensor,
@@ -92,6 +97,7 @@ def ref_paged_attn(
@pytest.mark.parametrize("soft_cap", [None, 50.0])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("q_dtype", QDTYPES)
@pytest.mark.parametrize("seq_threshold_3D", SEQ_THRESHOLD_3D_VALUES)
@torch.inference_mode()
def test_triton_unified_attn(
seq_lens: list[tuple[int, int]],
@@ -103,6 +109,7 @@ def test_triton_unified_attn(
soft_cap: float | None,
num_blocks: int,
q_dtype: torch.dtype | None,
seq_threshold_3D: int,
) -> None:
torch.set_default_device("cuda")

@@ -152,6 +159,21 @@ def test_triton_unified_attn(
k_descale = torch.rand(scale_shape, dtype=torch.float32)
v_descale = torch.rand(scale_shape, dtype=torch.float32)

num_par_softmax_segments = 16
head_size_padded = next_power_of_2(head_size)
softmax_segm_output = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments, head_size_padded),
dtype=torch.float32,
)
softmax_segm_max = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments),
dtype=torch.float32,
)
softmax_segm_expsum = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments),
dtype=torch.float32,
)

unified_attention(
q=maybe_quantized_query,
k=maybe_quantized_key_cache,
@@ -169,6 +191,11 @@ def test_triton_unified_attn(
q_descale=q_descale,
k_descale=k_descale,
v_descale=v_descale,
seq_threshold_3D=seq_threshold_3D,
num_par_softmax_segments=num_par_softmax_segments,
softmax_segm_output=softmax_segm_output,
softmax_segm_max=softmax_segm_max,
softmax_segm_expsum=softmax_segm_expsum,
)

ref_output = ref_paged_attn(


+ 2
- 2
tests/kernels/moe/test_ocp_mx_moe.py View File

@@ -17,7 +17,7 @@ QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
) >= version.parse("0.8.99")

TRTLLM_GEN_MXFP4_AVAILABLE = (
current_platform.is_cuda() and current_platform.is_device_capability(100)
current_platform.is_cuda() and current_platform.is_device_capability_family(100)
)

HOPPER_MXFP4_BF16_AVAILABLE = (
@@ -799,7 +799,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
@pytest.mark.skipif(
not (
current_platform.is_cuda()
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
and has_flashinfer()
),
reason="NVIDIA GPU sm100 and flashinfer are required for this test",


+ 2
- 2
tests/kernels/quantization/test_fp8_quant_group.py View File

@@ -62,7 +62,7 @@ def test_quantfp8_group_functionality(
assert scales_col.stride(1) == batch_size

# Test column-major scales consistency
assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
torch.testing.assert_close(scales_col, scales_native, rtol=1e-9, atol=1e-8)

# 3. Test CUDA implementation (only for divisible dimensions)
if is_divisible:
@@ -71,7 +71,7 @@ def test_quantfp8_group_functionality(
assert scales_cuda.shape == (batch_size, expected_num_groups)

# Verify CUDA/native consistency
assert torch.allclose(scales_cuda, scales_native, rtol=1e-9, atol=1e-8)
torch.testing.assert_close(scales_cuda, scales_native, rtol=2e-7, atol=2e-8)

# Quantized values should mostly match
diff_count = (x_quant_cuda != x_quant_native).sum().item()


+ 91
- 0
tests/kernels/quantization/test_scaled_mm_kernel_selection.py View File

@@ -0,0 +1,91 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for ScaledMM kernel selection logic (CPU-only)

Run `pytest tests/kernels/quantization/test_scaled_mm_kernel_selection.py`.
"""

import inspect
from abc import ABC

import pytest

from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
ScaledMMLinearLayerConfig,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import (
AiterScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import (
CPUScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
ScaledMMLinearKernel,
)

pytestmark = pytest.mark.cpu_test


def test_is_supported_is_abstract():
"""Test that is_supported() is properly defined as abstract."""
assert issubclass(ScaledMMLinearKernel, ABC)
assert hasattr(ScaledMMLinearKernel, "is_supported")


def test_cpu_kernel_implements_is_supported():
"""Test that CPUScaledMMLinearKernel implements is_supported() method."""
assert hasattr(CPUScaledMMLinearKernel, "is_supported"), (
"CPUScaledMMLinearKernel missing is_supported() method"
)
# Verify it's a classmethod by checking if it can be called with the class
# and by checking the method type
assert inspect.ismethod(CPUScaledMMLinearKernel.is_supported) or inspect.isfunction(
CPUScaledMMLinearKernel.is_supported
), "CPUScaledMMLinearKernel.is_supported() should be a classmethod"
# Verify it can be called as a classmethod
result, reason = CPUScaledMMLinearKernel.is_supported()
assert isinstance(result, bool), "is_supported() should return a bool"
assert reason is None or isinstance(reason, str), "reason should be str or None"


def test_aiter_kernel_implements_is_supported():
"""Test that AiterScaledMMLinearKernel implements is_supported() method."""
assert hasattr(AiterScaledMMLinearKernel, "is_supported"), (
"AiterScaledMMLinearKernel missing is_supported() method"
)
# Verify it's a classmethod by checking if it can be called with the class
# and by checking the method type
assert inspect.ismethod(
AiterScaledMMLinearKernel.is_supported
) or inspect.isfunction(AiterScaledMMLinearKernel.is_supported), (
"AiterScaledMMLinearKernel.is_supported() should be a classmethod"
)
# Verify it can be called as a classmethod
# (will return False on CPU, which is expected)
result, reason = AiterScaledMMLinearKernel.is_supported()
assert isinstance(result, bool), "is_supported() should return a bool"
assert reason is None or isinstance(reason, str), "reason should be str or None"
# On CPU, it should return False with a reason about requiring ROCm
# This validates the method works correctly even on non-ROCm platforms


def test_cpu_kernel_accepts_all_configs():
"""Test that CPUScaledMMLinearKernel accepts all config combinations."""
configs = [
ScaledMMLinearLayerConfig(
is_channelwise=False,
is_static_input_scheme=True,
input_symmetric=True,
),
ScaledMMLinearLayerConfig(
is_channelwise=True,
is_static_input_scheme=False,
input_symmetric=False,
),
]

for config in configs:
can_impl, reason = CPUScaledMMLinearKernel.can_implement(config)
assert can_impl, (
f"CPUScaledMMLinearKernel should accept config {config}: {reason}"
)

+ 1
- 1
tests/models/language/generation/test_mistral.py View File

@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

from ...utils import check_logprobs_close



+ 1
- 1
tests/models/multimodal/generation/test_voxtral.py View File

@@ -9,7 +9,7 @@ from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage

from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer


+ 123
- 111
tests/models/multimodal/generation/test_whisper.py View File

@@ -1,150 +1,146 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Any

import librosa
import pytest
from transformers import AutoModelForSpeechSeq2Seq

from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.platforms import current_platform

from ....conftest import VllmRunner
from ....conftest import HfRunner, PromptAudioInput, VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close

VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
HF_PROMPT = ""
# Whisper expects 16kHz audio
WHISPER_SAMPLE_RATE = 16000

PROMPTS = [
{
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
},
{ # Test explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "",
"multi_modal_data": {
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
},
]

EXPECTED = {
"openai/whisper-tiny": [
" He has birth words I spoke in the original corner of that. And a"
" little piece of black coat poetry. Mary had a little sandwich,"
" sweet, with white and snow. And everyone had it very went the last"
" would sure to go.",
" >> And the old one, fit John the way to Edgar Martinez. >> One more"
" to line down the field line for our base camp. Here comes joy. Here"
" is June and the third base. They're going to wave him in. The throw"
" to the plate will be late. The Mariners are going to play for the"
" American League Championship. I don't believe it. It just continues"
" by all five.",
],
"openai/whisper-small": [
" The first words I spoke in the original pornograph. A little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite a"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the old one pitch on the way to Edgar Martinez one month. Here"
" comes joy. Here is Junior to third base. They're gonna wave him"
" in. The throw to the plate will be late. The Mariners are going to"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my.",
],
"openai/whisper-medium": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite as"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez swung on the line"
" down the left field line for Obeyshev. Here comes Joy. Here is"
" Jorgen at third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh"
" my.",
],
"openai/whisper-large-v3": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its feet were quite as"
" slow, and everywhere that Mary went, the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
" Now the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my.",
],
"openai/whisper-large-v3-turbo": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its streets were quite"
" as slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
" down the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my.",
],
}

@pytest.fixture(autouse=True)
def use_spawn_for_whisper(monkeypatch):
"""Whisper has issues with forked workers, use spawn instead."""
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")


def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
model: str,
*,
max_model_len: int,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
dtype: str = "half",
enforce_eager: bool = True,
) -> None:
prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10
"""Inference result should be the same between hf and vllm.

All the audio fixtures for the test are from AudioAsset.
For huggingface runner, we provide the audio as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
"""
with vllm_runner(
model,
dtype=dtype,
max_model_len=448,
max_model_len=max_model_len,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
# TODO (NickLucche) figure out output differences with non-eager and re-enable
enforce_eager=True,
limit_mm_per_prompt={"audio": 2},
enforce_eager=enforce_eager,
disable_custom_all_reduce=True,
) as vllm_model:
llm = vllm_model.llm

sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
max_tokens=200,
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(
vllm_prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
)
for vllm_prompts, _, audios in inputs
]

with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(
hf_prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
)
for _, hf_prompts, audios in inputs
]

for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)

outputs = llm.generate(prompt_list, sampling_params)

for output, expected in zip(outputs, expected_list):
print(output.outputs[0].text)
assert output.outputs[0].text == expected
@pytest.fixture
def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
inputs = []
for asset in audio_assets:
audio, orig_sr = asset.audio_and_sample_rate
# Resample to Whisper's expected sample rate (16kHz)
if orig_sr != WHISPER_SAMPLE_RATE:
audio = librosa.resample(
audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
)
# vLLM prompts, HF prompts, audio inputs
inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
return inputs


@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
def test_models(vllm_runner, model, dtype) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=1,
dtype=dtype,
)
def check_model_available(model: str) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")


@pytest.mark.core_model
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
def test_models_cpu(vllm_runner, model, dtype) -> None:
# @create_new_process_for_each_test() does not work for some runners
# TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("enforce_eager", [True, False])
@create_new_process_for_each_test("spawn")
def test_models(
hf_runner,
vllm_runner,
model: str,
dtype: str,
num_logprobs: int,
input_audios,
enforce_eager: bool,
) -> None:
check_model_available(model)
if current_platform.is_cpu() and not enforce_eager:
pytest.skip("Skipping test for CPU with non-eager mode")
run_test(
hf_runner,
vllm_runner,
input_audios,
model,
tensor_parallel_size=1,
dtype=dtype,
max_model_len=448,
max_tokens=200,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
enforce_eager=enforce_eager,
)


@@ -152,15 +148,31 @@ def test_models_cpu(vllm_runner, model, dtype) -> None:
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@create_new_process_for_each_test()
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models_distributed(
hf_runner,
vllm_runner,
model,
distributed_executor_backend,
model: str,
distributed_executor_backend: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
input_audios,
) -> None:
check_model_available(model)
run_test(
hf_runner,
vllm_runner,
input_audios,
model,
dtype=dtype,
max_model_len=448,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=False,
)

+ 2
- 5
tests/models/multimodal/processing/test_common.py View File

@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
cached_tokenizer_from_config,
)
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tokenizers.mistral import MistralTokenizer

from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import (


+ 8
- 0
tests/models/multimodal/processing/test_tensor_schema.py View File

@@ -8,6 +8,7 @@ from typing import Any, TypeAlias

import numpy as np
import pytest
import torch
import torch.nn as nn
from PIL import Image

@@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype

from ....utils import create_new_process_for_each_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import dummy_hf_overrides
from .test_common import get_model_ids_to_test, get_text_token_prompts
@@ -136,6 +138,7 @@ def create_batched_mm_kwargs(
)


# TODO(Isotr0py): Don't initalize model during test
@contextmanager
def initialize_dummy_model(
model_cls: type[nn.Module],
@@ -150,16 +153,21 @@ def initialize_dummy_model(
backend="nccl",
)
initialize_model_parallel(tensor_model_parallel_size=1)

current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config)
with set_current_vllm_config(vllm_config=vllm_config):
with set_default_torch_dtype(model_config.dtype):
torch.set_default_device(current_platform.device_type)
model = model_cls(vllm_config=vllm_config)
torch.set_default_device(current_device)
yield model

del model
cleanup_dist_env_and_memory()


@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
def test_model_tensor_schema(model_id: str):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)


+ 8
- 7
tests/models/registry.py View File

@@ -356,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
"MistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4"
),
"MixtralForCausalLM": _HfExamplesInfo(
"mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -635,7 +635,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"HunYuanVLForConditionalGeneration": _HfExamplesInfo(
"tencent/HunyuanOCR",
is_available_online=False,
hf_overrides={"num_experts": 0},
),
"Idefics3ForConditionalGeneration": _HfExamplesInfo(
"HuggingFaceM4/Idefics3-8B-Llama3",
@@ -674,8 +674,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
"lightonai/LightOnOCR-1B",
is_available_online=False,
"lightonai/LightOnOCR-1B-1025"
),
"Llama4ForConditionalGeneration": _HfExamplesInfo(
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
@@ -779,8 +778,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"ministral-3": "mistralai/Ministral-3-3B-Instruct-2512",
},
tokenizer_mode="mistral",
# TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
is_available_online=False,
),
"QwenVLForConditionalGeneration": _HfExamplesInfo(
"Qwen/Qwen-VL",
@@ -843,7 +840,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online=False,
),
# [Encoder-decoder]
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),
"WhisperForConditionalGeneration": _HfExamplesInfo(
"openai/whisper-large-v3-turbo",
extras={"v3": "openai/whisper-large-v3"},
),
# [Cross-encoder]
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),
}
@@ -886,6 +886,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EagleMistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512",
speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle",
# TODO: revert once figuring out OOM in CI
is_available_online=False,
),
"LlamaForCausalLMEagle3": _HfExamplesInfo(


+ 2
- 2
tests/quantization/test_blackwell_moe.py View File

@@ -10,9 +10,9 @@ import pytest
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform

if not current_platform.is_device_capability(100):
if not current_platform.is_device_capability_family(100):
pytest.skip(
"This test only runs on Blackwell GPUs (SM100).", allow_module_level=True
"This test only runs on Blackwell GPUs (SM10x).", allow_module_level=True
)




+ 1
- 1
tests/reasoning/test_mistral_reasoning_parser.py View File

@@ -5,7 +5,7 @@ import pytest

from tests.reasoning.utils import run_reasoning_extraction_mistral
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

parser_name = "mistral"



+ 1
- 1
tests/reasoning/utils.py View File

@@ -4,7 +4,7 @@

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.reasoning import ReasoningParser
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer


class StreamingReasoningReconstructor:


+ 2
- 2
tests/test_inputs.py View File

@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config

pytestmark = pytest.mark.cpu_test

@@ -108,7 +108,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_config(model_config)
tokenizer = cached_tokenizer_from_config(model_config)
input_preprocessor = InputPreprocessor(model_config, tokenizer)

# HF processor adds sep token


+ 24
- 23
tests/tokenizers_/test_basic.py View File

@@ -3,38 +3,39 @@
from typing import _get_protocol_attrs # type: ignore

import pytest
from transformers import PreTrainedTokenizerBase
from transformers import (
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)

from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer


def _get_missing_attrs(obj: object, target: type):
return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)]


def _assert_tokenizer_like(tokenizer: object):
missing_attrs = _get_missing_attrs(tokenizer, TokenizerLike)
assert not missing_attrs, f"Missing attrs: {missing_attrs}"


def test_tokenizer_like_protocol():
assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer("gpt2", use_fast=False),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"

assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer("gpt2", use_fast=True),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"

assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"
tokenizer = get_tokenizer("gpt2", use_fast=False)
assert isinstance(tokenizer, PreTrainedTokenizer)
_assert_tokenizer_like(tokenizer)

tokenizer = get_tokenizer("gpt2", use_fast=True)
assert isinstance(tokenizer, PreTrainedTokenizerFast)
_assert_tokenizer_like(tokenizer)

tokenizer = get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
)
assert isinstance(tokenizer, MistralTokenizer)
_assert_tokenizer_like(tokenizer)


@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])


+ 1
- 1
tests/tokenizers_/test_detokenize.py View File

@@ -8,7 +8,7 @@ import pytest
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import (
FastIncrementalDetokenizer,


+ 21
- 2
tests/tokenizers_/test_registry.py View File

@@ -2,7 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer
import pytest

from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.registry import (
TokenizerRegistry,
get_tokenizer,
resolve_tokenizer_args,
)


class TestTokenizer(TokenizerLike):
@@ -40,10 +47,22 @@ class TestTokenizer(TokenizerLike):
return True


@pytest.mark.parametrize("runner_type", ["generate", "pooling"])
def test_resolve_tokenizer_args_idempotent(runner_type):
tokenizer_mode, tokenizer_name, args, kwargs = resolve_tokenizer_args(
"facebook/opt-125m",
runner_type=runner_type,
)

assert (tokenizer_mode, tokenizer_name, args, kwargs) == resolve_tokenizer_args(
tokenizer_name, *args, **kwargs
)


def test_customized_tokenizer():
TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)

tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
tokenizer = TokenizerRegistry.load_tokenizer("test_tokenizer", "abc")
assert isinstance(tokenizer, TestTokenizer)
assert tokenizer.path_or_repo_id == "abc"
assert tokenizer.bos_token_id == 0


+ 2
- 5
tests/tool_use/test_mistral_tool_parser.py View File

@@ -13,12 +13,9 @@ from partial_json_parser.core.options import Allow

from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolParser
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
get_tokenizer,
)
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
from vllm.tokenizers.mistral import MistralTokenizer


@pytest.fixture(scope="module")


+ 3
- 1
tests/v1/sample/test_logprobs.py View File

@@ -528,9 +528,11 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
),
],
)
@pytest.mark.parametrize("top_logprobs", [0, 3])
def test_spec_decode_logprobs(
logprobs_mode: LogprobsMode,
model_setup: tuple[str, str, str],
top_logprobs: int,
):
"""Spec decode logprobs should match those of the base model.

@@ -543,7 +545,7 @@ def test_spec_decode_logprobs(

prompt = "Hello world " * 50
sampling_params = SamplingParams(
temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False
)
method, model_name, spec_model_name = model_setup
max_model_len = 256


+ 1
- 1
tests/v1/sample/test_rejection_sampler.py View File

@@ -111,7 +111,7 @@ def create_sampling_metadata(
top_p=top_p,
top_k=top_k,
generators=generators,
max_num_logprobs=0,
max_num_logprobs=None,
no_penalties=no_penalties,
prompt_token_ids=prompt_token_ids,
frequency_penalties=frequency_penalties,


+ 1
- 1
tools/pre_commit/mypy.py View File

@@ -43,6 +43,7 @@ FILES = [
"vllm/worker",
"vllm/v1/core",
"vllm/v1/engine",
"vllm/v1/executor",
"vllm/v1/metrics",
"vllm/v1/pool",
"vllm/v1/sample",
@@ -60,7 +61,6 @@ SEPARATE_GROUPS = [
"vllm/model_executor",
# v1 related
"vllm/v1/attention",
"vllm/v1/executor",
"vllm/v1/kv_offload",
"vllm/v1/spec_decode",
"vllm/v1/structured_output",


+ 10
- 0
vllm/attention/layer.py View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer."""

import functools
from collections.abc import Callable
from typing import cast

@@ -17,6 +18,7 @@ from vllm.attention.backends.abstract import (
)
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import get_attn_backend
from vllm.attention.utils.fa_utils import get_flash_attn_version
from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer
from vllm.config import CacheConfig, get_current_vllm_config
@@ -524,6 +526,14 @@ class MultiHeadAttention(nn.Module):
AttentionBackendEnum.ROCM_AITER_FA,
}

self.fa_version = None
if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
self.fa_version = get_flash_attn_version()
assert self._flash_attn_varlen_func is not None
self._flash_attn_varlen_func = functools.partial(
self._flash_attn_varlen_func, fa_version=self.fa_version
)

logger.info_once(
f"Using {self.attn_backend} for MultiHeadAttention in multimodal encoder."
)


+ 30
- 39
vllm/attention/ops/triton_unified_attention.py View File

@@ -355,7 +355,7 @@ def kernel_unified_attention_2d(
@triton.jit
def kernel_unified_attention_3d(
segm_output_ptr,
# [num_tokens, num_query_heads, num_segments, head_size]
# [num_tokens, num_query_heads, num_segments, head_size_padded]
segm_max_ptr, # [num_tokens, num_query_heads, num_segments]
segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments]
query_ptr, # [num_tokens, num_query_heads, head_size]
@@ -749,6 +749,11 @@ def unified_attention(
q_descale,
k_descale,
v_descale,
seq_threshold_3D=None,
num_par_softmax_segments=None,
softmax_segm_output=None,
softmax_segm_max=None,
softmax_segm_expsum=None,
alibi_slopes=None,
output_scale=None,
qq_bias=None,
@@ -793,8 +798,19 @@ def unified_attention(
TILE_SIZE_PREFILL = 32
TILE_SIZE_DECODE = 16 if q.element_size() >= 2 else 32

# if batch contains a prefill
if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128:
# Launch the 2D kernel if
# 1. No intermediate tiled softmax buffers for the 3D kernel have been allocated, or
# 2. The batch includes at least one prefill request, or
# 3. The number of sequences exceeds the configured threshold
if (
seq_threshold_3D is None
or num_par_softmax_segments is None
or softmax_segm_output is None
or softmax_segm_max is None
or softmax_segm_expsum is None
or max_seqlen_q > 1
or num_seqs > seq_threshold_3D
):
kernel_unified_attention_2d[
(
total_num_q_blocks,
@@ -847,37 +863,12 @@ def unified_attention(
USE_FP8=output_scale is not None,
)
else:
# for initial version, NUM_SEGMENTS = 16 is chosen as a default
# value that showed good performance in tests
NUM_SEGMENTS = 16

segm_output = torch.empty(
q.shape[0],
num_query_heads,
NUM_SEGMENTS,
triton.next_power_of_2(head_size),
dtype=torch.float32,
device=q.device,
)
segm_max = torch.empty(
q.shape[0],
num_query_heads,
NUM_SEGMENTS,
dtype=torch.float32,
device=q.device,
)
segm_expsum = torch.empty(
q.shape[0],
num_query_heads,
NUM_SEGMENTS,
dtype=torch.float32,
device=q.device,
)

kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)](
segm_output_ptr=segm_output,
segm_max_ptr=segm_max,
segm_expsum_ptr=segm_expsum,
kernel_unified_attention_3d[
(total_num_q_blocks, num_kv_heads, num_par_softmax_segments)
](
segm_output_ptr=softmax_segm_output,
segm_max_ptr=softmax_segm_max,
segm_expsum_ptr=softmax_segm_expsum,
query_ptr=q,
key_cache_ptr=k,
value_cache_ptr=v,
@@ -917,13 +908,13 @@ def unified_attention(
BLOCK_Q=BLOCK_Q,
num_seqs=num_seqs,
BLOCK_M=BLOCK_M,
NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
NUM_SEGMENTS_PER_SEQ=num_par_softmax_segments,
)
reduce_segments[(q.shape[0], num_query_heads)](
output_ptr=out,
segm_output_ptr=segm_output,
segm_max_ptr=segm_max,
segm_expsum_ptr=segm_expsum,
segm_output_ptr=softmax_segm_output,
segm_max_ptr=softmax_segm_max,
segm_expsum_ptr=softmax_segm_expsum,
seq_lens_ptr=seqused_k,
num_seqs=num_seqs,
num_query_heads=num_query_heads,
@@ -936,6 +927,6 @@ def unified_attention(
HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
query_start_len_ptr=cu_seqlens_q,
BLOCK_Q=BLOCK_Q,
NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
NUM_SEGMENTS_PER_SEQ=num_par_softmax_segments,
USE_FP8=output_scale is not None,
)

+ 22
- 3
vllm/compilation/decorators.py View File

@@ -28,7 +28,7 @@ from vllm.config.compilation import DynamicShapesType
from vllm.logger import init_logger
from vllm.sequence import IntermediateTensors
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.torch_utils import supports_dynamo
from vllm.utils.torch_utils import is_torch_equal_or_newer, supports_dynamo

from .monitor import start_monitoring_torch_compile

@@ -316,7 +316,13 @@ def _support_torch_compile(
def _mark_dynamic_inputs(mod, type, *args, **kwargs):
def mark_dynamic(arg, dims):
if type == DynamicShapesType.UNBACKED:
torch._dynamo.decorators.mark_unbacked(arg, dims)
if is_torch_equal_or_newer("2.10.0.dev"):
for dim in dims:
torch._dynamo.decorators.mark_unbacked(
arg, dim, hint_override=arg.size()[dim]
)
else:
torch._dynamo.decorators.mark_unbacked(arg, dims)
else:
torch._dynamo.mark_dynamic(arg, dims)

@@ -350,7 +356,13 @@ def _support_torch_compile(
if isinstance(arg, torch.Tensor):
# In case dims is specified with negative indexing
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
torch._dynamo.decorators.mark_unbacked(arg, dims)
if is_torch_equal_or_newer("2.10.0.dev"):
for dim in dims:
torch._dynamo.decorators.mark_unbacked(
arg, dim, hint_override=arg.size()[dim]
)
else:
torch._dynamo.decorators.mark_unbacked(arg, dims)

def __call__(self, *args, **kwargs):
# torch.compiler.is_compiling() means we are inside the compilation
@@ -488,6 +500,12 @@ def _support_torch_compile(
if ds_type == DynamicShapesType.BACKED_SIZE_OBLIVIOUS:
fx_config_patches["backed_size_oblivious"] = True

# Prepare inductor config patches
# assume_32bit_indexing is only available in torch 2.10.0.dev+
inductor_config_patches = {}
if is_torch_equal_or_newer("2.10.0.dev"):
inductor_config_patches["assume_32bit_indexing"] = True

with (
patch.object(
InliningInstructionTranslator, "inline_call_", patched_inline_call
@@ -496,6 +514,7 @@ def _support_torch_compile(
maybe_use_cudagraph_partition_wrapper(self.vllm_config),
torch.fx.experimental._config.patch(**fx_config_patches),
_torch27_patch_tensor_subclasses(),
torch._inductor.config.patch(**inductor_config_patches),
):
if envs.VLLM_USE_AOT_COMPILE:
self.aot_compiled_fn = self.aot_compile(*args, **kwargs)


+ 7
- 1
vllm/engine/arg_utils.py View File

@@ -1649,7 +1649,13 @@ class EngineArgs:
"attention_backend and attention_config.backend "
"are mutually exclusive"
)
attention_config.backend = self.attention_backend
# Convert string to enum if needed (CLI parsing returns a string)
if isinstance(self.attention_backend, str):
attention_config.backend = AttentionBackendEnum[
self.attention_backend.upper()
]
else:
attention_config.backend = self.attention_backend

load_config = self.create_load_config()



+ 6
- 6
vllm/entrypoints/anthropic/serving_messages.py View File

@@ -324,12 +324,12 @@ class AnthropicServingMessages(OpenAIServingChat):
id=origin_chunk.id,
content=[],
model=origin_chunk.model,
),
usage=AnthropicUsage(
input_tokens=origin_chunk.usage.prompt_tokens
if origin_chunk.usage
else 0,
output_tokens=0,
usage=AnthropicUsage(
input_tokens=origin_chunk.usage.prompt_tokens
if origin_chunk.usage
else 0,
output_tokens=0,
),
),
)
first_item = False


+ 72
- 31
vllm/entrypoints/chat_utils.py View File

@@ -9,7 +9,7 @@ from collections import Counter, defaultdict, deque
from collections.abc import Awaitable, Callable, Iterable
from functools import cached_property, lru_cache, partial
from pathlib import Path
from typing import Any, Generic, Literal, TypeAlias, TypeVar, cast
from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast

import jinja2
import jinja2.ext
@@ -49,11 +49,20 @@ from vllm.logger import init_logger
from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
from vllm.transformers_utils.processor import cached_get_processor
from vllm.utils import random_uuid
from vllm.utils.collection_utils import is_list_of
from vllm.utils.func_utils import supports_kw
from vllm.utils.import_utils import LazyLoader

if TYPE_CHECKING:
import torch

from vllm.tokenizers.mistral import MistralTokenizer
else:
torch = LazyLoader("torch", globals(), "torch")

logger = init_logger(__name__)

@@ -620,6 +629,44 @@ ModalityStr = Literal["image", "audio", "video", "image_embeds", "audio_embeds"]
_T = TypeVar("_T")


def _extract_embeds(tensors: list[torch.Tensor]):
if len(tensors) == 0:
return tensors

if len(tensors) == 1:
tensors[0]._is_single_item = True # type: ignore
return tensors[0] # To keep backwards compatibility for single item input

first_shape = tensors[0].shape
if all(t.shape == first_shape for t in tensors):
return torch.stack(tensors)

return tensors


def _get_embeds_data(items_by_modality: dict[str, list[Any]], modality: str):
embeds_key = f"{modality}_embeds"
embeds = items_by_modality[embeds_key]

if len(embeds) == 0:
return embeds
if is_list_of(embeds, torch.Tensor):
return _extract_embeds(embeds)
if is_list_of(embeds, dict):
if not embeds:
return {}

first_keys = set(embeds[0].keys())
if any(set(item.keys()) != first_keys for item in embeds[1:]):
raise ValueError(
"All dictionaries in the list of embeddings must have the same keys."
)

return {k: _extract_embeds([item[k] for item in embeds]) for k in first_keys}

return embeds


class BaseMultiModalItemTracker(ABC, Generic[_T]):
"""
Tracks multi-modal items in a given request and ensures that the number
@@ -688,11 +735,14 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
def all_mm_uuids(self) -> MultiModalUUIDDict | None:
if not self._items_by_modality:
return None
mm_uuids = {}
uuids_by_modality = dict(self._uuids_by_modality)
if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality:
raise ValueError("Mixing raw image and embedding inputs is not allowed")
if "audio" in uuids_by_modality and "audio_embeds" in uuids_by_modality:
raise ValueError("Mixing raw audio and embedding inputs is not allowed")

mm_uuids = {}
if "image_embeds" in uuids_by_modality:
mm_uuids["image"] = uuids_by_modality["image_embeds"]
if "image" in uuids_by_modality:
@@ -703,6 +753,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios
if "video" in uuids_by_modality:
mm_uuids["video"] = uuids_by_modality["video"] # UUIDs of videos

return mm_uuids

@abstractmethod
@@ -714,29 +765,25 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
def all_mm_data(self) -> MultiModalDataDict | None:
if not self._items_by_modality:
return None
mm_inputs = {}
items_by_modality = dict(self._items_by_modality)
if "image" in items_by_modality and "image_embeds" in items_by_modality:
raise ValueError("Mixing raw image and embedding inputs is not allowed")
if "audio" in items_by_modality and "audio_embeds" in items_by_modality:
raise ValueError("Mixing raw audio and embedding inputs is not allowed")

mm_inputs = {}
if "image_embeds" in items_by_modality:
image_embeds_lst = items_by_modality["image_embeds"]
mm_inputs["image"] = (
image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0]
)
mm_inputs["image"] = _get_embeds_data(items_by_modality, "image")
if "image" in items_by_modality:
mm_inputs["image"] = items_by_modality["image"] # A list of images
if "audio_embeds" in items_by_modality:
audio_embeds_lst = items_by_modality["audio_embeds"]
mm_inputs["audio"] = (
audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0]
)
mm_inputs["audio"] = _get_embeds_data(items_by_modality, "audio")
if "audio" in items_by_modality:
mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
if "video" in items_by_modality:
mm_inputs["video"] = items_by_modality["video"] # A list of videos

return mm_inputs

def create_parser(self) -> "BaseMultiModalContentParser":
@@ -747,38 +794,32 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
async def all_mm_data(self) -> MultiModalDataDict | None:
if not self._items_by_modality:
return None
mm_inputs = {}
items_by_modality = {}
for modality, items in self._items_by_modality.items():
coros = []
for item in items:
if item is not None:
coros.append(item)
else:
coros.append(asyncio.sleep(0))
items_by_modality[modality] = await asyncio.gather(*coros)

coros_by_modality = {
modality: [item or asyncio.sleep(0) for item in items]
for modality, items in self._items_by_modality.items()
}
items_by_modality: dict[str, list[object | None]] = {
modality: await asyncio.gather(*coros)
for modality, coros in coros_by_modality.items()
}
if "image" in items_by_modality and "image_embeds" in items_by_modality:
raise ValueError("Mixing raw image and embedding inputs is not allowed")
if "audio" in items_by_modality and "audio_embeds" in items_by_modality:
raise ValueError("Mixing raw audio and embedding inputs is not allowed")

mm_inputs = {}
if "image_embeds" in items_by_modality:
image_embeds_lst = items_by_modality["image_embeds"]
mm_inputs["image"] = (
image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0]
)
mm_inputs["image"] = _get_embeds_data(items_by_modality, "image")
if "image" in items_by_modality:
mm_inputs["image"] = items_by_modality["image"] # A list of images
if "audio_embeds" in items_by_modality:
audio_embeds_lst = items_by_modality["audio_embeds"]
mm_inputs["audio"] = (
audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0]
)
mm_inputs["audio"] = _get_embeds_data(items_by_modality, "audio")
if "audio" in items_by_modality:
mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
if "video" in items_by_modality:
mm_inputs["video"] = items_by_modality["video"] # A list of videos

return mm_inputs

def create_parser(self) -> "BaseMultiModalContentParser":
@@ -1792,7 +1833,7 @@ def apply_hf_chat_template(


def apply_mistral_chat_template(
tokenizer: MistralTokenizer,
tokenizer: "MistralTokenizer",
messages: list[ChatCompletionMessageParam],
chat_template: str | None,
tools: list[dict[str, Any]] | None,


+ 2
- 1
vllm/entrypoints/llm.py View File

@@ -72,7 +72,8 @@ from vllm.platforms import current_platform
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams
from vllm.tasks import PoolingTask
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.usage.usage_lib import UsageContext
from vllm.utils.collection_utils import as_iter, is_list_of
from vllm.utils.counter import Counter


+ 3
- 1
vllm/entrypoints/openai/serving_engine.py View File

@@ -117,7 +117,9 @@ from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.deepseekv32 import DeepseekV32Tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tracing import (
contains_trace_headers,
extract_trace_headers,


+ 2
- 1
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py View File

@@ -22,7 +22,8 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer

logger = init_logger(__name__)



+ 2
- 1
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py View File

@@ -21,7 +21,8 @@ from vllm.entrypoints.openai.protocol import (
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer

logger = init_logger(__name__)



+ 4
- 1
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py View File

@@ -6,6 +6,7 @@ from collections.abc import Sequence
from enum import Enum, auto
from random import choices
from string import ascii_letters, digits
from typing import Any

import ijson
import regex as re
@@ -24,7 +25,8 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer

logger = init_logger(__name__)

@@ -84,6 +86,7 @@ class MistralToolParser(ToolParser):

# initialize properties used for state when parsing tool calls in
# streaming mode
self.prev_tool_call_arr: list[dict[str, Any]] = []
self.current_tool_id: int = -1
self.streaming_state: StreamingState = StreamingState.WAITING_FOR_TOOL_START



+ 2
- 1
vllm/entrypoints/pooling/score/serving.py View File

@@ -38,7 +38,8 @@ from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.async_utils import make_async, merge_async_iterators

logger = init_logger(__name__)


+ 1
- 1
vllm/entrypoints/utils.py View File

@@ -30,7 +30,7 @@ from vllm.entrypoints.openai.protocol import (
from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser

logger = init_logger(__name__)


+ 1
- 1
vllm/model_executor/layers/batch_invariant.py View File

@@ -936,7 +936,7 @@ def enable_batch_invariant_mode():
# Batch invariant matmuls are no longer needed after cublas overrides
if not is_torch_equal_or_newer("2.10.0.dev"):
if (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
or current_platform.is_device_capability(80)
or current_platform.is_device_capability(89)
):


+ 4
- 1
vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py View File

@@ -287,7 +287,10 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
"""
DeepGemm supports packed ue8m0 activation scales format in devices == sm100
"""
return is_deep_gemm_e8m0_used() and current_platform.is_device_capability(100)
return (
is_deep_gemm_e8m0_used()
and current_platform.is_device_capability_family(100)
)

def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
# Let PrepareAndFinalize::finalize() decide the impl.


+ 6
- 2
vllm/model_executor/layers/fused_moe/layer.py View File

@@ -1200,10 +1200,14 @@ class FusedMoE(CustomOp):
if full_load:
shard_dim += 1

# Materialize GGUF UninitializedParameter
# Materialize GGUF UninitializedParameter accounting merged weights
if is_gguf_weight and isinstance(param, UninitializedParameter):
# To materialize a tensor, we must have full shape including
# number of experts, making this portion to require `full_load`.
assert full_load
final_shape = list(loaded_weight.shape)
if shard_id in ["w1", "w3"]:
# w1 and w3 are merged per expert.
if shard_id in {"w1", "w3"}:
final_shape[1] *= 2
final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
param.materialize(final_shape, dtype=loaded_weight.dtype)


+ 3
- 3
vllm/model_executor/layers/quantization/fp8.py View File

@@ -137,7 +137,7 @@ def get_fp8_moe_backend(
if (
current_platform.is_cuda()
and (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
or current_platform.is_device_capability(90)
)
and envs.VLLM_USE_FLASHINFER_MOE_FP8
@@ -148,7 +148,7 @@ def get_fp8_moe_backend(
logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100")
return Fp8MoeBackend.FLASHINFER_TRTLLM
else:
if block_quant and current_platform.is_device_capability(100):
if block_quant and current_platform.is_device_capability_family(100):
raise ValueError(
"FlashInfer FP8 MoE throughput backend does not "
"support block quantization. Please use "
@@ -193,7 +193,7 @@ def get_fp8_moe_backend(
# CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
if (
current_platform.is_cuda()
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
and block_quant
):
logger.info_once(


+ 4
- 1
vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py View File

@@ -17,7 +17,9 @@ class ScaledMMLinearLayerConfig:
class ScaledMMLinearKernel(ABC):
@classmethod
@abstractmethod
def get_min_capability(cls) -> int:
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
raise NotImplementedError

@classmethod
@@ -35,6 +37,7 @@ class ScaledMMLinearKernel(ABC):
azp_adj_param_name: str,
) -> None:
assert self.can_implement(c)
assert self.is_supported()
self.config = c
self.w_q_name = w_q_param_name
self.w_s_name = w_s_param_name


+ 12
- 28
vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py View File

@@ -27,7 +27,7 @@ from vllm.platforms import PlatformEnum, current_platform
# in priority/performance order (when available)
_POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = {
PlatformEnum.CPU: [CPUScaledMMLinearKernel],
PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
PlatformEnum.CUDA: [CutlassScaledMMLinearKernel, TritonScaledMMLinearKernel],
PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel],
PlatformEnum.TPU: [XLAScaledMMLinearKernel],
}
@@ -55,41 +55,25 @@ def choose_scaled_mm_linear_kernel(
type[ScaledMMLinearKernel]: Chosen kernel.
"""

if compute_capability is None:
_cc = current_platform.get_device_capability()
if _cc is not None:
compute_capability = _cc[0] * 10 + _cc[1]

failure_reasons = []
for kernel in _POSSIBLE_KERNELS[current_platform._enum]:
if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "").split(","):
failure_reasons.append(
f" {kernel.__name__} disabled by environment variable"
)
failure_reasons.append(f"{kernel.__name__}: disabled by env var")
continue

# If the current platform uses compute_capability,
# make sure the kernel supports the compute cability.
if compute_capability is not None:
kernel_min_capability = kernel.get_min_capability()
if (
kernel_min_capability is not None
and kernel_min_capability > compute_capability
):
failure_reasons.append(
f"{kernel.__name__} requires capability "
f"{kernel_min_capability}, current compute capability "
f"is {compute_capability}"
)
continue
is_supported, reason = kernel.is_supported(compute_capability)
if not is_supported:
failure_reasons.append(f"{kernel.__name__}: {reason}")
continue

can_implement, reason = kernel.can_implement(config)
if not can_implement:
failure_reasons.append(f"{kernel.__name__}: {reason}")
continue

can_implement, failure_reason = kernel.can_implement(config)
if can_implement:
return kernel
else:
failure_reasons.append(
f" {kernel.__name__} cannot implement due to: {failure_reason}"
)
return kernel

raise ValueError(
"Failed to find a kernel that can implement the "


+ 15
- 7
vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py View File

@@ -14,17 +14,21 @@ from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig

class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
return 90

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if not current_platform.is_rocm():
return (
False,
"AiterScaledMMLinearKernel requires `aiter` which is not "
+ "currently supported on non-ROCm platform.",
)
if compute_capability is None:
_cc = current_platform.get_device_capability()
if _cc is not None:
compute_capability = _cc.major * 10 + _cc.minor
if compute_capability is not None and compute_capability < 90:
return False, f"requires capability 90, got {compute_capability}"

try:
import aiter # noqa: F401 # deliberately attempt to import aiter
@@ -34,8 +38,8 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
"AiterScaledMMLinearKernel requires `aiter` which is not "
+ "installed on ROCm.",
)
# Check if rocm_aiter_gemm_w8a8_scaled_mm is enabled
if not (rocm_aiter_ops.is_linear_enabled()):
if not rocm_aiter_ops.is_linear_enabled():
return (
False,
"AiterScaledMMLinearKernel is disabled. "
@@ -44,6 +48,10 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
+ "`VLLM_ROCM_USE_AITER_LINEAR` default is True.",
)

return True, None

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
if not c.input_symmetric:
return (
False,


+ 6
- 5
vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py View File

@@ -19,14 +19,15 @@ from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfi

class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
return 75
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if not current_platform.is_cpu():
return False, "Requires CPU."
return True, None

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
if not current_platform.is_cpu():
return False, "CPUScaledMM requires running on CPU."

return True, None

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:


+ 12
- 5
vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py View File

@@ -16,14 +16,21 @@ from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfi

class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
return 75
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if not current_platform.is_cuda():
return False, "Requires CUDA."
if compute_capability is None:
_cc = current_platform.get_device_capability()
if _cc is not None:
compute_capability = _cc.major * 10 + _cc.minor
if compute_capability is not None and compute_capability < 75:
return False, f"requires capability 75, got {compute_capability}"
return True, None

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
if not current_platform.is_cuda():
return False, "CutlassScaledMM requires running on CUDA."

return True, None

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:


+ 46
- 17
vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py View File

@@ -4,34 +4,53 @@

import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import ( # noqa: E501
triton_scaled_mm,
)
from vllm.model_executor.layers.quantization.utils import replace_parameter
from vllm.platforms import current_platform

from .cutlass import CutlassScaledMMLinearKernel
from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig


class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
class TritonScaledMMLinearKernel(ScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
return 75
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if current_platform.is_cuda_alike():
return True, None
return False, "Requires ROCm or CUDA."

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
if current_platform.is_cpu():
return (
False,
"TritonScaledMMLinearKernel requires Triton which is not "
+ "currently supported on CPU.",
)
if not c.input_symmetric:
return (
False,
"TritonScaledMMLinearKernel only supports symmetric " + "quantization.",
)
return False, "Only symmetric input is supported."
return True, None

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
super().process_weights_after_loading(layer)
weight = getattr(layer, self.w_q_name)
replace_parameter(
layer,
self.w_q_name,
torch.nn.Parameter(weight.t().data, requires_grad=False),
)

# INPUT SCALE
if self.config.is_static_input_scheme:
input_scale = getattr(layer, self.i_s_name)
replace_parameter(
layer,
self.i_s_name,
torch.nn.Parameter(input_scale.max(), requires_grad=False),
)
setattr(layer, self.i_zp_name, None)
else:
setattr(layer, self.i_s_name, None)
setattr(layer, self.i_zp_name, None)

setattr(layer, self.azp_adj_name, None)

def apply_weights(
self,
@@ -39,4 +58,14 @@ class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
x: torch.Tensor,
bias: torch.Tensor | None = None,
) -> torch.Tensor:
return super().apply_weights(layer, x, bias)
w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)

x_q, x_s, x_zp = ops.scaled_int8_quant(
x.contiguous(), i_s, i_zp, symmetric=True
)

assert x_zp is None, "Triton kernel only supports symmetric quantization"

return triton_scaled_mm(
x_q, w_q, scale_a=x_s, scale_b=w_s, out_dtype=x.dtype, bias=bias
)

+ 6
- 5
vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py View File

@@ -17,11 +17,12 @@ from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfi

class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
raise NotImplementedError(
"TPU platform does have a concept of compute capability, "
"this method should not be called."
)
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if not current_platform.is_tpu():
return False, "Requires TPU."
return True, None

@classmethod
def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:


+ 48
- 0
vllm/model_executor/layers/quantization/modelopt.py View File

@@ -81,6 +81,7 @@ from vllm.utils.flashinfer import (
has_flashinfer,
has_flashinfer_moe,
)
from vllm.utils.math_utils import round_up

if TYPE_CHECKING:
from vllm.model_executor.models.utils import WeightsMapper
@@ -607,6 +608,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
Only supports pre-quantized checkpoints with FP8 weights and scales.
"""

if self.flashinfer_moe_backend is not None:
self._maybe_pad_intermediate_for_flashinfer(layer)

layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)

@@ -684,6 +688,50 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
register_moe_scaling_factors(layer)

def _maybe_pad_intermediate_for_flashinfer(self, layer: torch.nn.Module) -> None:
"""Pad intermediate size so FlashInfer kernels' alignment constraints hold.

Some FlashInfer FP8 MoE kernels require the (gated) intermediate size
used for GEMM to be divisible by a small alignment value. When this is
not satisfied (e.g. with certain tensor-parallel sizes), we pad the
gate/up and down projection weights along the intermediate dim.
"""
if not hasattr(layer, "w13_weight") or not hasattr(layer, "w2_weight"):
return

# Current local intermediate size (per partition) is the K dimension of
# the down projection.
num_experts, hidden_size, intermediate = layer.w2_weight.shape

min_alignment = 16
padded_intermediate = round_up(intermediate, min_alignment)

if padded_intermediate == intermediate:
return

logger.info(
"Padding intermediate size from %d to %d for up/down projection weights.",
intermediate,
padded_intermediate,
)

up_mult = 2 if self.moe.is_act_and_mul else 1
padded_gate_up_dim = up_mult * padded_intermediate

# Pad w13 and w12 along its intermediate dimension.
w13 = layer.w13_weight.data
padded_w13 = w13.new_zeros((num_experts, padded_gate_up_dim, hidden_size))
padded_w13[:, : w13.shape[1], :] = w13
layer.w13_weight.data = padded_w13

w2 = layer.w2_weight.data
padded_w2 = w2.new_zeros((num_experts, hidden_size, padded_intermediate))
padded_w2[:, :, :intermediate] = w2
layer.w2_weight.data = padded_w2

if hasattr(layer, "intermediate_size_per_partition"):
layer.intermediate_size_per_partition = padded_intermediate

def get_fused_moe_quant_config(
self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:


+ 4
- 4
vllm/model_executor/layers/quantization/mxfp4.py View File

@@ -118,19 +118,19 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
logger.info_once("Using FlashInfer MXFP4 BF16 backend for SM90")
return Mxfp4Backend.SM90_FI_MXFP4_BF16
elif (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
and has_flashinfer()
and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
):
logger.info_once("Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100")
return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
elif (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
and has_flashinfer()
and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
):
return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
elif current_platform.is_device_capability(100) and has_flashinfer():
elif current_platform.is_device_capability_family(100) and has_flashinfer():
logger.info_once(
"Using FlashInfer MXFP4 BF16 backend for SM100, "
"For faster performance on SM100, consider setting "
@@ -139,7 +139,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
)
return Mxfp4Backend.SM100_FI_MXFP4_BF16
elif (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
or current_platform.is_device_capability(90)
) and not has_flashinfer():
logger.warning_once(


+ 1
- 1
vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py View File

@@ -50,7 +50,7 @@ def is_flashinfer_fp4_cutedsl_moe_available() -> bool:
envs.VLLM_USE_FLASHINFER_MOE_FP4
and has_flashinfer_cutedsl_grouped_gemm_nt_masked()
and current_platform.is_cuda()
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
)




+ 1
- 1
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py View File

@@ -290,7 +290,7 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
if flashinfer_moe_backend in backend_map:
if (
flashinfer_moe_backend == "latency"
and not current_platform.has_device_capability(100)
and not current_platform.is_device_capability_family(100)
):
logger.info_once(
"Flashinfer TRTLLM MOE backend is only supported on "


+ 6
- 3
vllm/model_executor/layers/quantization/utils/fp8_utils.py View File

@@ -247,7 +247,7 @@ class W8A8BlockFp8LinearOp:
self.act_quant_group_shape = act_quant_group_shape
self.is_deep_gemm_supported = is_deep_gemm_supported()
self.is_hopper = current_platform.is_device_capability(90)
self.is_blackwell = current_platform.is_device_capability(100)
self.is_blackwell = current_platform.is_device_capability_family(100)
self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()

# Get the correct blockscale mul and input quant operations.
@@ -762,9 +762,12 @@ def per_token_group_quant_fp8(
)
assert x.stride(-1) == 1, "`x` groups must be contiguous"

# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm
# platforms that use the torch.float8_e4mefnuz dtype.
finfo = torch.finfo(dtype)
fp8_min = finfo.min
fp8_max = finfo.max
fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min
fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max

assert out_q is None or out_q.shape == x.shape
x_q = out_q


+ 12
- 6
vllm/model_executor/layers/quantization/utils/mxfp4_utils.py View File

@@ -57,12 +57,18 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
mx_axis=1, num_warps=num_warps
)
)
if current_platform.is_cuda() and current_platform.is_device_capability(100):
constraints = {
"is_persistent": True,
"epilogue_subtile": 1,
}
opt_flags.update_opt_flags_constraints(constraints)
if current_platform.is_cuda():
if current_platform.is_device_capability(90):
constraints = {
"split_k": 1,
}
opt_flags.update_opt_flags_constraints(constraints)
elif current_platform.is_device_capability_family(100):
constraints = {
"is_persistent": True,
"epilogue_subtile": 1,
}
opt_flags.update_opt_flags_constraints(constraints)
# transpose the tensor so that the quantization axis is on dim1
quant_tensor = quant_tensor.transpose(-2, -1)
scale = scale.transpose(-2, -1)


+ 1
- 1
vllm/model_executor/models/config.py View File

@@ -363,7 +363,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
else:
kernel_block_alignment_size = 16
if (
current_platform.is_device_capability(100)
current_platform.is_device_capability_family(100)
and model_config.get_head_size() == 256
and (
attention_config.backend is None


+ 2
- 1
vllm/model_executor/models/pixtral.py View File

@@ -59,7 +59,8 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.tensor_schema import TensorSchema, TensorShape

from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP


+ 7
- 0
vllm/model_executor/models/qwen3_next.py View File

@@ -1092,6 +1092,8 @@ class Qwen3NextModel(nn.Module):
name.endswith(".bias") or name.endswith("_bias")
) and name not in params_dict:
continue
if name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(
@@ -1108,6 +1110,11 @@ class Qwen3NextModel(nn.Module):
continue
if is_pp_missing_parameter(name, self):
continue
if name not in params_dict:
logger.warning_once(
f"Parameter {name} not found in params_dict, skip loading"
)
continue
param = params_dict[name]
weight_loader = getattr(
param, "weight_loader", default_weight_loader


+ 2
- 1
vllm/model_executor/models/voxtral.py View File

@@ -51,7 +51,8 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.tokenizers.mistral import MistralTokenizer

from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
from .utils import init_vllm_registered_model, maybe_prefix


+ 1
- 1
vllm/platforms/cuda.py View File

@@ -182,7 +182,7 @@ class CudaPlatformBase(Platform):

if vllm_config.attention_config.backend is None:
# Default case
if cls.is_device_capability(100) and not use_sparse:
if cls.is_device_capability_family(100) and not use_sparse:
# Blackwell => Force CutlassMLA (unless sparse, i.e. DSv3.2).
use_cutlass_mla = True
# Set the backend in AttentionConfig so it's used during


+ 15
- 0
vllm/platforms/interface.py View File

@@ -301,6 +301,21 @@ class Platform:

return current_capability.to_int() == capability

@classmethod
def is_device_capability_family(
cls,
capability: int,
device_id: int = 0,
) -> bool:
"""
Returns True if the device capability is any <major>.x.
Mirrors CUDA 13 'family' architecture semantics (e.g. 10.x, 11.x, 12.x).
"""
current_capability = cls.get_device_capability(device_id=device_id)
if current_capability is None:
return False
return (current_capability.to_int() // 10) == (capability // 10)

@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
"""Get the name of a device."""


+ 1
- 1
vllm/reasoning/mistral_reasoning_parser.py View File

@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.protocol import (
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

logger = init_logger(__name__)



+ 0
- 6
vllm/tokenizers/__init__.py View File

@@ -1,9 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from .deepseekv32 import DeepseekV32Tokenizer
from .hf import HfTokenizer
from .mistral import MistralTokenizer
from .protocol import TokenizerLike
from .registry import (
TokenizerRegistry,
@@ -15,12 +12,9 @@ from .registry import (

__all__ = [
"TokenizerLike",
"HfTokenizer",
"MistralTokenizer",
"TokenizerRegistry",
"cached_get_tokenizer",
"get_tokenizer",
"cached_tokenizer_from_config",
"init_tokenizer_from_config",
"DeepseekV32Tokenizer",
]

+ 33
- 14
vllm/tokenizers/deepseekv32.py View File

@@ -2,24 +2,18 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from pathlib import Path
from typing import Any

from transformers import BatchEncoding

from .deepseek_v32_encoding import encode_messages
from .hf import HfTokenizer, TokenizerLike
from .registry import TokenizerRegistry
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam

from .deepseek_v32_encoding import encode_messages
from .hf import CachedHfTokenizer
from .protocol import TokenizerLike

@TokenizerRegistry.register("deepseek_v32")
class DeepseekV32Tokenizer(HfTokenizer):
def __init__(self, tokenizer: TokenizerLike):
self.tokenizer = tokenizer
self.name_or_path = (
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
)
self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)

class DeepseekV32Tokenizer(CachedHfTokenizer):
@classmethod
def from_pretrained(
cls,
@@ -40,7 +34,21 @@ class DeepseekV32Tokenizer(HfTokenizer):
)
return DeepseekV32Tokenizer(tokenizer)

def apply_chat_template(self, messages, tools=None, **kwargs):
def __init__(self, tokenizer: TokenizerLike) -> None:
super().__init__()

self.tokenizer = tokenizer
self.name_or_path = getattr(tokenizer, "name_or_path", "")

self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)

def apply_chat_template(
self,
messages: list["ChatCompletionMessageParam"],
tools: list[dict[str, Any]] | None = None,
**kwargs,
) -> str | list[int]:
thinking = kwargs.get("thinking", False)
thinking_mode = "thinking"
if not thinking:
@@ -49,13 +57,24 @@ class DeepseekV32Tokenizer(HfTokenizer):
messages = conversation.copy()
if tools is not None and len(tools) > 0:
messages.insert(0, {"role": "system"})
messages[0]["tools"] = tools
messages[0]["tools"] = tools # type: ignore[typeddict-unknown-key]

# Historical reasoning content is dropped when a new user message is introduced
drop_thinking = messages[-1]["role"] == "user"

encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking)
prompt_str = encode_messages(messages, **encode_config) # type: ignore

if kwargs.get("tokenize", True):
tokenizer_kwargs = {
k: kwargs[k] for k in ("truncation", "max_length") if k in kwargs
}
return self.encode(
prompt_str,
add_special_tokens=False,
**tokenizer_kwargs,
)

return prompt_str

def num_special_tokens_to_add(self) -> int:


+ 7
- 12
vllm/tokenizers/hf.py View File

@@ -3,22 +3,18 @@
import contextlib
import copy
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TypeAlias

from transformers import AutoTokenizer
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config

from .protocol import TokenizerLike
from .registry import TokenizerRegistry

if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
HfTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast


def get_cached_tokenizer(
tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast",
) -> TokenizerLike:
def get_cached_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
"""
By default, transformers will recompute multiple tokenizer properties
each time they are called, leading to a significant slowdown.
@@ -65,11 +61,10 @@ def get_cached_tokenizer(
CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"

cached_tokenizer.__class__ = CachedTokenizer
return cached_tokenizer # type: ignore
return cached_tokenizer


@TokenizerRegistry.register("hf")
class HfTokenizer(TokenizerLike):
class CachedHfTokenizer(TokenizerLike):
@classmethod
def from_pretrained(
cls,
@@ -79,7 +74,7 @@ class HfTokenizer(TokenizerLike):
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> "TokenizerLike":
) -> HfTokenizer:
try:
tokenizer = AutoTokenizer.from_pretrained(
path_or_repo_id,


+ 2
- 5
vllm/tokenizers/mistral.py View File

@@ -3,10 +3,11 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast

from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.logger import init_logger

from .protocol import TokenizerLike
from .registry import TokenizerRegistry

if TYPE_CHECKING:
from mistral_common.protocol.instruct.request import (
@@ -15,9 +16,6 @@ if TYPE_CHECKING:
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from transformers import BatchEncoding

from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

try:
# Transformers v5
from transformers.tokenization_mistral_common import MistralCommonBackend
@@ -201,7 +199,6 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
return tokenizer.unk_id


@TokenizerRegistry.register("mistral")
class MistralTokenizer(TokenizerLike):
@classmethod
def from_pretrained(


+ 1
- 1
vllm/tokenizers/protocol.py View File

@@ -97,7 +97,7 @@ class TokenizerLike(Protocol):
messages: list["ChatCompletionMessageParam"],
tools: list[dict[str, Any]] | None = None,
**kwargs,
) -> list[int]:
) -> str | list[int]:
raise NotImplementedError

def convert_tokens_to_string(self, tokens: list[str]) -> str:


+ 100
- 100
vllm/tokenizers/registry.py View File

@@ -1,13 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
from collections.abc import Callable
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, TypeVar, overload
from typing import TYPE_CHECKING

import huggingface_hub
from typing_extensions import assert_never
from typing_extensions import TypeVar, assert_never, deprecated

import vllm.envs as envs
from vllm.logger import init_logger
@@ -24,46 +24,25 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike

if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.config.model import ModelConfig, RunnerType

logger = init_logger(__name__)

_T = TypeVar("_T", bound=type[TokenizerLike])

_VLLM_TOKENIZERS = {
"deepseekv32": ("deepseekv32", "DeepseekV32Tokenizer"),
"hf": ("hf", "CachedHfTokenizer"),
"mistral": ("mistral", "MistralTokenizer"),
}

class TokenizerRegistry:
# Tokenizer name -> tokenizer_cls or (tokenizer module, tokenizer class)
REGISTRY: dict[str, type[TokenizerLike] | tuple[str, str]] = {}

# In-tree tokenizers
@staticmethod
@overload
def register(tokenizer_mode: str) -> Callable[[_T], _T]: ...
@dataclass
class _TokenizerRegistry:
# Tokenizer mode -> (tokenizer module, tokenizer class)
tokenizers: dict[str, tuple[str, str]] = field(default_factory=dict)

# OOT tokenizers
@staticmethod
@overload
def register(tokenizer_mode: str, module: str, class_name: str) -> None: ...

@staticmethod
def register(
tokenizer_mode: str,
module: str | None = None,
class_name: str | None = None,
) -> Callable[[_T], _T] | None:
# In-tree tokenizers
if module is None or class_name is None:

def wrapper(tokenizer_cls: _T) -> _T:
assert tokenizer_mode not in TokenizerRegistry.REGISTRY
TokenizerRegistry.REGISTRY[tokenizer_mode] = tokenizer_cls

return tokenizer_cls

return wrapper

# OOT tokenizers
if tokenizer_mode in TokenizerRegistry.REGISTRY:
def register(self, tokenizer_mode: str, module: str, class_name: str) -> None:
if tokenizer_mode in self.tokenizers:
logger.warning(
"%s.%s is already registered for tokenizer_mode=%r. "
"It is overwritten by the new one.",
@@ -72,36 +51,42 @@ class TokenizerRegistry:
tokenizer_mode,
)

TokenizerRegistry.REGISTRY[tokenizer_mode] = (module, class_name)
self.tokenizers[tokenizer_mode] = (module, class_name)

return None

@staticmethod
def get_tokenizer(tokenizer_mode: str, *args, **kwargs) -> "TokenizerLike":
if tokenizer_mode not in TokenizerRegistry.REGISTRY:
def load_tokenizer_cls(self, tokenizer_mode: str) -> type[TokenizerLike]:
if tokenizer_mode not in self.tokenizers:
raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")

item = TokenizerRegistry.REGISTRY[tokenizer_mode]
if isinstance(item, type):
return item.from_pretrained(*args, **kwargs)

module, class_name = item
module, class_name = self.tokenizers[tokenizer_mode]
logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")

class_ = resolve_obj_by_qualname(f"{module}.{class_name}")
return class_.from_pretrained(*args, **kwargs)
return resolve_obj_by_qualname(f"{module}.{class_name}")

def load_tokenizer(self, tokenizer_mode: str, *args, **kwargs) -> TokenizerLike:
tokenizer_cls = self.load_tokenizer_cls(tokenizer_mode)
return tokenizer_cls.from_pretrained(*args, **kwargs)

def get_tokenizer(

TokenizerRegistry = _TokenizerRegistry(
{
mode: (f"vllm.tokenizers.{mod_relname}", cls_name)
for mode, (mod_relname, cls_name) in _VLLM_TOKENIZERS.items()
}
)


def resolve_tokenizer_args(
tokenizer_name: str | Path,
*args,
runner_type: "RunnerType" = "generate",
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> TokenizerLike:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
):
revision: str | None = kwargs.get("revision")
download_dir: str | None = kwargs.get("download_dir")

if envs.VLLM_USE_MODELSCOPE:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
@@ -125,16 +110,6 @@ def get_tokenizer(
)
tokenizer_name = tokenizer_path

if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")

tokenizer_mode = "hf"
kwargs["use_fast"] = False

if "truncation_side" not in kwargs:
kwargs["truncation_side"] = "left"

# Separate model folder from file path for GGUF models
if is_gguf(tokenizer_name):
if check_gguf_file(tokenizer_name):
@@ -150,6 +125,21 @@ def get_tokenizer(
)
kwargs["gguf_file"] = gguf_file

if "truncation_side" not in kwargs:
if runner_type == "generate" or runner_type == "draft":
kwargs["truncation_side"] = "left"
elif runner_type == "pooling":
kwargs["truncation_side"] = "right"
else:
assert_never(runner_type)

if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")

tokenizer_mode = "hf"
kwargs["use_fast"] = False

# Try to use official Mistral tokenizer if possible
if tokenizer_mode == "auto" and importlib.util.find_spec("mistral_common"):
allow_patterns = ["tekken.json", "tokenizer.model.v*"]
@@ -165,49 +155,70 @@ def get_tokenizer(
if tokenizer_mode == "auto":
tokenizer_mode = "hf"

tokenizer_args = (tokenizer_name, *args)
tokenizer_kwargs = dict(
return tokenizer_mode, tokenizer_name, args, kwargs


cached_resolve_tokenizer_args = lru_cache(resolve_tokenizer_args)


def tokenizer_args_from_config(config: "ModelConfig", **kwargs):
return cached_resolve_tokenizer_args(
config.tokenizer,
runner_type=config.runner_type,
tokenizer_mode=config.tokenizer_mode,
revision=config.tokenizer_revision,
trust_remote_code=config.trust_remote_code,
**kwargs,
)


_T = TypeVar("_T", bound=TokenizerLike, default=TokenizerLike)


def get_tokenizer(
tokenizer_name: str | Path,
*args,
tokenizer_cls: type[_T] = TokenizerLike, # type: ignore[assignment]
trust_remote_code: bool = False,
revision: str | None = None,
download_dir: str | None = None,
**kwargs,
) -> _T:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
tokenizer_mode, tokenizer_name, args, kwargs = cached_resolve_tokenizer_args(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
revision=revision,
download_dir=download_dir,
**kwargs,
)

if tokenizer_mode == "custom":
logger.warning_once(
"TokenizerRegistry now uses `tokenizer_mode` as the registry key "
"instead of `tokenizer_name`. "
"Please update the definition of `.from_pretrained` in "
"your custom tokenizer to accept `args=%s`, `kwargs=%s`. "
"Then, you can pass `tokenizer_mode=%r` instead of "
"`tokenizer_mode='custom'` when initializing vLLM.",
tokenizer_args,
str(tokenizer_kwargs),
tokenizer_name,
)

tokenizer_mode = str(tokenizer_name)
if tokenizer_cls == TokenizerLike:
tokenizer_cls_ = TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
else:
tokenizer_cls_ = tokenizer_cls

tokenizer = TokenizerRegistry.get_tokenizer(
tokenizer_mode,
*tokenizer_args,
**tokenizer_kwargs,
)
tokenizer = tokenizer_cls_.from_pretrained(tokenizer_name, *args, **kwargs)
if not tokenizer.is_fast:
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)

return tokenizer
return tokenizer # type: ignore


cached_get_tokenizer = lru_cache(get_tokenizer)


def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
if model_config.skip_tokenizer_init:
return None

return cached_get_tokenizer(
model_config.tokenizer,
runner_type=model_config.runner_type,
tokenizer_mode=model_config.tokenizer_mode,
revision=model_config.tokenizer_revision,
trust_remote_code=model_config.trust_remote_code,
@@ -215,19 +226,8 @@ def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
)


@deprecated(
"Renamed to `cached_tokenizer_from_config`. The old name will be removed in v0.14."
)
def init_tokenizer_from_config(model_config: "ModelConfig"):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
elif runner_type == "pooling":
truncation_side = "right"
else:
assert_never(runner_type)

return get_tokenizer(
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision,
truncation_side=truncation_side,
)
return cached_tokenizer_from_config(model_config)

+ 3
- 3
vllm/transformers_utils/tokenizer.py View File

@@ -60,17 +60,17 @@ def __getattr__(name: str):

return cached_tokenizer_from_config
if name == "init_tokenizer_from_configs":
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config

warnings.warn(
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
"has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)

return init_tokenizer_from_config
return cached_tokenizer_from_config

raise AttributeError(f"module {__name__!r} has no attribute {name!r}")



+ 2
- 2
vllm/utils/deep_gemm.py View File

@@ -38,7 +38,7 @@ class DeepGemmQuantScaleFMT(Enum):
return DeepGemmQuantScaleFMT.FLOAT32
return (
DeepGemmQuantScaleFMT.UE8M0
if current_platform.is_device_capability(100)
if current_platform.is_device_capability_family(100)
else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0
)

@@ -50,7 +50,7 @@ def is_deep_gemm_supported() -> bool:
"""
is_supported_arch = current_platform.is_cuda() and (
current_platform.is_device_capability(90)
or current_platform.is_device_capability(100)
or current_platform.is_device_capability_family(100)
)
return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch



+ 3
- 1
vllm/utils/flashinfer.py View File

@@ -264,7 +264,9 @@ def supports_trtllm_attention() -> bool:
return False

# Requires SM100 and NVIDIA artifactory to be accessible to download cubins
return current_platform.is_device_capability(100) and has_nvidia_artifactory()
return (
current_platform.is_device_capability_family(100) and has_nvidia_artifactory()
)


def force_use_trtllm_attention() -> bool | None:


+ 23
- 2
vllm/utils/torch_utils.py View File

@@ -194,12 +194,33 @@ def get_kv_cache_torch_dtype(
return torch_dtype


def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None:
quant_method = quant_cfg.get("quant_method", "")
if quant_method.startswith("modelopt"):
quantization_inner = quant_cfg.get("quantization", quant_cfg)
# Check if quant config is specified and use kv cache quant algo
kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get(
"kv_cache_quant_algo"
)
if isinstance(kv_algo, str):
return STR_DTYPE_TO_TORCH_DTYPE[kv_algo.lower()]
return None


def kv_cache_dtype_str_to_dtype(
kv_cache_dtype: str, model_config: ModelConfig
) -> torch.dtype:
# Model config may not be specified for unit tests, default to float16
dtype = model_config.dtype if model_config else torch.half
if kv_cache_dtype == "auto":
# Model config may not be specified for unit tests, default to float16
return model_config.dtype if model_config else torch.half
hf_cfg = getattr(model_config, "hf_config", None)
if hf_cfg is not None:
quant_cfg = getattr(hf_cfg, "quantization_config", None)
if quant_cfg is not None:
kv_algo_dtype = get_kv_cache_quant_algo_dtype(quant_cfg)
return kv_algo_dtype if kv_algo_dtype is not None else dtype
return dtype

return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]




+ 1
- 1
vllm/v1/attention/backends/flashinfer.py View File

@@ -564,7 +564,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
)
self.paged_kv_last_page_len_np = self.paged_kv_last_page_len_cpu.numpy()

if self.head_dim == 256 and current_platform.is_device_capability(100):
if self.head_dim == 256 and current_platform.is_device_capability_family(100):
# https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
# head size 256 and block size 16 is not supported on blackwell.
assert kv_cache_spec.block_size != 16, (


+ 3
- 3
vllm/v1/attention/backends/mla/common.py View File

@@ -446,7 +446,7 @@ def use_flashinfer_prefill() -> bool:
and flashinfer_available
and not vllm_config.attention_config.use_cudnn_prefill
and not vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
)


@@ -457,7 +457,7 @@ def use_cudnn_prefill() -> bool:
return (
flashinfer_available
and vllm_config.attention_config.use_cudnn_prefill
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
and has_nvidia_artifactory()
)

@@ -470,7 +470,7 @@ def use_trtllm_ragged_deepseek_prefill() -> bool:
return (
flashinfer_available
and vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
)




+ 2
- 2
vllm/v1/attention/backends/mla/flashmla_sparse.py View File

@@ -420,7 +420,7 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad
max_num_sm_parts = int(
max((sm_count // 2) / h_k // (cdiv(h_q // h_k, 2 * 64) * s_q), 1)
)
if current_platform.is_device_capability(100):
if current_platform.is_device_capability_family(100):
max_num_sm_parts *= 2
self.tile_scheduler_metadata_buffer = torch.empty(
# TileSchedulerMetaDataSize = 8
@@ -719,7 +719,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
self.softmax_scale = scale
assert indexer is not None
self.topk_indices_buffer = indexer.topk_indices_buffer
self.padding = 128 if current_platform.is_device_capability(100) else 64
self.padding = 128 if current_platform.is_device_capability_family(100) else 64

if kv_cache_dtype == "fp8_ds_mla":
# Reserve workspace during initialization


+ 83
- 1
vllm/v1/attention/backends/triton_attn.py View File

@@ -17,7 +17,7 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)
from vllm.attention.ops.triton_unified_attention import unified_attention
from vllm.config import VllmConfig
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.config.cache import CacheDType
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -26,6 +26,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
from vllm.platforms import current_platform
from vllm.platforms.interface import DeviceCapability
from vllm.utils.math_utils import next_power_of_2
from vllm.v1.attention.backends.utils import (
AttentionCGSupport,
AttentionMetadataBuilder,
@@ -36,6 +37,11 @@ from vllm.v1.kv_cache_interface import AttentionSpec
logger = init_logger(__name__)


# constants
MIN_LAUNCH_GRID_SIZE_2D = 128 # Minimum launch grid size of 2D kernel
NUM_PAR_SOFTMAX_SEGMENTS = 16 # Number of parallel tiled softmax segments


@dataclass
class TritonAttentionMetadata:
# NOTE(sang): Definition of context_len, query_len, and seq_len.
@@ -54,6 +60,12 @@ class TritonAttentionMetadata:
block_table: torch.Tensor
slot_mapping: torch.Tensor

seq_threshold_3D: int
num_par_softmax_segments: int
softmax_segm_output: torch.Tensor
softmax_segm_max: torch.Tensor
softmax_segm_expsum: torch.Tensor

# For cascade attention.
use_cascade: bool
common_prefix_len: int
@@ -87,6 +99,60 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet
self.num_heads_kv = model_config.get_num_kv_heads(vllm_config.parallel_config)
self.headdim = model_config.get_head_size()

# Check if CUDA Graphs are enabled for decode
self.decode_cudagraph_enabled = (
self.vllm_config.compilation_config.cudagraph_mode
in (
CUDAGraphMode.FULL_AND_PIECEWISE,
CUDAGraphMode.FULL_DECODE_ONLY,
CUDAGraphMode.FULL,
)
)

# The launch grid for the 2D kernel is defined as (num_q_blocks, num_heads_kv).
# A lower bound for num_q_blocks is the number of sequences.
# To ensure the minimum launch grid size is achieved, the number of sequences
# must be at least equal to the threshold below.
# If this threshold is not reached (i.e., the batch size is not large enough),
# the 3D kernel will be selected instead.
self.seq_threshold_3D = MIN_LAUNCH_GRID_SIZE_2D // self.num_heads_kv

# Modify the threshold if needed.
if self.decode_cudagraph_enabled:
capture_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes
assert capture_sizes, "CUDA Graphs enabled but no capture sizes specified."

# Select the CUDA Graph capture size closest to self.seq_threshold_3D
# as threshold. This ensures that each captured graph covers the
# correct execution path.
self.seq_threshold_3D = min(
capture_sizes,
key=lambda x: abs(x - self.seq_threshold_3D),
)

self.num_par_softmax_segments = NUM_PAR_SOFTMAX_SEGMENTS
headdim_padded = next_power_of_2(self.headdim)
self.softmax_segm_output = torch.empty(
(
self.seq_threshold_3D,
self.num_heads_q,
self.num_par_softmax_segments,
headdim_padded,
),
dtype=torch.float32,
device=device,
)
self.softmax_segm_max = torch.empty(
(self.seq_threshold_3D, self.num_heads_q, self.num_par_softmax_segments),
dtype=torch.float32,
device=device,
)
self.softmax_segm_expsum = torch.empty(
(self.seq_threshold_3D, self.num_heads_q, self.num_par_softmax_segments),
dtype=torch.float32,
device=device,
)

def build_for_cudagraph_capture(
self, common_attn_metadata: CommonAttentionMetadata
) -> TritonAttentionMetadata:
@@ -143,6 +209,11 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet
prefix_kv_lens=prefix_kv_lens,
suffix_kv_lens=suffix_kv_lens,
prefix_scheduler_metadata=prefix_scheduler_metadata,
seq_threshold_3D=self.seq_threshold_3D,
num_par_softmax_segments=self.num_par_softmax_segments,
softmax_segm_output=self.softmax_segm_output,
softmax_segm_max=self.softmax_segm_max,
softmax_segm_expsum=self.softmax_segm_expsum,
)
return attn_metadata

@@ -349,6 +420,12 @@ class TritonAttentionImpl(AttentionImpl):
max_seqlen_k = attn_metadata.max_seq_len
block_table = attn_metadata.block_table

seq_threshold_3D = attn_metadata.seq_threshold_3D
num_par_softmax_segments = attn_metadata.num_par_softmax_segments
softmax_segm_output = attn_metadata.softmax_segm_output
softmax_segm_max = attn_metadata.softmax_segm_max
softmax_segm_expsum = attn_metadata.softmax_segm_expsum

descale_shape = (cu_seqlens_q.shape[0] - 1, key_cache.shape[2])

unified_attention(
@@ -369,6 +446,11 @@ class TritonAttentionImpl(AttentionImpl):
q_descale=None, # Not supported
k_descale=layer._k_scale.expand(descale_shape),
v_descale=layer._v_scale.expand(descale_shape),
seq_threshold_3D=seq_threshold_3D,
num_par_softmax_segments=num_par_softmax_segments,
softmax_segm_output=softmax_segm_output,
softmax_segm_max=softmax_segm_max,
softmax_segm_expsum=softmax_segm_expsum,
sinks=self.sinks,
output_scale=output_scale,
)


+ 5
- 6
vllm/v1/core/sched/scheduler.py View File

@@ -1117,6 +1117,7 @@ class Scheduler(SchedulerInterface):
stopped = False
new_logprobs = None
new_token_ids = generated_token_ids
pooler_output = pooler_outputs[req_index] if pooler_outputs else None
kv_transfer_params = None
status_before_stop = request.status

@@ -1125,12 +1126,10 @@ class Scheduler(SchedulerInterface):
new_token_ids, stopped = self._update_request_with_output(
request, new_token_ids
)

# Stop checking for pooler models.
pooler_output = None
if pooler_outputs:
pooler_output = pooler_outputs[req_index]
stopped = check_stop(request, self.max_model_len, pooler_output)
elif request.pooling_params and pooler_output is not None:
# Pooling stops as soon as there is output.
request.status = RequestStatus.FINISHED_STOPPED
stopped = True

if stopped:
kv_transfer_params = self._free_request(request)


+ 2
- 10
vllm/v1/core/sched/utils.py View File

@@ -2,8 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib

import torch

from vllm.v1.request import Request, RequestStatus


@@ -39,14 +37,8 @@ def remove_all(lst: list, items_to_remove: set) -> list:
return [item for item in lst if item not in items_to_remove]


def check_stop(
request: Request, max_model_len: int, pooler_output: torch.Tensor | None = None
) -> bool:
if request.pooling_params:
if pooler_output is not None:
request.status = RequestStatus.FINISHED_STOPPED
return True
return False
def check_stop(request: Request, max_model_len: int) -> bool:
assert not request.pooling_params

sampling_params = request.sampling_params
assert sampling_params is not None


+ 2
- 2
vllm/v1/engine/async_llm.py View File

@@ -26,7 +26,7 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tracing import init_tracer
from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
from vllm.usage.usage_lib import UsageContext
@@ -111,7 +111,7 @@ class AsyncLLM(EngineClient):
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.model_config)
tokenizer = cached_tokenizer_from_config(self.model_config)

self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(


+ 22
- 11
vllm/v1/engine/input_processor.py View File

@@ -19,7 +19,8 @@ from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer, TokenizerLike
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.metrics.stats import MultiModalCacheStats
@@ -188,29 +189,39 @@ class InputProcessor:
def _validate_single_prompt(single_prompt: dict | str) -> None:
if not isinstance(single_prompt, dict):
return

mm_data = single_prompt.get("multi_modal_data")
mm_uuids = single_prompt.get("multi_modal_uuids")
if not mm_data or not mm_uuids:
return

import torch

def _get_len(items: object):
if isinstance(items, dict): # Embedding inputs
return _get_len(next(iter(items.values()))) if items else 1

if isinstance(items, list):
return len(items)
if isinstance(items, torch.Tensor):
# To keep backwards compatibility for single item embedding input
return 1 if getattr(items, "_is_single_item", False) else len(items)

return 1

for modality, items in mm_data.items():
if modality in mm_uuids:
data_len = len(items) if isinstance(items, list) else 1
uuid_len = (
len(mm_uuids[modality])
if isinstance(mm_uuids[modality], list)
else 1
)
data_len = _get_len(items)
uuid_len = _get_len(mm_uuids[modality])
if uuid_len != data_len:
raise ValueError(
f"multi_modal_uuids for modality '{modality}' "
f"multi_modal_uuids for modality {modality!r} "
"must have same length as data: got "
f"{uuid_len} uuids vs "
f"{data_len} items."
f"{uuid_len} uuids vs {data_len} items."
)
else:
raise ValueError(
f"multi_modal_uuids for modality '{modality}' must "
f"multi_modal_uuids for modality {modality!r} must "
"be provided if multi_modal_data is provided."
)



+ 2
- 2
vllm/v1/engine/llm_engine.py View File

@@ -23,7 +23,7 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tracing import init_tracer
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest
@@ -86,7 +86,7 @@ class LLMEngine:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.model_config)
tokenizer = cached_tokenizer_from_config(self.model_config)

self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(


+ 1
- 1
vllm/v1/executor/abstract.py View File

@@ -219,7 +219,7 @@ class Executor(ABC):

def sample_tokens(
self, grammar_output: GrammarOutput | None, non_block: bool = False
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
output = self.collective_rpc( # type: ignore[call-overload]
"sample_tokens", args=(grammar_output,), non_block=non_block
)


+ 7
- 3
vllm/v1/executor/multiproc_executor.py View File

@@ -294,8 +294,8 @@ class MultiprocExecutor(Executor):
kwargs: dict | None = None,
non_block: bool = False,
unique_reply_rank: int | None = None,
kv_output_aggregator: KVOutputAggregator = None,
) -> Any | list[Any] | Future[Any | list[Any]]:
kv_output_aggregator: KVOutputAggregator | None = None,
) -> Any:
"""Returns single result if unique_reply_rank and/or kv_output_aggregator
is provided, otherwise list."""
assert self.rpc_broadcast_mq is not None, (
@@ -476,6 +476,8 @@ class WorkerProc:
"""Wrapper that runs one Worker in a separate process."""

READY_STR = "READY"
rpc_broadcast_mq: MessageQueue | None
worker_response_mq: MessageQueue | None

def _init_message_queues(
self, input_shm_handle: Handle, vllm_config: VllmConfig
@@ -487,7 +489,7 @@ class WorkerProc:
)

# Initializes a message queue for sending the model output
self.worker_response_mq: MessageQueue = MessageQueue(1, 1)
self.worker_response_mq = MessageQueue(1, 1)
self.peer_response_handles = []
else:
# Initialize remote MessageQueue for receiving SchedulerOutput across nodes
@@ -720,6 +722,7 @@ class WorkerProc:
try:
reader.close()
worker = WorkerProc(*args, **kwargs)
assert worker.worker_response_mq is not None

# Send READY once we know everything is loaded
ready_writer.send(
@@ -804,6 +807,7 @@ class WorkerProc:

def worker_busy_loop(self, cancel: threading.Event | None = None):
"""Main busy loop for Multiprocessing Workers"""
assert self.rpc_broadcast_mq is not None
while True:
method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
cancel=cancel, indefinite=True


+ 3
- 3
vllm/v1/executor/ray_executor.py View File

@@ -413,7 +413,7 @@ class RayDistributedExecutor(Executor):
self,
grammar_output: "GrammarOutput | None",
non_block: bool = False,
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
"""Execute the model on the Ray workers.

The scheduler output to use should have been provided in
@@ -428,7 +428,7 @@ class RayDistributedExecutor(Executor):
"""
scheduler_output = self.scheduler_output
if scheduler_output is None:
return COMPLETED_NONE_FUTURE if non_block else None # noqa
return COMPLETED_NONE_FUTURE if non_block else None

self.scheduler_output = None

@@ -439,7 +439,7 @@ class RayDistributedExecutor(Executor):
scheduler_output: SchedulerOutput,
grammar_output: "GrammarOutput | None",
non_block: bool = False,
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
# Build the compiled DAG for the first time.
if self.forward_dag is None: # type: ignore
self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)


+ 8
- 5
vllm/v1/executor/uniproc_executor.py View File

@@ -67,7 +67,7 @@ class UniProcExecutor(Executor):
kwargs: dict | None = None,
non_block: bool = False,
single_value: bool = False,
) -> Any | list[Any] | Future[Any | list[Any]]:
) -> Any:
if kwargs is None:
kwargs = {}

@@ -79,10 +79,13 @@ class UniProcExecutor(Executor):
result = run_method(self.driver_worker, method, args, kwargs)
if isinstance(result, AsyncModelRunnerOutput):
if (async_thread := self.async_output_thread) is not None:
get_output = result.get_output
if not single_value:
get_output = lambda go=result.get_output: [go()]
return async_thread.submit(get_output)
if single_value:
return async_thread.submit(result.get_output)

def get_output_list() -> list[Any]:
return [result.get_output()]

return async_thread.submit(get_output_list)
result = result.get_output()
future = Future[Any]()
future.set_result(result if single_value else [result])


+ 1
- 1
vllm/v1/sample/rejection_sampler.py View File

@@ -145,7 +145,7 @@ class RejectionSampler(nn.Module):
)

logprobs_tensors = None
if sampling_metadata.max_num_logprobs:
if sampling_metadata.max_num_logprobs is not None:
logprobs_tensors = self._get_logprobs_tensors(
sampling_metadata.max_num_logprobs,
metadata,


+ 2
- 2
vllm/v1/structured_output/__init__.py View File

@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
from vllm.v1.structured_output.backend_types import (
@@ -71,7 +71,7 @@ class StructuredOutputManager:
# of CPUs.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.tokenizer = init_tokenizer_from_config(
self.tokenizer = cached_tokenizer_from_config(
model_config=self.vllm_config.model_config
)
reasoning_parser = (


+ 2
- 1
vllm/v1/structured_output/backend_xgrammar.py View File

@@ -10,7 +10,8 @@ import torch
import vllm.envs
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer
from vllm.tokenizers.deepseekv32 import DeepseekV32Tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_types import (
StructuredOutputBackend,


Loading…
Cancel
Save
Baidu
map