75 Commits

Author SHA1 Message Date
  yewentao256 d160e1f33b Merge branch 'main' into wentao-enable-eplb-with-default-backend 21 hours ago
  yjc9696 855b101d75
[Frontend] add tools for dsv32 developer role (#30040) 21 hours ago
  Robert Shaw d0502b4928
[MoE][Refactor 1/N] Separate Online Quantization (#30627) 21 hours ago
  Max Hu 3f175f18a2
[Bugfix] Fix multimodal configuration for Qwen3VL MOE model (#30670) 22 hours ago
  Cyrus Leung ed586e7724
[Refactor] [3/N] Move tool parser tests and run on CPU (#30693) 23 hours ago
  Chauncey 2a1776b7ac
[Refactor] [2/N] Move tool parsers into the vLLM main directory (#30675) 23 hours ago
  Nicolò Lucchesi 185c22bf2f
[Misc][Hybrid allocator + kv connector] Optionally enable hybrid allocator + KV cache connector (#29805) 1 day ago
  duke e4806d973a
[BugFix] Add embed_input_ids method to make QWenLMHeadModel a vllm model (#30674) 1 day ago
  wang.yuqi 4429d934de
[Model] Automatic conversion of TokenClassification model (#30666) 1 day ago
  ゆり 33278073d6
typing: Add type hints to TurnMetrics class in context.py (#30552) 1 day ago
  汪志鹏 1adeb3b84c
[New Model] BAGEL support (AR only) (#28439) 1 day ago
  Kunshang Ji e3a1cd1c59
[XPU] fix Dockerfile.xpu, avoid wheel conflicts (#30662) 1 day ago
  Wentao Ye 3778673ea8
[Feat] Refactor for `parallel_config` in `FusedMoEModularKernel` (#30282) 1 day ago
  Seokhyun An b337647aa0
[Bugfix] Drop empty tool_calls lists to keep assistant replies in chat template (#30648) 1 day ago
  Jee Jee Li a524d1ba0a
[Bugfix] Fix deepseek_v32 tokenizer_mode (#30658) 1 day ago
  Shanshan Shen 87b4d1557d
[CustomOp][MM] Extract MMEncoderAttention as CustomOp and replace the backend of QwenVisionAttention with it. (#30125) 1 day ago
  Wenqi Glantz 84e23d103d
additional protection for CVE-2025-62164 (#30649) 1 day ago
  Shanshan Shen 738648fb81
[CustomOp] Support object-level enable for CustomOp (#30547) 1 day ago
  Boyuan Feng 917fdae5b2
[Log] Skip piecewise cudagraph warn when using full cudagraph (#30657) 1 day ago
  Robert Shaw e2ed238885
Revert "[Fix]Load kv-cache dtype from hf_quant_config.json automatically" (#30653) 1 day ago
  Or Ozeri 174e39ead7
CPU KV Offloading: Use more CUDA streams (#29013) 1 day ago
  RioS 9ccbf6b692
[responsesAPI]add extra body parameters (#30532) 1 day ago
  Chendi.Xue ae2e503dda
[NIXL][BUG FIX] Fix a bug for PD with host_buffer after merging 29665 (#30420) 1 day ago
  Tsukasa OI 9e33a1a75b
[Model][Quantization] Override HF defaults to GGUF ones (incl. Qwen3 MoE) (#30118) 1 day ago
  Vensen add4b0ca44
[Bugfix][benchmarks] Fix input token calculation for rerank benchmark metrics (#30596) 1 day ago
  ZiTian Zhao ae88aada38
[Feature]Add EVS (Efficient Video Sampling) Support for Qwen3-VL (#29752) 1 day ago
  yifant-code 5ccf0efa84
[Bugfix] Improve error messages in ModelConfig validation (#30213) 1 day ago
  ElizaWszola 994acec0cc
[Bugfix] Fix fusion for VL models (#30244) 1 day ago
  zifeitong 48b8456ff9
[Bugfix] Revert Qwen2-VL part of change in #28271 (#30542) 1 day ago
  Drew Botwinick 5b64ac21f9
[Bugfix] Update get_processor_data to use get_all method (#30583) 1 day ago
  Bin Bao a8ec486592
[Misc] Add a script to benchmark compilation time (#29919) 1 day ago
  tjp_zju 6ecc1e411b
[Bugfix] fix _get_quant_method of FusedMoE for deepseekV3.2 on non-NV… (#30057) 2 days ago
  Shengliang Xu 0bb0bae436
Nvidia ModelOpt workaround for issue 28072 (#30164) 2 days ago
  Johannes F 060893654d
fix: Update json features supported by xGrammar (#30390) 2 days ago
  Matthias Gehre e9add129ad
[Bugfix] awq_gemm: fix argument order swap (#30364) 2 days ago
  Ilya Markov 3224ea9915
[torch.compile] Add encoder tag for compilation (#30489) 2 days ago
  Lasha Koroshinadze 3a20450d31
Add AudioFlamingo3 model support (#30539) 2 days ago
  Didier Durand 1a55cfafcb
[Doc]: fixing typos in various files (#30540) 2 days ago
  drslark add1b9d3de
[main][BugFix] Fixed an accuracy bug of Qwen3-next-MTP when batched inferring (#30632) 2 days ago
  Cyrus Leung dcb31196da
[Chore] Remove redundant `RequestPrompt` (#30612) 2 days ago
  Laith Sakka f569c654e1
enable unbacked with aot_compile (#30462) 2 days ago
  Micah Williamson 97f2f160fd
[ROCm][CI] Add "Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy Test" Back Into AMD CI (#30590) 2 days ago
  Kayvan Mivehnejad 29f7d97715
Improve parse_raw_prompt test cases for invalid input .v2 (#30512) 2 days ago
  Qier Li dc7fb5bebe
[Bug][KVConnector][Metrics] Remove a vacuous assertion breaking external-launcher (#30577) 2 days ago
  Qidong Su 24429d5924
[Doc] Add instructions for building docker image on GB300 with CUDA13 (#30414) 2 days ago
  Wentao Ye 6e78ed6ba7
[Logs] Optimize startup logs 4 (#29903) 2 days ago
  Isotr0py 7c16f3fbcc
[Doc] Add documents for multi-node distributed serving with MP backend (#30509) 2 days ago
  lif ddbfbe5278
[Docs] Clarify Expert Parallel behavior for attention and MoE layers (#30615) 2 days ago
  Laith Sakka 763963aa73
set assume_32bit_indexing and pass unbacked hints (#30459) 2 days ago
  Cyrus Leung 39cefbdf17
[Refactor] `TokenizerRegistry` only uses lazy imports (#30609) 2 days ago
  Chen Zhang ace34e3783
[Bugfix] Qwen3-next with --hf-overrides \{\"num_hidden_layers\":8\} (#30433) 2 days ago
  Isotr0py e5db3e2774
[CI/Build] Fix broken mm processor test Mistral-3-large (#30597) 3 days ago
  Cyrus Leung 64251f48df
[Chore] Adjust tokenizer import to avoid circular imports (#30601) 3 days ago
  Nick Hill 1cec5b7ea9
[Scheduler] Simplify stop checking for pooling models (#30591) 3 days ago
  Cyrus Leung b09806e28f
[Bugfix] Dictionary MM embeddings for online chat (#30507) 3 days ago
  Tsukasa OI fdc135d768
[Misc][Quantization] Clarify the intent of GGUF `FusedMoE` weight materialization (#30310) 3 days ago
  Roberto L. Castro 4fa7ce46f3
[Feature] Add SM103 (Blackwell Ultra) Support to vLLM (#30484) 3 days ago
  Nicolò Lucchesi 57e9bf1864
[CI] Whisper logprobs tests (#30504) 3 days ago
  Michael Goin 2f32a68d75
[CI] Update several models in registry that are available online now (#30514) 3 days ago
  Matthew Bonanni f5dfbbd8e9
[Docs] Remove references to `VLLM_ATTENTION_BACKEND` (#30564) 3 days ago
  Michael Goin fc0119425c
Add IBM and Red Hat to compute resources sponsors (#30581) 3 days ago
  Matthew Bonanni 86a3261525
[Bugfix] Pass FA version in `MultiHeadAttention` (#30575) 3 days ago
  rasmith 08f8a5627e
[CI/Build][Kernel][BugFix][AMD] Fix per_token_group_quant_fp8 to use correct fp8 min/max values and update atol/rtol in test_quantfp8_group_functionality (#30292) 3 days ago
  Kevin H. Luu b4039c08b5
[ci] Mark PrimeRL integration test as soft fail (#30578) 3 days ago
  Wentao Ye 1e6b115300
[Refactor] Reduce duplicate code in `per_token_group_quant` cuda kernels (#30496) 3 days ago
  danielafrimi 13618626df
[MoE-FP8-modelopt] Add FlashInfer alignment padding for intermediate dimensions (#29748) 3 days ago
  danielafrimi 6ec0d8dbe4
[Fix]Load kv-cache dtype from hf_quant_config.json automatically (#29980) 3 days ago
  Li, Jiang 9693dd0fe3
[CI/Build] Add x86 CPU wheel release pipeline (#28848) 3 days ago
  Xin Yang 1f19d8f899
[Perf] Set split_k to 1 for triton_kernels (#30528) 3 days ago
  shivampr cd7740ac5c
[ROCm] Enable Triton ScaledMM fallback + kernel selection fix (#26668) 3 days ago
  Wentao Ye 02a5880394
[CI] Fix mypy for vllm/v1/executor (#30517) 3 days ago
  realliujiaxu d2c919dcc2
[bugfix] fix bug when top_logprobs=0 with spec decoding (#30059) 3 days ago
  Benjamin Bartels f3237f3f6b
[Frontend] Fixes anthropic streaming message_start usage nesting (#30266) 3 days ago
  jvlunteren 9c0ee995a8
[Kernel] Support CUDA Graphs in 3D Triton Attention Kernel (#28306) 3 days ago
  Michael Goin 09ad3b76b3
[Bug] Fix attention_backend arg string parsing (#30534) 3 days ago
100 changed files with 2581 additions and 602 deletions
Split View
  1. +14
    -0
      .buildkite/release-pipeline.yaml
  2. +74
    -0
      .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
  3. +5
    -16
      .buildkite/test-amd.yaml
  4. +10
    -14
      .buildkite/test-pipeline.yaml
  5. +3
    -1
      .buildkite/test_areas/misc.yaml
  6. +1
    -11
      .buildkite/test_areas/tool_use.yaml
  7. +2
    -0
      README.md
  8. +1
    -1
      benchmarks/backend_request_func.py
  9. +83
    -98
      csrc/quantization/w8a8/fp8/per_token_group_quant.cu
  10. +3
    -0
      docker/Dockerfile.xpu
  11. +2
    -0
      docs/community/sponsors.md
  12. +1
    -1
      docs/configuration/optimization.md
  13. +20
    -1
      docs/deployment/docker.md
  14. +1
    -1
      docs/deployment/integrations/production-stack.md
  15. +2
    -2
      docs/design/cuda_graphs.md
  16. +1
    -1
      docs/design/optimization_levels.md
  17. +3
    -3
      docs/design/paged_attention.md
  18. +2
    -2
      docs/features/tool_calling.md
  19. +16
    -6
      docs/getting_started/quickstart.md
  20. +3
    -1
      docs/models/supported_models.md
  21. +2
    -2
      docs/serving/data_parallel_deployment.md
  22. +21
    -1
      docs/serving/expert_parallel_deployment.md
  23. +24
    -2
      docs/serving/parallelism_scaling.md
  24. +2
    -2
      docs/usage/security.md
  25. +73
    -44
      examples/offline_inference/audio_language.py
  26. +27
    -0
      examples/offline_inference/vision_language.py
  27. +1
    -1
      examples/online_serving/structured_outputs/structured_outputs.py
  28. +79
    -1
      tests/compile/distributed/test_fusions_e2e.py
  29. +8
    -2
      tests/compile/test_dynamic_shapes_compilation.py
  30. +7
    -1
      tests/conftest.py
  31. +1
    -2
      tests/entrypoints/openai/test_chat_error.py
  32. +6
    -3
      tests/entrypoints/openai/test_messages.py
  33. +14
    -14
      tests/entrypoints/openai/test_serving_chat.py
  34. +1
    -1
      tests/entrypoints/openai/test_serving_engine.py
  35. +3
    -3
      tests/entrypoints/openai/test_serving_responses.py
  36. +342
    -0
      tests/entrypoints/openai/test_sparse_tensor_validation.py
  37. +1
    -1
      tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
  38. +1
    -1
      tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
  39. +1
    -1
      tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
  40. +1
    -1
      tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
  41. +1
    -1
      tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
  42. +1
    -1
      tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
  43. +1
    -1
      tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
  44. +1
    -1
      tests/entrypoints/openai/tool_parsers/utils.py
  45. +107
    -6
      tests/entrypoints/test_chat_utils.py
  46. +2
    -2
      tests/kernels/attention/test_cutlass_mla_decode.py
  47. +2
    -2
      tests/kernels/attention/test_flashinfer_trtllm_attention.py
  48. +27
    -0
      tests/kernels/attention/test_triton_unified_attention.py
  49. +2
    -1
      tests/kernels/moe/modular_kernel_tools/common.py
  50. +14
    -0
      tests/kernels/moe/test_flashinfer.py
  51. +2
    -2
      tests/kernels/moe/test_ocp_mx_moe.py
  52. +3
    -3
      tests/kernels/quantization/test_awq.py
  53. +2
    -2
      tests/kernels/quantization/test_fp8_quant_group.py
  54. +91
    -0
      tests/kernels/quantization/test_scaled_mm_kernel_selection.py
  55. +1
    -0
      tests/models/fixtures/audioflamingo3/expected_results_batched.json
  56. +1
    -0
      tests/models/fixtures/audioflamingo3/expected_results_single.json
  57. +3
    -3
      tests/models/language/generation/test_mistral.py
  58. +31
    -0
      tests/models/language/pooling/test_token_classification.py
  59. +142
    -0
      tests/models/multimodal/generation/test_audioflamingo3.py
  60. +434
    -0
      tests/models/multimodal/generation/test_vit_backend_functionality.py
  61. +1
    -1
      tests/models/multimodal/generation/test_voxtral.py
  62. +123
    -111
      tests/models/multimodal/generation/test_whisper.py
  63. +125
    -0
      tests/models/multimodal/processing/test_audioflamingo3.py
  64. +2
    -5
      tests/models/multimodal/processing/test_common.py
  65. +8
    -0
      tests/models/multimodal/processing/test_tensor_schema.py
  66. +13
    -7
      tests/models/registry.py
  67. +134
    -0
      tests/multimodal/test_sparse_tensor_validation_unit.py
  68. +2
    -2
      tests/quantization/test_blackwell_moe.py
  69. +1
    -1
      tests/reasoning/test_mistral_reasoning_parser.py
  70. +1
    -1
      tests/reasoning/utils.py
  71. +9
    -2
      tests/test_inputs.py
  72. +24
    -23
      tests/tokenizers_/test_basic.py
  73. +1
    -1
      tests/tokenizers_/test_detokenize.py
  74. +21
    -2
      tests/tokenizers_/test_registry.py
  75. +0
    -0
      tests/tool_parsers/__init__.py
  76. +2
    -2
      tests/tool_parsers/test_deepseekv31_tool_parser.py
  77. +1
    -1
      tests/tool_parsers/test_ernie45_moe_tool_parser.py
  78. +2
    -4
      tests/tool_parsers/test_glm4_moe_tool_parser.py
  79. +1
    -3
      tests/tool_parsers/test_jamba_tool_parser.py
  80. +1
    -3
      tests/tool_parsers/test_kimi_k2_tool_parser.py
  81. +1
    -3
      tests/tool_parsers/test_minimax_tool_parser.py
  82. +3
    -6
      tests/tool_parsers/test_mistral_tool_parser.py
  83. +1
    -1
      tests/tool_parsers/test_openai_tool_parser.py
  84. +4
    -6
      tests/tool_parsers/test_qwen3coder_tool_parser.py
  85. +1
    -3
      tests/tool_parsers/test_seed_oss_tool_parser.py
  86. +1
    -3
      tests/tool_parsers/test_xlam_tool_parser.py
  87. +1
    -1
      tests/tool_use/test_tool_choice_required.py
  88. +5
    -0
      tests/v1/entrypoints/conftest.py
  89. +6
    -6
      tests/v1/kv_connector/unit/test_nixl_connector.py
  90. +10
    -12
      tests/v1/kv_offload/test_cpu_gpu.py
  91. +3
    -1
      tests/v1/sample/test_logprobs.py
  92. +1
    -1
      tests/v1/sample/test_rejection_sampler.py
  93. +2
    -2
      tests/v1/structured_output/test_utils.py
  94. +1
    -1
      tools/pre_commit/mypy.py
  95. +4
    -4
      vllm/_custom_ops.py
  96. +15
    -68
      vllm/attention/layer.py
  97. +284
    -0
      vllm/attention/layers/mm_encoder_attention.py
  98. +30
    -39
      vllm/attention/ops/triton_unified_attention.py
  99. +3
    -8
      vllm/attention/ops/vit_attn_wrappers.py
  100. +3
    -1
      vllm/benchmarks/serve.py

+ 14
- 0
.buildkite/release-pipeline.yaml View File

@@ -71,6 +71,20 @@ steps:
env:
DOCKER_BUILDKIT: "1"

# x86 CPU wheel build
- label: "Build x86 CPU wheel"
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"

# Build release images (12.9)
- label: "Build release image (x86)"
depends_on: ~


+ 74
- 0
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh View File

@@ -0,0 +1,74 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}

MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT

TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

+ 5
- 16
.buildkite/test-amd.yaml View File

@@ -61,8 +61,8 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
timeout_in_minutes: 20
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
grade: Blocking
@@ -73,6 +73,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -82,6 +83,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config

@@ -759,19 +761,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- pytest -v -s -m 'not cpu_test' tool_use

- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
- pytest -v -s tool_use

##### models test #####

@@ -1629,7 +1619,6 @@ steps:
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"


+ 10
- 14
.buildkite/test-pipeline.yaml View File

@@ -57,8 +57,8 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
timeout_in_minutes: 20
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
@@ -66,6 +66,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -75,6 +76,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config

@@ -672,16 +674,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- pytest -v -s -m 'not cpu_test' tool_use

- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
- pytest -v -s tool_use

##### models test #####

@@ -692,6 +685,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
- tests/models/registry.py
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -704,6 +698,7 @@ steps:
- vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py
- tests/models/registry.py
commands:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
@@ -836,7 +831,7 @@ steps:
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Processor Test
@@ -1346,6 +1341,7 @@ steps:
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
soft_fail: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
@@ -1379,4 +1375,4 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

+ 3
- 1
.buildkite/test_areas/misc.yaml View File

@@ -115,7 +115,7 @@ steps:

- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on: ~
timeout_in_minutes: 20
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
@@ -123,6 +123,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -132,6 +133,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config



+ 1
- 11
.buildkite/test_areas/tool_use.yaml View File

@@ -10,14 +10,4 @@ steps:
- vllm/
- tests/tool_use
commands:
- pytest -v -s -m 'not cpu_test' tool_use

- label: OpenAI-Compatible Tool Use (CPU)
depends_on: ~
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
- pytest -v -s tool_use

+ 2
- 0
README.md View File

@@ -143,11 +143,13 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
- IBM
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
- Red Hat
- Replicate
- Roblox
- RunPod


+ 1
- 1
benchmarks/backend_request_func.py View File

@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"


+ 83
- 98
csrc/quantization/w8a8/fp8/per_token_group_quant.cu View File

@@ -22,6 +22,62 @@ __device__ __forceinline__ float GroupReduceMax(float val) {
return val;
}

template <typename T, bool SCALE_UE8M0>
__device__ __forceinline__ float ComputeGroupScale(
const T* __restrict__ group_input, T* __restrict__ smem_group,
const int group_size, const int lane_id, const int threads_per_group,
const float eps, const float max_8bit) {
float local_absmax = eps;

constexpr int vec_size = 16 / sizeof(T);

// copy global -> shared & compute absmax
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
float abs_v = fabsf(static_cast<float>(src));
local_absmax = fmaxf(local_absmax, abs_v);
dst = src;
};

vllm::vectorize_with_alignment<vec_size>(
group_input, // in
smem_group, // out (shared)
group_size, // elements per group
lane_id, // thread id
threads_per_group, // stride in group
scalar_op_cache); // scalar handler

local_absmax = GroupReduceMax(local_absmax);

float y_s = local_absmax / max_8bit;
if constexpr (SCALE_UE8M0) {
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
}

return y_s;
}

template <typename T, typename DST_DTYPE>
__device__ __forceinline__ void QuantizeGroup(
const T* __restrict__ smem_group, DST_DTYPE* __restrict__ group_output,
const int group_size, const int lane_id, const int threads_per_group,
const float y_s, const float min_8bit, const float max_8bit) {
constexpr int vec_size = 16 / sizeof(T);

// quantize shared -> global 8-bit
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
dst = DST_DTYPE(q);
};

vllm::vectorize_with_alignment<vec_size>(
smem_group, // in (shared)
group_output, // out (global quant tensor)
group_size, // elements
lane_id, // tid
threads_per_group, // stride
scalar_op_quant); // scalar handler
}

template <typename T, typename DST_DTYPE, bool IS_COLUMN_MAJOR = false,
bool SCALE_UE8M0 = false, typename scale_packed_t = float>
__global__ void per_token_group_quant_8bit_kernel(
@@ -38,8 +94,6 @@ __global__ void per_token_group_quant_8bit_kernel(
const int64_t global_group_id = block_group_id + local_group_id;
const int64_t block_group_offset = global_group_id * group_size;

float local_absmax = eps;

using scale_element_t = float;
static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0);

@@ -68,30 +122,9 @@ __global__ void per_token_group_quant_8bit_kernel(
T* smem = reinterpret_cast<T*>(smem_raw);
T* smem_group = smem + local_group_id * group_size;

constexpr int vec_size = 16 / sizeof(T);
using vec_t = vllm::vec_n_t<T, vec_size>;

// copy global -> shared & compute absmax
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
float abs_v = fabsf(static_cast<float>(src));
local_absmax = fmaxf(local_absmax, abs_v);
dst = src;
};

vllm::vectorize_with_alignment<vec_size>(
group_input, // in
smem_group, // out (shared)
group_size, // elements per group
lane_id, // thread id
threads_per_group, // stride in group
scalar_op_cache); // scalar handler

local_absmax = GroupReduceMax(local_absmax);

float y_s = local_absmax / max_8bit;
if constexpr (SCALE_UE8M0) {
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
}
const float y_s = ComputeGroupScale<T, SCALE_UE8M0>(
group_input, smem_group, group_size, lane_id, threads_per_group, eps,
max_8bit);

scale_element_t y_s_quant = y_s;

@@ -101,19 +134,24 @@ __global__ void per_token_group_quant_8bit_kernel(

__syncthreads();

// quantize shared -> global 8-bit
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
dst = DST_DTYPE(q);
};
QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
threads_per_group, y_s, min_8bit, max_8bit);
}

vllm::vectorize_with_alignment<vec_size>(
smem_group, // in (shared)
group_output, // out (global quant tensor)
group_size, // elements
lane_id, // tid
threads_per_group, // stride
scalar_op_quant); // scalar handler
inline int GetGroupsPerBlock(int64_t num_groups) {
if (num_groups % 16 == 0) {
return 16;
}
if (num_groups % 8 == 0) {
return 8;
}
if (num_groups % 4 == 0) {
return 4;
}
if (num_groups % 2 == 0) {
return 2;
}
return 1;
}

void per_token_group_quant_8bit(const torch::Tensor& input,
@@ -133,17 +171,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,

constexpr int THREADS_PER_GROUP = 16;

int groups_per_block = 1;

if (num_groups % 16 == 0) {
groups_per_block = 16;
} else if (num_groups % 8 == 0) {
groups_per_block = 8;
} else if (num_groups % 4 == 0) {
groups_per_block = 4;
} else if (num_groups % 2 == 0) {
groups_per_block = 2;
}
const int groups_per_block = GetGroupsPerBlock(num_groups);

auto dst_type = output_q.scalar_type();
const int num_blocks = num_groups / groups_per_block;
@@ -225,8 +253,6 @@ __global__ void per_token_group_quant_8bit_packed_kernel(

const int64_t block_group_offset = global_group_id * group_size;

float local_absmax = eps;

const T* group_input = input + block_group_offset;
DST_DTYPE* group_output =
static_cast<DST_DTYPE*>(output_q) + block_group_offset;
@@ -235,29 +261,9 @@ __global__ void per_token_group_quant_8bit_packed_kernel(
extern __shared__ __align__(16) char smem_raw[];
T* smem = reinterpret_cast<T*>(smem_raw);
T* smem_group = smem + local_group_id * group_size;

constexpr int vec_size = 16 / sizeof(T);
using vec_t = vllm::vec_n_t<T, vec_size>;

// copy global -> shared & compute absmax
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
float abs_v = fabsf(static_cast<float>(src));
local_absmax = fmaxf(local_absmax, abs_v);
dst = src;
};

vllm::vectorize_with_alignment<vec_size>(
group_input, // in
smem_group, // out (shared)
group_size, // elements per group
lane_id, // thread id
threads_per_group, // stride in group
scalar_op_cache); // scalar handler

local_absmax = GroupReduceMax(local_absmax);

float y_s = local_absmax / max_8bit;
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
const float y_s =
ComputeGroupScale<T, true>(group_input, smem_group, group_size, lane_id,
threads_per_group, eps, max_8bit);

// pack 4 scales into a uint32
if (lane_id == 0) {
@@ -284,19 +290,8 @@ __global__ void per_token_group_quant_8bit_packed_kernel(

__syncthreads();

// quantize shared -> global 8-bit
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
dst = DST_DTYPE(q);
};

vllm::vectorize_with_alignment<vec_size>(
smem_group, // in (shared)
group_output, // out (global quant tensor)
group_size, // elements
lane_id, // tid
threads_per_group, // stride
scalar_op_quant); // scalar handler
QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
threads_per_group, y_s, min_8bit, max_8bit);
}

void per_token_group_quant_8bit_packed(const torch::Tensor& input,
@@ -337,17 +332,7 @@ void per_token_group_quant_8bit_packed(const torch::Tensor& input,

constexpr int THREADS_PER_GROUP = 16;

int groups_per_block = 1;

if (num_groups % 16 == 0) {
groups_per_block = 16;
} else if (num_groups % 8 == 0) {
groups_per_block = 8;
} else if (num_groups % 4 == 0) {
groups_per_block = 4;
} else if (num_groups % 2 == 0) {
groups_per_block = 2;
}
const int groups_per_block = GetGroupsPerBlock(num_groups);

auto dst_type = output_q.scalar_type();
const int num_blocks = num_groups / groups_per_block;


+ 3
- 0
docker/Dockerfile.xpu View File

@@ -76,6 +76,9 @@ RUN python3 -m pip install -e tests/vllm_test_utils
ENV NIXL_VERSION=0.7.0
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py

# PyJWT-2.7.0 will influence some wheel behaviors, remove its dist-info to avoid conflicts
RUN rm /usr/lib/python3/dist-packages/PyJWT-2.7.0.dist-info/ -rf

# remove torch bundled oneccl to avoid conflicts
RUN --mount=type=cache,target=/root/.cache/pip \
pip uninstall oneccl oneccl-devel -y


+ 2
- 0
docs/community/sponsors.md View File

@@ -24,11 +24,13 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
- IBM
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
- Red Hat
- Replicate
- Roblox
- RunPod


+ 1
- 1
docs/configuration/optimization.md View File

@@ -7,7 +7,7 @@ This guide covers optimization strategies and performance tuning for vLLM V1.

## Preemption

Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
Due to the autoregressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
In such cases, vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
available again. When this occurs, you may see the following warning:



+ 20
- 1
docs/deployment/docker.md View File

@@ -82,7 +82,7 @@ DOCKER_BUILDKIT=1 docker build . \

## Building for Arm64/aarch64

A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper and Grace-Blackwell. Using the flag `--platform "linux/arm64"` will build for arm64.

!!! note
Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
@@ -104,6 +104,25 @@ A docker container can be built for aarch64 systems such as the Nvidia Grace-Hop
--build-arg RUN_WHEEL_CHECK=false
```

For (G)B300, we recommend using CUDA 13, as shown in the following command.

??? console "Command"

```bash
DOCKER_BUILDKIT=1 docker build \
--build-arg CUDA_VERSION=13.0.1 \
--build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 \
--build-arg max_jobs=256 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list='9.0 10.0+PTX' \
--platform "linux/arm64" \
--tag vllm/vllm-gb300-openai:latest \
--target vllm-openai \
-f docker/Dockerfile \
.
```

!!! note
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.



+ 1
- 1
docs/deployment/integrations/production-stack.md View File

@@ -4,7 +4,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le

* **Upstream vLLM compatibility** – It wraps around upstream vLLM without modifying its code.
* **Ease of use** – Simplified deployment via Helm charts and observability through Grafana dashboards.
* **High performance** – Optimized for LLM workloads with features like multi-model support, model-aware and prefix-aware routing, fast vLLM bootstrapping, and KV cache offloading with [LMCache](https://github.com/LMCache/LMCache), among others.
* **High performance** – Optimized for LLM workloads with features like multimodel support, model-aware and prefix-aware routing, fast vLLM bootstrapping, and KV cache offloading with [LMCache](https://github.com/LMCache/LMCache), among others.

If you are new to Kubernetes, don't worry: in the vLLM production stack [repo](https://github.com/vllm-project/production-stack), we provide a step-by-step [guide](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) and a [short video](https://www.youtube.com/watch?v=EsTJbQtzj0g) to set up everything and get started in **4 minutes**!



+ 2
- 2
docs/design/cuda_graphs.md View File

@@ -41,7 +41,7 @@ These features allow the most flexibility for cudagraph capture and compilation
* `NONE` — turn CUDA Graphs off. Good for debugging.
* `PIECEWISE` — a single-mode strategy (and past default). It is the most flexible: attention or other CUDA Graphs-incompatible operations stay eager, everything else goes into CUDA Graphs. Requires piecewise compilation.
* `FULL` — a single-mode strategy, which only captures full CUDA Graphs for non-uniform batches, then uniform-decode batches reuse the CUDA Graph of non-uniform batch of the same batch_size, since they are compatible; can be good for small models or workloads with small prompts.
* `FULL_DECODE_ONLY` — full CUDA Graph for uniform decode, no cudagraph for prefill/mixed etc; suitable for decode instances in a P/D setup where prefill is not as important, this way we can save the memory needed for `PIECEWISE` CUDA Graphs.
* `FULL_DECODE_ONLY` — full CUDA Graph for uniform decode, no cudagraph for prefill/mixed etc.; suitable for decode instances in a P/D setup where prefill is not as important, this way we can save the memory needed for `PIECEWISE` CUDA Graphs.
* `FULL_AND_PIECEWISE` — (default mode) full CUDA Graph for uniform decode, piecewise CUDA Graphs for others; generally the most performant setting, especially for low latency with small models or MoEs, but also requires the most memory and takes the longest to capture.

Defaults: If you’re on v1 with piecewise compilation, we default to `FULL_AND_PIECEWISE` for better performance, (for pooling models, it's still `PIECEWISE`). Otherwise, e.g. if piecewise compilation unavailable, we default to `NONE`.
@@ -49,7 +49,7 @@ Defaults: If you’re on v1 with piecewise compilation, we default to `FULL_AND_
While `NONE` , `PIECEWISE`, and `FULL` are single-mode configurations and simply equivalent to past implementations of eager execution, piecewise CUDA Graphs, and full CUDA Graphs respectively, `FULL_DECODE_ONLY` and `FULL_AND_PIECEWISE` are newly appended dual-mode configurations, which require dispatching to switch between concrete runtime modes according to runtime batches dynamically.

!!! note
Here, the single-modes `NONE`, `PIECEWISE`, and `FULL` are treated as the runtime modes for CUDA Graphs dispatching. If using a dual-mode, the dispatcher will always dispatch to one of its member modes (plus a potantial `NONE` if no suitable CUDA Graph available), depending on the batch composition.
Here, the single-modes `NONE`, `PIECEWISE`, and `FULL` are treated as the runtime modes for CUDA Graphs dispatching. If using a dual-mode, the dispatcher will always dispatch to one of its member modes (plus a potential `NONE` if no suitable CUDA Graph available), depending on the batch composition.

While cascade attention is not cudagraph compatible, it is now compatible with all possible cudagraph mode configurations. If a batch uses cascade attention, it always gets dispatched to `PIECEWISE` mode if available (otherwise `NONE`).



+ 1
- 1
docs/design/optimization_levels.md View File

@@ -4,7 +4,7 @@

## Overview

vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechnaism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out of the box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out-of-the-box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.

## Level Summaries and Usage Examples
```bash


+ 3
- 3
docs/design/paged_attention.md View File

@@ -36,7 +36,7 @@ the input pointers `q`, `k_cache`, and `v_cache`, which point
to query, key, and value data on global memory that need to be read
and processed. The output pointer `out` points to global memory
where the result should be written. These four pointers actually
refer to multi-dimensional arrays, but each thread only accesses the
refer to multidimensional arrays, but each thread only accesses the
portion of data assigned to it. I have omitted all other runtime
parameters here for simplicity.

@@ -229,7 +229,7 @@ manner.

## QK

As shown the pseudo code below, before the entire for loop block, we
As shown the pseudocode below, before the entire for loop block, we
fetch the query data for one token and store it in `q_vecs`. Then,
in the outer for loop, we iterate through different `k_ptrs` that
point to different tokens and prepare the `k_vecs` in the inner for
@@ -403,7 +403,7 @@ for ... { // Iteration over different blocks.
}
```

As shown in the above pseudo code, in the outer loop, similar to
As shown in the above pseudocode, in the outer loop, similar to
`k_ptr`, `logits_vec` iterates over different blocks and reads
`V_VEC_SIZE` elements from `logits`. In the inner loop, each
thread reads `V_VEC_SIZE` elements from the same tokens as a


+ 2
- 2
docs/features/tool_calling.md View File

@@ -420,7 +420,7 @@ Flags: `--tool-call-parser pythonic --chat-template {see_above}`

## How to Write a Tool Parser Plugin

A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py](../../vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py).
A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/tool_parsers/hermes_tool_parser.py](../../vllm/tool_parsers/hermes_tool_parser.py).

Here is a summary of a plugin file:

@@ -468,7 +468,7 @@ Here is a summary of a plugin file:
# register the tool parser to ToolParserManager
ToolParserManager.register_lazy_module(
name="example",
module_path="vllm.entrypoints.openai.tool_parsers.example",
module_path="vllm.tool_parsers.example",
class_name="ExampleToolParser",
)



+ 16
- 6
docs/getting_started/quickstart.md View File

@@ -281,17 +281,27 @@ Alternatively, you can use the `openai` Python package:

Currently, vLLM supports multiple backends for efficient Attention computation across different platforms and accelerator architectures. It automatically selects the most performant backend compatible with your system and model specifications.

If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
If desired, you can also manually set the backend of your choice using the `--attention-backend` CLI argument:

```bash
# For online serving
vllm serve Qwen/Qwen2.5-1.5B-Instruct --attention-backend FLASH_ATTN

# For offline inference
python script.py --attention-backend FLASHINFER
```

Some of the available backend options include:

- On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`.
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.

For AMD ROCm, you can further control the specific Attention implementation using the following variables:
For AMD ROCm, you can further control the specific Attention implementation using the following options:

- Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
- AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0`
- Triton Prefill-Decode Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0`
- AITER Multi-head Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=1`
- Triton Unified Attention: Set the environment variables `VLLM_ROCM_USE_AITER=0 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument.
- AITER Unified Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument.
- Triton Prefill-Decode Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=true` as a CLI argument.
- AITER Multi-head Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_MHA=1` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument.

!!! warning
There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [docker/Dockerfile](../../docker/Dockerfile) for instructions on how to install it.

+ 3
- 1
docs/models/supported_models.md View File

@@ -659,7 +659,9 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|----------------------|---------------------------|
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
@@ -743,7 +745,7 @@ Some models are supported only via the [Transformers modeling backend](#transfor
- There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups.

!!! note
For `InternVLChatModel`, only InternVL2.5 with Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B` etc), InternVL3 and InternVL3.5 have video inputs support currently.
For `InternVLChatModel`, only InternVL2.5 with Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B` etc.), InternVL3 and InternVL3.5 have video inputs support currently.

!!! note
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.


+ 2
- 2
docs/serving/data_parallel_deployment.md View File

@@ -8,11 +8,11 @@ For MoE models, particularly those like DeepSeek that employ MLA (Multi-head Lat

In these cases, the data parallel ranks are not completely independent. Forward passes must be aligned, and expert layers across all ranks are required to synchronize during every forward pass, even when there are fewer requests to be processed than DP ranks.

The expert layers will by default form a (DP x TP) sized tensor parallel group. To enable expert parallelism, include the `--enable-expert-parallel` CLI arg (on all nodes in the multi-node case).
By default, expert layers form a tensor parallel group of size `DP × TP`. To use expert parallelism instead, include the `--enable-expert-parallel` CLI arg (on all nodes in the multi-node case). See [Expert Parallel Deployment](expert_parallel_deployment.md) for details on how attention and expert layers behave differently with EP enabled.

In vLLM, each DP rank is deployed as a separate "core engine" process that communicates with front-end process(es) via ZMQ sockets. Data Parallel attention can be combined with Tensor Parallel attention, in which case each DP engine owns a number of per-GPU worker processes equal to the configured TP size.

For MoE models, when any requests are in progress in any rank, we must ensure that empty "dummy" forward passes are performed in all ranks that don't currently have any requests scheduled. This is handled via a separate DP Coordinator process that communicates with all ranks, and a collective operation performed every N steps to determine when all ranks become idle and can be paused. When TP is used in conjunction with DP, expert layers form an EP or TP group of size (DP x TP).
For MoE models, when any requests are in progress in any rank, we must ensure that empty "dummy" forward passes are performed in all ranks that don't currently have any requests scheduled. This is handled via a separate DP Coordinator process that communicates with all ranks, and a collective operation performed every N steps to determine when all ranks become idle and can be paused. When TP is used in conjunction with DP, expert layers form a group of size `DP × TP` (using either tensor parallelism by default, or expert parallelism if `--enable-expert-parallel` is set).

In all cases, it is beneficial to load-balance requests between DP ranks. For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently.



+ 21
- 1
docs/serving/expert_parallel_deployment.md View File

@@ -44,7 +44,27 @@ Where:
- `DP_SIZE`: Data parallel size
- `EP_SIZE`: Expert parallel size (computed automatically)

When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`.
### Layer Behavior with EP Enabled

When EP is enabled, different layers in MoE models behave differently:

| Layer Type | Behavior | Parallelism Used |
|------------|----------|------------------|
| **Expert (MoE) Layers** | Sharded across all EP ranks | Expert Parallel (EP) of size `TP × DP` |
| **Attention Layers** | Behavior depends on TP size | See below |

**Attention layer parallelism:**

- **When `TP = 1`**: Attention weights are **replicated** across all DP ranks (data parallelism)
- **When `TP > 1`**: Attention weights are **sharded** using tensor parallelism across TP ranks within each DP group

For example, with `TP=2, DP=4` (8 GPUs total):

- Expert layers form an EP group of size 8, with experts distributed across all GPUs
- Attention layers use TP=2 within each of the 4 DP groups

!!! note "Key Difference from Data Parallel Deployment"
Without `--enable-expert-parallel`, MoE layers would use tensor parallelism (forming a TP group of size `TP × DP`), similar to dense models. With EP enabled, expert layers switch to expert parallelism, which can provide better efficiency and locality for MoE models.

### Example Command



+ 24
- 2
docs/serving/parallelism_scaling.md View File

@@ -62,7 +62,7 @@ If a single node lacks sufficient GPUs to hold the model, deploy vLLM across mul

### What is Ray?

Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments require Ray as the runtime engine.
Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments can use Ray as the runtime engine.

vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens.

@@ -130,9 +130,31 @@ vllm serve /path/to/the/model/in/the/container \
--distributed-executor-backend ray
```

### Running vLLM with MultiProcessing

Besides Ray, multi-node vLLM deployments can also use `multiprocessing` as the runtime engine. Here's an example that deploys a model across 2 nodes (8 GPUs per node) with `tp_size=8` and `pp_size=2`.

Choose one node as the head node and run:

```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 8 --pipeline-parallel-size 2 \
--nnodes 2 --node-rank 0 \
--master-addr <HEAD_NODE_IP>
```

On the other worker node, run:

```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 8 --pipeline-parallel-size 2 \
--nnodes 2 --node-rank 1 \
--master-addr <HEAD_NODE_IP> --headless
```

## Optimizing network communication for tensor parallelism

Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.
Efficient tensor parallelism requires fast internode communication, preferably through high-speed network adapters such as InfiniBand.
To set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the
[examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) helper script.
Contact your system administrator for more information about the required flags.


+ 2
- 2
docs/usage/security.md View File

@@ -10,7 +10,7 @@ All communications between nodes in a multi-node vLLM deployment are **insecure

### Configuration Options for Inter-Node Communications

The following options control inter-node communications in vLLM:
The following options control internode communications in vLLM:

#### 1. **Environment Variables:**

@@ -28,7 +28,7 @@ The following options control inter-node communications in vLLM:

### Notes on PyTorch Distributed

vLLM uses PyTorch's distributed features for some inter-node communication. For
vLLM uses PyTorch's distributed features for some internode communication. For
detailed information about PyTorch Distributed security considerations, please
refer to the [PyTorch Security
Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).


+ 73
- 44
examples/offline_inference/audio_language.py View File

@@ -42,60 +42,31 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4.


# Voxtral
# Make sure to install mistral-common[audio].
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import (
AudioChunk,
RawAudio,
TextChunk,
)
from mistral_common.protocol.instruct.messages import (
UserMessage,
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

model_name = "mistralai/Voxtral-Mini-3B-2507"
tokenizer = MistralTokenizer.from_hf_hub(model_name)

# AudioFlamingo3
def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
model_name = "nvidia/audio-flamingo-3-hf"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
config_format="mistral",
load_format="mistral",
tokenizer_mode="mistral",
enforce_eager=True,
enable_chunked_prefill=False,
)

text_chunk = TextChunk(text=question)
audios = [
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
for i in range(audio_count)
]
audio_chunks = [
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
]

messages = [UserMessage(content=[*audio_chunks, text_chunk])]

req = ChatCompletionRequest(messages=messages, model=model_name)

tokens = tokenizer.encode_chat_completion(req)
prompt_ids, audios = tokens.tokens, tokens.audios

audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
# AudioFlamingo3 uses <sound> token for audio
audio_placeholder = "<sound>" * audio_count

multi_modal_data = {"audio": audios_and_sr}
prompt = (
"<|im_start|>system\n"
"You are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_placeholder}{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)

return ModelRequestData(
engine_args=engine_args,
prompt_token_ids=prompt_ids,
multi_modal_data=multi_modal_data,
prompt=prompt,
)


@@ -361,6 +332,63 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
)


# Voxtral
# Make sure to install mistral-common[audio].
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
    """Build engine args and a pre-tokenized audio prompt for Voxtral-Mini.

    Uses the Mistral tokenizer to encode a single user message containing
    `audio_count` audio clips followed by the text question, and returns the
    token ids plus (array, sampling_rate) pairs as multi-modal data.
    """
    from mistral_common.audio import Audio
    from mistral_common.protocol.instruct.chunk import (
        AudioChunk,
        RawAudio,
        TextChunk,
    )
    from mistral_common.protocol.instruct.messages import UserMessage
    from mistral_common.protocol.instruct.request import ChatCompletionRequest
    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

    model_name = "mistralai/Voxtral-Mini-3B-2507"
    mistral_tokenizer = MistralTokenizer.from_hf_hub(model_name)

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"audio": audio_count},
        config_format="mistral",
        load_format="mistral",
        tokenizer_mode="mistral",
        enforce_eager=True,
        enable_chunked_prefill=False,
    )

    # Assemble one user message: all audio chunks first, then the text chunk.
    content_chunks: list = []
    for idx in range(audio_count):
        clip = Audio.from_file(
            str(audio_assets[idx].get_local_path()), strict=False
        )
        content_chunks.append(AudioChunk(input_audio=RawAudio.from_audio(clip)))
    content_chunks.append(TextChunk(text=question))

    request = ChatCompletionRequest(
        messages=[UserMessage(content=content_chunks)], model=model_name
    )
    encoded = mistral_tokenizer.encode_chat_completion(request)

    # vLLM expects each audio as an (array, sampling_rate) tuple.
    audio_inputs = [(a.audio_array, a.sampling_rate) for a in encoded.audios]

    return ModelRequestData(
        engine_args=engine_args,
        prompt_token_ids=encoded.tokens,
        multi_modal_data={"audio": audio_inputs},
    )


# Whisper
def run_whisper(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, "Whisper only support single audio input per prompt"
@@ -382,7 +410,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:


model_example_map = {
"voxtral": run_voxtral,
"audioflamingo3": run_audioflamingo3,
"gemma3n": run_gemma3n,
"granite_speech": run_granite_speech,
"midashenglm": run_midashenglm,
@@ -392,6 +420,7 @@ model_example_map = {
"qwen2_audio": run_qwen2_audio,
"qwen2_5_omni": run_qwen2_5_omni,
"ultravox": run_ultravox,
"voxtral": run_voxtral,
"whisper": run_whisper,
}



+ 27
- 0
examples/offline_inference/vision_language.py View File

@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
)


def run_bagel(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine args and chat-formatted prompts for BAGEL (image only).

    BAGEL uses a Qwen-style chat template with an `<|image_pad|>` placeholder
    for the single image per prompt.
    """
    assert modality == "image"
    model_name = "ByteDance-Seed/BAGEL-7B-MoT"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    template = (
        "<|im_start|>user\n<|image_pad|>\n{q}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    prompts = [template.format(q=question) for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_example_map = {
"aria": run_aria,
"aya_vision": run_aya_vision,
"bagel": run_bagel,
"bee": run_bee,
"blip-2": run_blip2,
"chameleon": run_chameleon,


+ 1
- 1
examples/online_serving/structured_outputs/structured_outputs.py View File

@@ -112,7 +112,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"messages": [
{
"role": "user",
"content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.",
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
}
],
"extra_body": {


+ 79
- 1
tests/compile/distributed/test_fusions_e2e.py View File

@@ -20,13 +20,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer

from ...utils import flat_product, multi_gpu_test

is_blackwell = lambda: current_platform.is_device_capability(100)
is_blackwell = lambda: current_platform.is_device_capability_family(100)
"""Are we running on Blackwell, a lot of tests depend on it"""


class Matches(NamedTuple):
attention_fusion: int = 0
allreduce_fusion: int = 0
rms_quant_norm_fusion: int = 0
sequence_parallel: int = 0
async_tp: int = 0

@@ -40,6 +41,7 @@ class ModelBackendTestCase(NamedTuple):

MODELS_FP8: list[ModelBackendTestCase] = []
MODELS_FP4: list[ModelBackendTestCase] = []
MODELS_GROUP_FP8: list[ModelBackendTestCase] = []
MODELS: list[ModelBackendTestCase] = [] # tp-only

if current_platform.is_cuda():
@@ -498,3 +500,79 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
compilation_config.compile_ranges_split_points = (
llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
)


# Register the grouped-FP8 fusion test case only on CUDA: the FP8 checkpoint
# and the TRITON_ATTN backend used here are CUDA-specific.
if current_platform.is_cuda():
    MODELS_GROUP_FP8 = [
        ModelBackendTestCase(
            model_name="Qwen/Qwen3-30B-A3B-FP8",
            # Small context and fp8 KV cache keep the run within memory budget.
            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                # Expected pattern-replacement count reported by fusion.py.
                rms_quant_norm_fusion=48,
            ),
        ),
    ]

# Custom-op spec that forces eager quant_fp8 and rms_norm ops so the fusion
# pass has patterns to match and replace.
CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"]


@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Test rms norm+group quant_fp8 fusion
    list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
def test_rms_group_quant(
    model_name: str,
    model_kwargs: dict[str, Any],
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    """End-to-end check that the rms_norm + group quant_fp8 fusion pass fires.

    Runs the model in a spawned subprocess, captures its logs, and asserts
    that fusion.py reports exactly ``matches.rms_quant_norm_fusion`` replaced
    patterns.
    """
    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    custom_ops_list = custom_ops.split(",") if custom_ops else []

    # Pick a cudagraph mode/splitting-op combo appropriate for each
    # graph-partition strategy.
    if inductor_graph_partition:
        mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    # Disable the compile cache to make sure custom passes run.
    # Otherwise, we can't verify fusion happened through the logs.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)

    compilation_config = CompilationConfig(
        # Testing properties
        custom_ops=custom_ops_list,
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(eliminate_noops=True, enable_fusion=True),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(compilation_config, model_name, **model_kwargs)

    # fusion.py logs one "Replaced N patterns" line; expect exactly one match
    # with the configured count.
    log_matches = re.findall(
        r"\[fusion.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 1, log_holder.text
    assert int(log_matches[0]) == matches.rms_quant_norm_fusion

+ 8
- 2
tests/compile/test_dynamic_shapes_compilation.py View File

@@ -36,7 +36,7 @@ def get_test_models():
DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
],
)
@pytest.mark.parametrize("use_aot_compile", ["0"])
@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.parametrize("evaluate_guards", [False, True])
@pytest.mark.skipif(
@@ -54,6 +54,12 @@ def test_dynamic_shapes_compilation(
if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")

if evaluate_guards and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("unbacked dynamic shapes do not add guards")

if evaluate_guards and use_aot_compile:
pytest.skip("evaluate_guards requires use_aot_compile=0")

monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")

@@ -120,7 +126,7 @@ def test_model_specialization_with_evaluate_guards(
and dynamic_shapes_type == DynamicShapesType.BACKED
and evaluate_guards
):
pytest.skip("evaluate_guards for backed does not work with aot_compile =1")
pytest.skip("evaluate_guards for backed does not work with aot_compile=1")

@support_torch_compile
class ModelWithSizeCheck(torch.nn.Module):


+ 7
- 1
tests/conftest.py View File

@@ -702,10 +702,16 @@ class HfRunner:
**kwargs,
)

# Encoder-decoder models return decoder_hidden_states instead of
# hidden_states
hidden_states = (
getattr(output, "hidden_states", None) or output.decoder_hidden_states
)

(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs)
) = self._hidden_states_to_logprobs(hidden_states, num_logprobs)

all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]


+ 1
- 2
tests/entrypoints/openai/test_chat_error.py View File

@@ -80,10 +80,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
return dict(engine_prompt), {}

async def _fake_preprocess_chat(*args, **kwargs):
# return conversation, request_prompts, engine_prompts
# return conversation, engine_prompts
return (
[{"role": "user", "content": "Test"}],
[[1, 2, 3]],
[{"prompt_token_ids": [1, 2, 3]}],
)



+ 6
- 3
tests/entrypoints/openai/test_messages.py View File

@@ -79,9 +79,12 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):

assert chunk_count > 0
assert first_chunk is not None, "message_start chunk was never observed"
assert first_chunk.usage is not None, "first chunk should include usage stats"
assert first_chunk.usage["output_tokens"] == 0
assert first_chunk.usage["input_tokens"] > 5
assert first_chunk.message is not None, "first chunk should include message"
assert first_chunk.message.usage is not None, (
"first chunk should include usage stats"
)
assert first_chunk.message.usage.output_tokens == 0
assert first_chunk.message.usage.input_tokens > 5


@pytest.mark.asyncio


+ 14
- 14
tests/entrypoints/openai/test_serving_chat.py View File

@@ -19,9 +19,9 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers import ToolParserManager
from vllm.v1.engine.async_llm import AsyncLLM

from ...utils import RemoteOpenAIServer
@@ -877,7 +877,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
[
@@ -905,7 +905,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
[
@@ -927,7 +927,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
[
@@ -971,7 +971,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
[
@@ -1008,7 +1008,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
[
@@ -1052,7 +1052,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
[
@@ -1089,7 +1089,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)
verify_harmony_messages(
input_messages,
[
@@ -1133,7 +1133,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
[
@@ -1183,7 +1183,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_3, _, _ = serving_chat._make_request_with_harmony(req_3)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
verify_harmony_messages(
input_messages_3,
[
@@ -1246,7 +1246,7 @@ class TestServingChatWithHarmony:

# Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_4, _, _ = serving_chat._make_request_with_harmony(req_4)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
verify_harmony_messages(
input_messages_4,
[
@@ -1295,7 +1295,7 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)

verify_harmony_messages(
input_messages,
@@ -1327,7 +1327,7 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)

verify_harmony_messages(
input_messages,
@@ -1357,7 +1357,7 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = serving_chat._make_request_with_harmony(req)

verify_harmony_messages(
input_messages,


+ 1
- 1
tests/entrypoints/openai/test_serving_engine.py View File

@@ -10,7 +10,7 @@ import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer


@pytest.fixture()


+ 3
- 3
tests/entrypoints/openai/test_serving_responses.py View File

@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.serving_responses import (
extract_tool_types,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.data import TokensPrompt


class MockConversationContext(ConversationContext):
@@ -237,7 +237,7 @@ class TestValidateGeneratorInput:
"""Test _validate_generator_input with valid prompt length"""
# Create an engine prompt with valid length (less than max_model_len)
valid_prompt_token_ids = list(range(5)) # 5 tokens < 100 max_model_len
engine_prompt = EngineTokensPrompt(prompt_token_ids=valid_prompt_token_ids)
engine_prompt = TokensPrompt(prompt_token_ids=valid_prompt_token_ids)

# Call the method
result = serving_responses_instance._validate_generator_input(engine_prompt)
@@ -247,7 +247,7 @@ class TestValidateGeneratorInput:

# create an invalid engine prompt
invalid_prompt_token_ids = list(range(200)) # 100 tokens >= 100 max_model_len
engine_prompt = EngineTokensPrompt(prompt_token_ids=invalid_prompt_token_ids)
engine_prompt = TokensPrompt(prompt_token_ids=invalid_prompt_token_ids)

# Call the method
result = serving_responses_instance._validate_generator_input(engine_prompt)


+ 342
- 0
tests/entrypoints/openai/test_sparse_tensor_validation.py View File

@@ -0,0 +1,342 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Sparse tensor validation in embedding APIs.

Tests verify that malicious sparse tensors are rejected before they can trigger
out-of-bounds memory writes during to_dense() operations.
"""

import base64
import io

import pytest
import torch

from vllm.entrypoints.renderer import CompletionRenderer
from vllm.multimodal.audio import AudioEmbeddingMediaIO
from vllm.multimodal.image import ImageEmbeddingMediaIO


def _encode_tensor(tensor: torch.Tensor) -> bytes:
"""Helper to encode a tensor as base64 bytes."""
buffer = io.BytesIO()
torch.save(tensor, buffer)
buffer.seek(0)
return base64.b64encode(buffer.read())


def _create_malicious_sparse_tensor() -> torch.Tensor:
"""
Create a malicious sparse COO tensor with out-of-bounds indices.

This tensor has indices that point beyond the declared shape, which would
cause an out-of-bounds write when converted to dense format without
validation.
"""
# Create a 3x3 sparse tensor but with indices pointing to (10, 10)
indices = torch.tensor([[10], [10]]) # Out of bounds for 3x3 shape
values = torch.tensor([1.0])
shape = (3, 3)

# Create sparse tensor (this will be invalid)
sparse_tensor = torch.sparse_coo_tensor(indices, values, shape, dtype=torch.float32)
return sparse_tensor


def _create_valid_sparse_tensor() -> torch.Tensor:
"""Create a valid sparse COO tensor for baseline testing."""
indices = torch.tensor([[0, 1, 2], [0, 1, 2]])
values = torch.tensor([1.0, 2.0, 3.0])
shape = (3, 3)

sparse_tensor = torch.sparse_coo_tensor(indices, values, shape, dtype=torch.float32)
return sparse_tensor


def _create_valid_dense_tensor() -> torch.Tensor:
"""Create a valid dense tensor for baseline testing."""
return torch.randn(10, 768, dtype=torch.float32) # (seq_len, hidden_size)


class TestPromptEmbedsValidation:
    """Sparse-tensor validation for prompt embeddings (Completions API)."""

    def test_valid_dense_tensor_accepted(self, model_config):
        """Baseline: a well-formed dense tensor loads without error."""
        renderer = CompletionRenderer(model_config)
        dense = _create_valid_dense_tensor()

        loaded = renderer.load_prompt_embeds(_encode_tensor(dense))

        assert len(loaded) == 1
        assert loaded[0]["prompt_embeds"].shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: a well-formed sparse tensor loads successfully."""
        # NOTE(review): this uses ImageEmbeddingMediaIO rather than the
        # CompletionRenderer used elsewhere in this class — confirm intended.
        handler = ImageEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()

        # Must not raise; this handler keeps sparse tensors sparse.
        loaded = handler.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        assert loaded.shape == sparse.shape

    def test_malicious_sparse_tensor_rejected(self, model_config):
        """Security: an out-of-bounds sparse tensor must be refused."""
        renderer = CompletionRenderer(model_config)
        payload = _encode_tensor(_create_malicious_sparse_tensor())

        # Invalid sparse tensors must fail validation before densification.
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            renderer.load_prompt_embeds(payload)

        # The failure should point at sparse/index validation.
        message = str(exc_info.value).lower()
        assert "sparse" in message or "index" in message or "bounds" in message

    def test_extremely_large_indices_rejected(self, model_config):
        """Security: absurdly large sparse indices must be refused."""
        renderer = CompletionRenderer(model_config)

        # Indices far outside the declared 10x10 shape.
        oob_tensor = torch.sparse_coo_tensor(
            torch.tensor([[999999], [999999]]),
            torch.tensor([1.0]),
            (10, 10),
            dtype=torch.float32,
        )

        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(_encode_tensor(oob_tensor))

    def test_negative_indices_rejected(self, model_config):
        """Security: negative sparse indices must be refused."""
        renderer = CompletionRenderer(model_config)

        negative_tensor = torch.sparse_coo_tensor(
            torch.tensor([[-1], [-1]]),
            torch.tensor([1.0]),
            (10, 10),
            dtype=torch.float32,
        )

        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(_encode_tensor(negative_tensor))


class TestImageEmbedsValidation:
    """Sparse-tensor validation for image embeddings (Chat API)."""

    def test_valid_dense_tensor_accepted(self):
        """Baseline: a well-formed dense tensor loads without error."""
        handler = ImageEmbeddingMediaIO()
        dense = _create_valid_dense_tensor()

        loaded = handler.load_base64("", _encode_tensor(dense).decode("utf-8"))
        assert loaded.shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: a well-formed sparse tensor loads successfully."""
        # NOTE(review): this instantiates AudioEmbeddingMediaIO inside the
        # image test class — looks copy-pasted; confirm intended.
        handler = AudioEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()

        # Must not raise.
        loaded = handler.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        assert loaded.shape == sparse.shape

    def test_malicious_sparse_tensor_rejected(self):
        """Security: an out-of-bounds sparse tensor must be refused."""
        handler = ImageEmbeddingMediaIO()
        payload = _encode_tensor(_create_malicious_sparse_tensor())

        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            handler.load_base64("", payload.decode("utf-8"))

        message = str(exc_info.value).lower()
        assert "sparse" in message or "index" in message or "bounds" in message

    def test_load_bytes_validates(self):
        """Security: the raw-bytes entry point is validated as well."""
        handler = ImageEmbeddingMediaIO()

        raw = io.BytesIO()
        torch.save(_create_malicious_sparse_tensor(), raw)

        with pytest.raises((RuntimeError, ValueError)):
            handler.load_bytes(raw.getvalue())


class TestAudioEmbedsValidation:
    """Sparse-tensor validation for audio embeddings (Chat API)."""

    def test_valid_dense_tensor_accepted(self):
        """Baseline: a well-formed dense tensor loads without error."""
        handler = AudioEmbeddingMediaIO()
        dense = _create_valid_dense_tensor()

        loaded = handler.load_base64("", _encode_tensor(dense).decode("utf-8"))
        assert loaded.shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: a well-formed sparse tensor is densified on load."""
        handler = AudioEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()

        # Must not raise, and the handler returns a dense tensor.
        loaded = handler.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        assert loaded.is_sparse is False

    def test_malicious_sparse_tensor_rejected(self):
        """Security: an out-of-bounds sparse tensor must be refused."""
        handler = AudioEmbeddingMediaIO()
        payload = _encode_tensor(_create_malicious_sparse_tensor())

        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            handler.load_base64("", payload.decode("utf-8"))

        message = str(exc_info.value).lower()
        assert "sparse" in message or "index" in message or "bounds" in message

    def test_load_bytes_validates(self):
        """Security: the raw-bytes entry point is validated as well."""
        handler = AudioEmbeddingMediaIO()

        raw = io.BytesIO()
        torch.save(_create_malicious_sparse_tensor(), raw)

        with pytest.raises((RuntimeError, ValueError)):
            handler.load_bytes(raw.getvalue())


class TestSparseTensorValidationIntegration:
    """End-to-end checks that every entry point blocks the attack chain."""

    def test_attack_scenario_completions_api(self, model_config):
        """
        Full attack through the Completions API.

        1. craft a malicious sparse tensor,
        2. base64-encode it,
        3. submit it via the ``prompt_embeds`` parameter,
        4. the server must reject it before any memory corruption occurs.
        """
        renderer = CompletionRenderer(model_config)

        # Steps 1-2: attacker builds the payload.
        payload = _encode_tensor(_create_malicious_sparse_tensor())

        # Steps 3-4: server processing must reject it.
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(payload)

    def test_attack_scenario_chat_api_image(self):
        """Same attack through the Chat API image_embeds path."""
        handler = ImageEmbeddingMediaIO()
        payload = _encode_tensor(_create_malicious_sparse_tensor())

        with pytest.raises((RuntimeError, ValueError)):
            handler.load_base64("", payload.decode("utf-8"))

    def test_attack_scenario_chat_api_audio(self):
        """Same attack through the Chat API audio_embeds path."""
        handler = AudioEmbeddingMediaIO()
        payload = _encode_tensor(_create_malicious_sparse_tensor())

        with pytest.raises((RuntimeError, ValueError)):
            handler.load_base64("", payload.decode("utf-8"))

    def test_multiple_valid_embeddings_in_batch(self, model_config):
        """Regression: a batch of valid embeddings still loads cleanly."""
        renderer = CompletionRenderer(model_config)

        batch = [_encode_tensor(_create_valid_dense_tensor()) for _ in range(3)]

        # All three entries must be processed without error.
        assert len(renderer.load_prompt_embeds(batch)) == 3

    def test_mixed_valid_and_malicious_rejected(self, model_config):
        """Security: one malicious tensor poisons the whole batch."""
        renderer = CompletionRenderer(model_config)

        batch = [
            _encode_tensor(_create_valid_dense_tensor()),
            _encode_tensor(_create_malicious_sparse_tensor()),  # the attack
            _encode_tensor(_create_valid_dense_tensor()),
        ]

        # Even with valid neighbours, the malicious entry must fail the batch.
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(batch)


# Pytest fixtures
@pytest.fixture
def model_config():
    """Minimal ModelConfig with prompt embeddings enabled for these tests."""
    from vllm.config import ModelConfig

    opt_125m = "facebook/opt-125m"
    return ModelConfig(
        model=opt_125m,
        tokenizer=opt_125m,
        tokenizer_mode="auto",
        trust_remote_code=False,
        dtype="float32",
        seed=0,
        enable_prompt_embeds=True,  # required by the prompt-embeds tests
    )

+ 1
- 1
tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py View File

@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager

SIMPLE_ARGS_DICT = {
"action": "create",


+ 1
- 1
tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py View File

@@ -6,8 +6,8 @@ import json
import pytest

from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser

from ....utils import RemoteOpenAIServer



+ 1
- 1
tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py View File

@@ -12,7 +12,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tool_parsers import ToolParser, ToolParserManager


def make_tool_call(name, arguments):


+ 1
- 1
tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py View File

@@ -6,8 +6,8 @@ from unittest.mock import MagicMock, patch
import pytest

from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
from vllm.entrypoints.openai.tool_parsers.llama_tool_parser import Llama3JsonToolParser
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser


@pytest.fixture


+ 1
- 1
tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py View File

@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager

# Test cases similar to pythonic parser but with Llama4 specific format
SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"


+ 1
- 1
tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py View File

@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager

# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"


+ 1
- 1
tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py View File

@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager

# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"


+ 1
- 1
tests/entrypoints/openai/tool_parsers/utils.py View File

@@ -10,8 +10,8 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser


class StreamingToolReconstructor:


+ 107
- 6
tests/entrypoints/test_chat_utils.py View File

@@ -29,7 +29,8 @@ from vllm.multimodal.utils import (
encode_image_base64,
encode_video_base64,
)
from vllm.tokenizers import MistralTokenizer, get_tokenizer
from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.serial_utils import tensor2base64

from ..models.registry import HF_EXAMPLE_MODELS
@@ -796,9 +797,13 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
"content": "<|image_1|>\nWhat's in this image?",
}
]

assert mm_data is not None
assert "image" in mm_data
assert mm_data["image"] is None
assert isinstance(mm_data["image"], list)
assert len(mm_data["image"]) == 1
assert mm_data["image"][0] is None

_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


@@ -825,10 +830,11 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
# Should have audio in mm_data as None (UUID provided)
assert mm_data is not None
assert "audio" in mm_data
assert mm_data["audio"] is None
assert isinstance(mm_data["audio"], list)
assert len(mm_data["audio"]) == 1
assert mm_data["audio"][0] is None

# UUID should be recorded
assert mm_uuids is not None
assert "audio" in mm_uuids
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[uuid])


@@ -1121,10 +1127,105 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
mm_data = await mm_future
assert mm_data is not None
assert "image" in mm_data
assert mm_data["image"] is None
assert isinstance(mm_data["image"], list)
assert len(mm_data["image"]) == 1
assert mm_data["image"][0] is None

_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


def test_parse_chat_messages_empty_dict_image_embeds(
phi3v_model_config_image_embeds,
):
"""Test that empty dictionary for image_embeds is handled without errors."""
conversation, mm_data, mm_uuids = parse_chat_messages(
[
{
"role": "user",
"content": [
{"type": "image_embeds", "image_embeds": {}},
{"type": "text", "text": "What's in this image?"},
],
}
],
phi3v_model_config_image_embeds,
content_format="string",
)

# Verify conversation structure
assert conversation == [
{
"role": "user",
"content": "<|image_1|>\nWhat's in this image?",
}
]

# Verify mm_data contains an empty dictionary of embeddings
assert mm_data is not None
assert "image" in mm_data
assert isinstance(mm_data["image"], dict)
assert len(mm_data["image"]) == 0

# Verify UUIDs (None since we didn't provide any)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_multiple_dict_image_embeds(
phi3v_model_config_image_embeds,
):
"""Test that multiple dictionaries for image_embeds is handled without errors."""
# Create two sample image embedding tensors
batch_size = 2
image_embedding_1 = torch.randn(batch_size, 256, 1024)
image_embedding_2 = torch.randn(batch_size, 3)

conversation, mm_data, mm_uuids = parse_chat_messages(
[
{
"role": "user",
"content": [
{
"type": "image_embeds",
"image_embeds": {
"image_embedding_1": tensor2base64(p),
"image_embedding_2": tensor2base64(i),
},
}
for p, i in zip(image_embedding_1, image_embedding_2)
]
+ [
{"type": "text", "text": "Describe these two images."},
],
}
],
phi3v_model_config_image_embeds,
content_format="string",
)

# Verify conversation structure
assert conversation == [
{
"role": "user",
"content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
}
]

# Verify mm_data contains a dictionary of multi-embeddings
assert mm_data is not None
assert "image" in mm_data
assert isinstance(mm_data["image"], dict)
assert len(mm_data["image"]) == batch_size

# Verify each embedding has the correct shape
assert isinstance(mm_data["image"]["image_embedding_1"], torch.Tensor)
assert mm_data["image"]["image_embedding_1"].shape == image_embedding_1.shape
assert isinstance(mm_data["image"]["image_embedding_2"], torch.Tensor)
assert mm_data["image"]["image_embedding_2"].shape == image_embedding_2.shape

# Verify UUIDs (None since we didn't provide any)
_assert_mm_uuids(mm_uuids, batch_size, expected_uuids=[None, None])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
phi3v_model_config,


+ 2
- 2
tests/kernels/attention/test_cutlass_mla_decode.py View File

@@ -32,8 +32,8 @@ def cal_diff(


CUTLASS_MLA_UNSUPPORTED_REASON = (
"Cutlass MLA Requires compute capability of 10 or above."
if not current_platform.is_device_capability(100)
"Cutlass MLA Requires compute capability of 100 or above."
if not current_platform.is_device_capability_family(100)
else "Cutlass MLA is supported"
)



+ 2
- 2
tests/kernels/attention/test_flashinfer_trtllm_attention.py View File

@@ -11,7 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up

if not current_platform.is_device_capability(100):
if not current_platform.is_device_capability_family(100):
pytest.skip(
"This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True
)
@@ -443,7 +443,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])

if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
rtol, atol = 1e-1, 2e-1
rtol, atol = 3e-1, 4e-1
elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
rtol, atol = 4e-2, 6e-2
elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:


+ 27
- 0
tests/kernels/attention/test_triton_unified_attention.py View File

@@ -7,6 +7,7 @@ import torch

from vllm.attention.ops.triton_unified_attention import unified_attention
from vllm.platforms import current_platform
from vllm.utils.math_utils import next_power_of_2

NUM_HEADS = [(4, 4), (8, 2)]
HEAD_SIZES = [128, 256]
@@ -22,6 +23,10 @@ QDTYPES = (
# one value small enough to test the schema op check
NUM_BLOCKS = [32768, 2048]

# 0: use 2D kernel for decode
# 8: use 3D kernel for decode
SEQ_THRESHOLD_3D_VALUES = [0, 8]


def ref_paged_attn(
query: torch.Tensor,
@@ -92,6 +97,7 @@ def ref_paged_attn(
@pytest.mark.parametrize("soft_cap", [None, 50.0])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("q_dtype", QDTYPES)
@pytest.mark.parametrize("seq_threshold_3D", SEQ_THRESHOLD_3D_VALUES)
@torch.inference_mode()
def test_triton_unified_attn(
seq_lens: list[tuple[int, int]],
@@ -103,6 +109,7 @@ def test_triton_unified_attn(
soft_cap: float | None,
num_blocks: int,
q_dtype: torch.dtype | None,
seq_threshold_3D: int,
) -> None:
torch.set_default_device("cuda")

@@ -152,6 +159,21 @@ def test_triton_unified_attn(
k_descale = torch.rand(scale_shape, dtype=torch.float32)
v_descale = torch.rand(scale_shape, dtype=torch.float32)

num_par_softmax_segments = 16
head_size_padded = next_power_of_2(head_size)
softmax_segm_output = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments, head_size_padded),
dtype=torch.float32,
)
softmax_segm_max = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments),
dtype=torch.float32,
)
softmax_segm_expsum = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments),
dtype=torch.float32,
)

unified_attention(
q=maybe_quantized_query,
k=maybe_quantized_key_cache,
@@ -169,6 +191,11 @@ def test_triton_unified_attn(
q_descale=q_descale,
k_descale=k_descale,
v_descale=v_descale,
seq_threshold_3D=seq_threshold_3D,
num_par_softmax_segments=num_par_softmax_segments,
softmax_segm_output=softmax_segm_output,
softmax_segm_max=softmax_segm_max,
softmax_segm_expsum=softmax_segm_expsum,
)

ref_output = ref_paged_attn(


+ 2
- 1
tests/kernels/moe/modular_kernel_tools/common.py View File

@@ -594,7 +594,8 @@ def make_modular_kernel(
)

modular_kernel = mk.FusedMoEModularKernel(
prepare_finalize=prepare_finalize, fused_experts=fused_experts
prepare_finalize=prepare_finalize,
fused_experts=fused_experts,
)

return modular_kernel


+ 14
- 0
tests/kernels/moe/test_flashinfer.py View File

@@ -5,6 +5,7 @@ from dataclasses import dataclass
import pytest
import torch

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
@@ -107,6 +108,19 @@ class TestData:
layer.w2_input_scale = a2_scale
layer.w13_weight_scale = w13_weight_scale
layer.w2_weight_scale = w2_weight_scale
# Setup dummy config.
layer.moe_parallel_config = mk.FusedMoEParallelConfig(
tp_size=1,
pcp_size=1,
dp_size=1,
ep_size=1,
tp_rank=1,
pcp_rank=1,
dp_rank=1,
ep_rank=1,
use_ep=False,
all2all_backend="naive",
)

register_moe_scaling_factors(layer)



+ 2
- 2
tests/kernels/moe/test_ocp_mx_moe.py View File

@@ -17,7 +17,7 @@ QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
) >= version.parse("0.8.99")

TRTLLM_GEN_MXFP4_AVAILABLE = (
current_platform.is_cuda() and current_platform.is_device_capability(100)
current_platform.is_cuda() and current_platform.is_device_capability_family(100)
)

HOPPER_MXFP4_BF16_AVAILABLE = (
@@ -799,7 +799,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
@pytest.mark.skipif(
not (
current_platform.is_cuda()
and current_platform.is_device_capability(100)
and current_platform.is_device_capability_family(100)
and has_flashinfer()
),
reason="NVIDIA GPU sm100 and flashinfer are required for this test",


+ 3
- 3
tests/kernels/quantization/test_awq.py View File

@@ -41,9 +41,9 @@ def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
qweight = torch.randint(
-2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32
)
scales = torch.randint(
scales = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
qzeros = torch.randint(
-2000000000, 2000000000, (64, 256), device="cuda", dtype=torch.int32
)
qzeros = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
split_k_iters = 8
opcheck(torch.ops._C.awq_gemm, (input, qweight, qzeros, scales, split_k_iters))
opcheck(torch.ops._C.awq_gemm, (input, qweight, scales, qzeros, split_k_iters))

+ 2
- 2
tests/kernels/quantization/test_fp8_quant_group.py View File

@@ -62,7 +62,7 @@ def test_quantfp8_group_functionality(
assert scales_col.stride(1) == batch_size

# Test column-major scales consistency
assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
torch.testing.assert_close(scales_col, scales_native, rtol=1e-9, atol=1e-8)

# 3. Test CUDA implementation (only for divisible dimensions)
if is_divisible:
@@ -71,7 +71,7 @@ def test_quantfp8_group_functionality(
assert scales_cuda.shape == (batch_size, expected_num_groups)

# Verify CUDA/native consistency
assert torch.allclose(scales_cuda, scales_native, rtol=1e-9, atol=1e-8)
torch.testing.assert_close(scales_cuda, scales_native, rtol=2e-7, atol=2e-8)

# Quantized values should mostly match
diff_count = (x_quant_cuda != x_quant_native).sum().item()


+ 91
- 0
tests/kernels/quantization/test_scaled_mm_kernel_selection.py View File

@@ -0,0 +1,91 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for ScaledMM kernel selection logic (CPU-only)

Run `pytest tests/kernels/quantization/test_scaled_mm_kernel_selection.py`.
"""

import inspect
from abc import ABC

import pytest

from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
ScaledMMLinearLayerConfig,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import (
AiterScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import (
CPUScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
ScaledMMLinearKernel,
)

pytestmark = pytest.mark.cpu_test


def test_is_supported_is_abstract():
"""Test that is_supported() is properly defined as abstract."""
assert issubclass(ScaledMMLinearKernel, ABC)
assert hasattr(ScaledMMLinearKernel, "is_supported")


def test_cpu_kernel_implements_is_supported():
"""Test that CPUScaledMMLinearKernel implements is_supported() method."""
assert hasattr(CPUScaledMMLinearKernel, "is_supported"), (
"CPUScaledMMLinearKernel missing is_supported() method"
)
# Verify it's a classmethod by checking if it can be called with the class
# and by checking the method type
assert inspect.ismethod(CPUScaledMMLinearKernel.is_supported) or inspect.isfunction(
CPUScaledMMLinearKernel.is_supported
), "CPUScaledMMLinearKernel.is_supported() should be a classmethod"
# Verify it can be called as a classmethod
result, reason = CPUScaledMMLinearKernel.is_supported()
assert isinstance(result, bool), "is_supported() should return a bool"
assert reason is None or isinstance(reason, str), "reason should be str or None"


def test_aiter_kernel_implements_is_supported():
"""Test that AiterScaledMMLinearKernel implements is_supported() method."""
assert hasattr(AiterScaledMMLinearKernel, "is_supported"), (
"AiterScaledMMLinearKernel missing is_supported() method"
)
# Verify it's a classmethod by checking if it can be called with the class
# and by checking the method type
assert inspect.ismethod(
AiterScaledMMLinearKernel.is_supported
) or inspect.isfunction(AiterScaledMMLinearKernel.is_supported), (
"AiterScaledMMLinearKernel.is_supported() should be a classmethod"
)
# Verify it can be called as a classmethod
# (will return False on CPU, which is expected)
result, reason = AiterScaledMMLinearKernel.is_supported()
assert isinstance(result, bool), "is_supported() should return a bool"
assert reason is None or isinstance(reason, str), "reason should be str or None"
# On CPU, it should return False with a reason about requiring ROCm
# This validates the method works correctly even on non-ROCm platforms


def test_cpu_kernel_accepts_all_configs():
"""Test that CPUScaledMMLinearKernel accepts all config combinations."""
configs = [
ScaledMMLinearLayerConfig(
is_channelwise=False,
is_static_input_scheme=True,
input_symmetric=True,
),
ScaledMMLinearLayerConfig(
is_channelwise=True,
is_static_input_scheme=False,
input_symmetric=False,
),
]

for config in configs:
can_impl, reason = CPUScaledMMLinearKernel.can_implement(config)
assert can_impl, (
f"CPUScaledMMLinearKernel should accept config {config}: {reason}"
)

+ 1
- 0
tests/models/fixtures/audioflamingo3/expected_results_batched.json View File

@@ -0,0 +1 @@
{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other.", "(B) To indicate that language cannot express clearly, satirizing the inversion of black and white in the world"], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645], [5349, 8, 2014, 13216, 429, 4128, 4157, 3158, 9355, 11, 7578, 404, 4849, 279, 46488, 315, 3691, 323, 4158, 304, 279, 1879, 151645, 151671]]}

+ 1
- 0
tests/models/fixtures/audioflamingo3/expected_results_single.json View File

@@ -0,0 +1 @@
{"transcriptions": ["The content of the input audio is 'you can ask why over and over and over again forever even if one day we explain every physical interaction and scientific law and hope and dream and regret with a single elegant equation'."], "token_ids": [[785, 2213, 315, 279, 1946, 7699, 374, 364, 9330, 646, 2548, 3170, 916, 323, 916, 323, 916, 1549, 15683, 1496, 421, 825, 1899, 582, 10339, 1449, 6961, 16230, 323, 12344, 2329, 323, 3900, 323, 7904, 323, 22231, 448, 264, 3175, 25777, 23606, 4427, 151645]]}

+ 3
- 3
tests/models/language/generation/test_mistral.py View File

@@ -5,12 +5,12 @@ import json

import pytest

from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
from vllm.sampling_params import SamplingParams
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers.mistral_tool_parser import (
MistralToolCall,
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer

from ...utils import check_logprobs_close



+ 31
- 0
tests/models/language/pooling/test_token_classification.py View File

@@ -68,3 +68,34 @@ def test_modernbert_models(
hf_output = torch.tensor(hf_output).cpu().float()
vllm_output = torch.tensor(vllm_output).cpu().float()
assert torch.allclose(hf_output, vllm_output, atol=1e-2)


@pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
@pytest.mark.parametrize("dtype", ["float"])
@torch.inference_mode
def test_auto_conversion(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
) -> None:
with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts)

with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
) as hf_model:
tokenizer = hf_model.tokenizer
hf_outputs = []
for prompt in example_prompts:
inputs = tokenizer([prompt], return_tensors="pt")
inputs = hf_model.wrap_device(inputs)
output = hf_model.model(**inputs)
hf_outputs.append(softmax(output.logits[0]))

# check logits difference
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output).cpu().float()
vllm_output = torch.tensor(vllm_output).cpu().float()
assert torch.allclose(hf_output, vllm_output, atol=1e-2)

+ 142
- 0
tests/models/multimodal/generation/test_audioflamingo3.py View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os

import pytest

from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams

MODEL_NAME = "nvidia/audio-flamingo-3-hf"


def get_fixture_path(filename):
return os.path.join(
os.path.dirname(__file__), "../../fixtures/audioflamingo3", filename
)


@pytest.fixture(scope="module")
def llm():
# Check if the model is supported by the current transformers version
model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
model_info.check_transformers_version(on_fail="skip")

try:
llm = LLM(
model=MODEL_NAME,
trust_remote_code=True,
dtype="bfloat16",
enforce_eager=True,
limit_mm_per_prompt={"audio": 1},
)
return llm
except Exception as e:
pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")


def test_single_generation(llm):
fixture_path = get_fixture_path("expected_results_single.json")
if not os.path.exists(fixture_path):
pytest.skip(f"Fixture not found: {fixture_path}")

with open(fixture_path) as f:
expected = json.load(f)

audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"

messages = [
{
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": audio_url}},
{"type": "text", "text": "Transcribe the input speech."},
],
}
]

sampling_params = SamplingParams(temperature=0.0, max_tokens=128)

outputs = llm.chat(
messages=messages,
sampling_params=sampling_params,
)
generated_text = outputs[0].outputs[0].text.strip()

expected_text = expected["transcriptions"][0]

assert expected_text in generated_text or generated_text in expected_text


def test_batched_generation(llm):
fixture_path = get_fixture_path("expected_results_batched.json")
if not os.path.exists(fixture_path):
pytest.skip(f"Fixture not found: {fixture_path}")

with open(fixture_path) as f:
expected = json.load(f)

items = [
{
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
"question": "What is surprising about the relationship "
"between the barking and the music?",
"expected_idx": 0,
},
{
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
"question": (
"Why is the philosopher's name mentioned in the lyrics? "
"(A) To express a sense of nostalgia "
"(B) To indicate that language cannot express clearly, "
"satirizing the inversion of black and white in the world "
"(C) To add depth and complexity to the lyrics "
"(D) To showcase the wisdom and influence of the philosopher"
),
"expected_idx": 1,
},
]

conversations = []
for item in items:
messages = [
{
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
{"type": "text", "text": item["question"]},
],
}
]
conversations.append(messages)

sampling_params = SamplingParams(temperature=0.0, max_tokens=128)

outputs = llm.chat(
messages=conversations,
sampling_params=sampling_params,
)

for i, output in enumerate(outputs):
generated_text = output.outputs[0].text.strip()
expected_text = expected["transcriptions"][i]

assert expected_text in generated_text or generated_text in expected_text

+ 434
- 0
tests/models/multimodal/generation/test_vit_backend_functionality.py View File

@@ -0,0 +1,434 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Consolidated test for ViT attention backend functionality across multiple models.

This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend.
"""

from dataclasses import asdict
from typing import Any

import pytest
from transformers import AutoProcessor

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform

from ....utils import create_new_process_for_each_test
from ...utils import dummy_hf_overrides

# Dots.OCR prompt from official repository
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
# ruff: noqa: E501
DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

1. Bbox format: [x1, y1, x2, y2]

2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].

3. Text Extraction & Formatting Rules:
- Picture: For the 'Picture' category, the text field should be omitted.
- Formula: Format its text as LaTeX.
- Table: Format its text as HTML.
- All Others (Text, Title, etc.): Format their text as Markdown.

4. Constraints:
- The output text must be the original text from the image, with no translation.
- All layout elements must be sorted according to human reading order.

5. Final Output: The entire output must be a single JSON object.
"""

VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"


# Model configurations
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"dots_ocr": {
"model_name": "rednote-hilab/dots.ocr",
"interface": "llm_chat",
"max_model_len": 32768,
"max_num_seqs": 1,
"limit_mm_per_prompt": {"image": 1},
"sampling_params": {
"temperature": 0.1,
"max_tokens": 16384,
"top_p": 0.9,
"stop_token_ids": None,
},
"use_specific_image": "stop_sign",
"prompt_builder": "build_dots_ocr_prompt",
"output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
},
"ernie45_vl": {
"model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
"interface": "llm_generate",
"max_model_len": 16384,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"use_processor": True,
"question": "What is the content of each image?",
},
"glm4_1v": {
"model_name": "zai-org/GLM-4.1V-9B-Thinking",
"interface": "llm_generate",
"max_model_len": 32768,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"use_processor": True,
"question": "What is the content of each image?",
},
"keye_vl": {
"model_name": "Kwai-Keye/Keye-VL-8B-Preview",
"interface": "llm_generate",
"max_model_len": 8192,
"max_num_seqs": 5,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"supported_backends": {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
},
"use_processor": True,
"question": "What is the content of each image?",
},
"ovis2_5": {
"model_name": "AIDC-AI/Ovis2.5-2B",
"interface": "llm_generate",
"max_model_len": 8192,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"prompt_builder": "build_ovis_prompt",
"question": "What is the content of each image?",
},
"qwen2_5_vl": {
"model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
"interface": "vllm_runner",
"media_type": "video",
"max_model_len": 4000,
"max_num_seqs": 1,
"limit_mm_per_prompt": {"video": 1},
"sampling_params": {
"max_tokens": 128,
},
"runner_kwargs": {
"runner": "generate",
"dtype": "bfloat16",
},
"video_params": {
"num_frames": 16,
"pruning_rates": [0.0, 0.75],
},
},
"qwen2_5_omni": {
"model_name": "Qwen/Qwen2.5-Omni-3B",
"interface": "llm_generate",
"max_model_len": 32768,
"max_num_seqs": 2,
"limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
"sampling_params": {
"temperature": 0.6,
"top_p": 0.95,
"top_k": 20,
"max_tokens": 16384,
},
"use_processor": True,
"question": "What is the content of each image?",
},
"qwen3_omni": {
"model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"interface": "llm_generate",
"max_model_len": 32768,
"max_num_seqs": 2,
"limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
"sampling_params": {
"temperature": 0.6,
"top_p": 0.95,
"top_k": 20,
"max_tokens": 16384,
},
"use_processor": True,
"question": "What is the content of each image?",
},
}


# Prompt builder functions
def build_dots_ocr_prompt(images, config):
"""Build Dots.OCR specific prompt with OCR instructions."""
# Use only stop_sign image for Dots.OCR
image = images[0] # Already filtered to stop_sign

image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"

placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
messages = [
{
"role": "user",
"content": [
*placeholders,
{
"type": "text",
"text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
},
],
},
]

return messages


def build_processor_prompt(images, config):
"""Build prompt using AutoProcessor.apply_chat_template()."""
processor = AutoProcessor.from_pretrained(
config["model_name"], trust_remote_code=True
)

image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": config["question"]},
],
},
]

return processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)


def build_ovis_prompt(images, config):
"""Build Ovis2.5 specific prompt with custom format."""
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]

placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
)

return (
f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
"<|im_start|>assistant\n"
)


def build_qwen2_5_video_prompt():
"""Build Qwen2.5-VL video prompt with EVS placeholder."""
return (
f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n{VIDEO_PLACEHOLDER}"
"Describe this video with a short sentence (no more than 20 words)"
"<|im_end|><|im_start|>assistant\n"
)


# Handler functions
def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
"""Standard LLM.generate() interface handler."""
images = [asset.pil_image for asset in image_assets]

# Build prompt
if config.get("use_processor"):
prompt = build_processor_prompt(images, config)
else:
prompt_builder_name = config.get("prompt_builder", "build_ovis_prompt")
prompt_builder = globals()[prompt_builder_name]
prompt = prompt_builder(images, config)

# Determine limit_mm_per_prompt
limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})

# Create engine
engine_args = EngineArgs(
model=config["model_name"],
trust_remote_code=True,
max_model_len=config["max_model_len"],
max_num_seqs=config["max_num_seqs"],
limit_mm_per_prompt=limit_mm_per_prompt,
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
)

engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)

# Generate
sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": {"image": images},
},
sampling_params=sampling_params,
)

# Validate
for o in outputs:
generated_text = o.outputs[0].text
validator = config.get("output_validator", lambda x: len(x) > 10)
assert validator(generated_text), (
f"Validation failed for {config['model_name']}: {generated_text}"
)


def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
"""LLM.chat() interface handler for Dots.OCR."""
# Filter to stop_sign image only
stop_sign_image = [
asset.pil_image for asset in image_assets if asset.name == "stop_sign"
][0]

# Build messages
messages = build_dots_ocr_prompt([stop_sign_image], config)

# Create engine
engine_args = EngineArgs(
model=config["model_name"],
trust_remote_code=True,
max_model_len=config["max_model_len"],
max_num_seqs=config["max_num_seqs"],
limit_mm_per_prompt=config["limit_mm_per_prompt"],
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
)

engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)

# Generate using chat
sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.chat(messages=messages, sampling_params=sampling_params)

# Validate
for o in outputs:
generated_text = o.outputs[0].text
validator = config.get("output_validator", lambda x: len(x) > 10)
assert validator(generated_text), (
f"Validation failed for {config['model_name']}: {generated_text}"
)


def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
"""Video test with EVS (Efficient Video Sampling) handler."""
for pruning_rate in config["video_params"]["pruning_rates"]:
num_frames = config["video_params"]["num_frames"]

# Sample frames from video
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]

# Build prompt and prepare video
prompt = build_qwen2_5_video_prompt()
prompts = [prompt]
videos = [sampled_vids[0]]

# Run with vllm_runner context manager
with vllm_runner(
config["model_name"],
max_model_len=config["max_model_len"],
max_num_seqs=config["max_num_seqs"],
limit_mm_per_prompt=config["limit_mm_per_prompt"],
tensor_parallel_size=1,
video_pruning_rate=pruning_rate,
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
**config["runner_kwargs"],
) as vllm_model:
outputs = vllm_model.generate_greedy(
prompts,
config["sampling_params"]["max_tokens"],
videos=videos,
)

# Validate output
assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
output_ids, output_text = outputs[0]
assert len(output_ids) > 0, "Generated no output IDs"
assert len(output_text) > 0, "Generated empty text"
assert isinstance(output_text, str), (
f"Output is not string: {type(output_text)}"
)


# Main test function
@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
@pytest.mark.parametrize(
"mm_encoder_attn_backend",
[None] + current_platform.get_supported_vit_attn_backends(),
)
@create_new_process_for_each_test()
def test_vit_backend_functionality(
model_key: str,
mm_encoder_attn_backend: AttentionBackendEnum | None,
image_assets,
video_assets,
vllm_runner,
request,
):
"""Test ViT attention backend functionality for multimodal models.

This test validates that each model can successfully generate outputs
using different ViT attention backends. The test:
1. Filters unsupported backends per model
2. Applies appropriate GPU marks
3. Routes to the correct test handler based on interface
4. Validates output meets minimum requirements
"""
config = MODEL_CONFIGS[model_key]

# Step 1: Backend filtering
if (
"supported_backends" in config
and mm_encoder_attn_backend is not None
and mm_encoder_attn_backend not in config["supported_backends"]
):
pytest.skip(
f"{model_key} does not support {mm_encoder_attn_backend} backend now."
)

# Step 2: Apply GPU marks dynamically
if "gpu_marks" in config:
for mark in config["gpu_marks"]:
request.applymarker(mark)

# Step 3: Route to appropriate handler
if config.get("media_type") == "video":
run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
elif config["interface"] == "llm_chat":
run_llm_chat_test(config, mm_encoder_attn_backend, image_assets)
elif config["interface"] == "llm_generate":
run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
else:
raise ValueError(f"Unknown interface: {config['interface']}")

+ 1
- 1
tests/models/multimodal/generation/test_voxtral.py View File

@@ -9,7 +9,7 @@ from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage

from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer


+ 123
- 111
tests/models/multimodal/generation/test_whisper.py View File

@@ -1,150 +1,146 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Any

import librosa
import pytest
from transformers import AutoModelForSpeechSeq2Seq

from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.platforms import current_platform

from ....conftest import VllmRunner
from ....conftest import HfRunner, PromptAudioInput, VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close

VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
HF_PROMPT = ""
# Whisper expects 16kHz audio
WHISPER_SAMPLE_RATE = 16000

PROMPTS = [
{
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
},
{ # Test explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "",
"multi_modal_data": {
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
},
]

EXPECTED = {
"openai/whisper-tiny": [
" He has birth words I spoke in the original corner of that. And a"
" little piece of black coat poetry. Mary had a little sandwich,"
" sweet, with white and snow. And everyone had it very went the last"
" would sure to go.",
" >> And the old one, fit John the way to Edgar Martinez. >> One more"
" to line down the field line for our base camp. Here comes joy. Here"
" is June and the third base. They're going to wave him in. The throw"
" to the plate will be late. The Mariners are going to play for the"
" American League Championship. I don't believe it. It just continues"
" by all five.",
],
"openai/whisper-small": [
" The first words I spoke in the original pornograph. A little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite a"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the old one pitch on the way to Edgar Martinez one month. Here"
" comes joy. Here is Junior to third base. They're gonna wave him"
" in. The throw to the plate will be late. The Mariners are going to"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my.",
],
"openai/whisper-medium": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite as"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez swung on the line"
" down the left field line for Obeyshev. Here comes Joy. Here is"
" Jorgen at third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh"
" my.",
],
"openai/whisper-large-v3": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its feet were quite as"
" slow, and everywhere that Mary went, the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
" Now the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my.",
],
"openai/whisper-large-v3-turbo": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its streets were quite"
" as slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
" down the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my.",
],
}

@pytest.fixture(autouse=True)
def use_spawn_for_whisper(monkeypatch):
"""Whisper has issues with forked workers, use spawn instead."""
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")


def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
model: str,
*,
max_model_len: int,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
dtype: str = "half",
enforce_eager: bool = True,
) -> None:
prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10
"""Inference result should be the same between hf and vllm.

All the audio fixtures for the test are from AudioAsset.
For huggingface runner, we provide the audio as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
"""
with vllm_runner(
model,
dtype=dtype,
max_model_len=448,
max_model_len=max_model_len,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
# TODO (NickLucche) figure out output differences with non-eager and re-enable
enforce_eager=True,
limit_mm_per_prompt={"audio": 2},
enforce_eager=enforce_eager,
disable_custom_all_reduce=True,
) as vllm_model:
llm = vllm_model.llm

sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
max_tokens=200,
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(
vllm_prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
)
for vllm_prompts, _, audios in inputs
]

with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(
hf_prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
)
for _, hf_prompts, audios in inputs
]

for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)

outputs = llm.generate(prompt_list, sampling_params)

for output, expected in zip(outputs, expected_list):
print(output.outputs[0].text)
assert output.outputs[0].text == expected
@pytest.fixture
def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
inputs = []
for asset in audio_assets:
audio, orig_sr = asset.audio_and_sample_rate
# Resample to Whisper's expected sample rate (16kHz)
if orig_sr != WHISPER_SAMPLE_RATE:
audio = librosa.resample(
audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
)
# vLLM prompts, HF prompts, audio inputs
inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
return inputs


@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
def test_models(vllm_runner, model, dtype) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=1,
dtype=dtype,
)
def check_model_available(model: str) -> None:
    """Skip the current test when *model* cannot be exercised here.

    Checks both online availability and the installed transformers version,
    skipping (rather than failing) on either condition.
    """
    info = HF_EXAMPLE_MODELS.find_hf_info(model)
    info.check_available_online(on_fail="skip")
    info.check_transformers_version(on_fail="skip")


@pytest.mark.core_model
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
def test_models_cpu(vllm_runner, model, dtype) -> None:
# @create_new_process_for_each_test() does not work for some runners
# TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("enforce_eager", [True, False])
@create_new_process_for_each_test("spawn")
def test_models(
hf_runner,
vllm_runner,
model: str,
dtype: str,
num_logprobs: int,
input_audios,
enforce_eager: bool,
) -> None:
check_model_available(model)
if current_platform.is_cpu() and not enforce_eager:
pytest.skip("Skipping test for CPU with non-eager mode")
run_test(
hf_runner,
vllm_runner,
input_audios,
model,
tensor_parallel_size=1,
dtype=dtype,
max_model_len=448,
max_tokens=200,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
enforce_eager=enforce_eager,
)


@@ -152,15 +148,31 @@ def test_models_cpu(vllm_runner, model, dtype) -> None:
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@create_new_process_for_each_test()
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models_distributed(
hf_runner,
vllm_runner,
model,
distributed_executor_backend,
model: str,
distributed_executor_backend: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
input_audios,
) -> None:
check_model_available(model)
run_test(
hf_runner,
vllm_runner,
input_audios,
model,
dtype=dtype,
max_model_len=448,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=False,
)

+ 125
- 0
tests/models/multimodal/processing/test_audioflamingo3.py View File

@@ -0,0 +1,125 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Copyright 2025 The vLLM team.
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest.mock import MagicMock

import numpy as np
import pytest
import torch
from transformers import PretrainedConfig

from tests.models.registry import HF_EXAMPLE_MODELS


class MockAudioFlamingo3Config(PretrainedConfig):
    """Minimal stand-in for the AudioFlamingo3 HF config."""

    model_type = "audioflamingo3"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Sub-configs the processing code reads; bare defaults are enough.
        self.audio_config = PretrainedConfig()
        self.text_config = PretrainedConfig()


class MockAudioFlamingo3Processor:
    """Minimal stand-in for the AudioFlamingo3 HF processor."""

    def __init__(self):
        self.audio_token = "<sound>"
        self.audio_token_id = 12345
        self.feature_extractor = MockFeatureExtractor()

    def __call__(self, text=None, audios=None, **kwargs):
        # Ignore the inputs and return a fixed minimal processor output.
        features = [np.zeros((3000, 80))]
        return {"input_ids": [1, 2, 3], "input_features": features}


class MockFeatureExtractor:
    """Minimal stand-in exposing Whisper-style feature-extractor settings."""

    def __init__(self):
        # 16 kHz audio processed in 30-second chunks.
        self.sampling_rate = 16000
        self.chunk_length = 30


@pytest.fixture
def mock_ctx():
    # Fixture: a MagicMock standing in for the processing context, wired so
    # that both the config and processor lookups return the mocks above.
    config = MockAudioFlamingo3Config()

    ctx = MagicMock()
    ctx.get_hf_config.return_value = config
    ctx.get_hf_processor.return_value = MockAudioFlamingo3Processor()
    # Some code paths read the config via model_config rather than the getter.
    ctx.model_config.hf_config = config
    return ctx


@pytest.fixture(autouse=True)
def check_transformers_version():
    """Skip every test in this module if transformers is too old for the model."""
    arch = "AudioFlamingo3ForConditionalGeneration"
    model_info = HF_EXAMPLE_MODELS.get_hf_info(arch)
    model_info.check_transformers_version(on_fail="skip")


def test_audio_chunk_counting(mock_ctx):
    """Verify per-audio chunk counts: 30 s fits one chunk, 45 s needs two."""
    from vllm.model_executor.models.audioflamingo3 import (
        AudioFlamingo3DummyInputsBuilder,
        AudioFlamingo3MultiModalProcessor,
        AudioFlamingo3ProcessingInfo,
    )
    from vllm.multimodal.processing import BaseMultiModalProcessor

    info = AudioFlamingo3ProcessingInfo(mock_ctx)
    processor = AudioFlamingo3MultiModalProcessor(
        info, AudioFlamingo3DummyInputsBuilder(info)
    )

    sr = 16000
    mm_data = {"audio": [np.zeros(30 * sr), np.zeros(45 * sr)]}
    prompt = "<|user|>Listen.<|end|>"

    def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
        # Bypass the real HF processor with a minimal plausible output.
        return {"input_ids": [1, 2, 3], "input_features": torch.randn(1, 80, 3000)}

    with pytest.MonkeyPatch.context() as mp:
        mp.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)

        processed = processor._call_hf_processor(prompt, mm_data, {}, {})

        chunk_counts = processed["chunk_counts"]

        assert len(chunk_counts) == 2
        assert chunk_counts[0].item() == 1
        assert chunk_counts[1].item() == 2


def test_dummy_data_generation(mock_ctx):
    """Dummy data should contain one max-length (600 s @ 16 kHz) clip per audio."""
    from vllm.model_executor.models.audioflamingo3 import (
        AudioFlamingo3DummyInputsBuilder,
        AudioFlamingo3ProcessingInfo,
    )

    builder = AudioFlamingo3DummyInputsBuilder(
        AudioFlamingo3ProcessingInfo(mock_ctx)
    )
    dummy_data = builder.get_dummy_mm_data(100, {"audio": 2}, None)

    assert "audio" in dummy_data
    audios = dummy_data["audio"]
    assert len(audios) == 2

    # Longest supported clip: 600 seconds at 16 kHz.
    assert len(audios[0]) == 600 * 16000

+ 2
- 5
tests/models/multimodal/processing/test_common.py View File

@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
cached_tokenizer_from_config,
)
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tokenizers.mistral import MistralTokenizer

from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import (


+ 8
- 0
tests/models/multimodal/processing/test_tensor_schema.py View File

@@ -8,6 +8,7 @@ from typing import Any, TypeAlias

import numpy as np
import pytest
import torch
import torch.nn as nn
from PIL import Image

@@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype

from ....utils import create_new_process_for_each_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import dummy_hf_overrides
from .test_common import get_model_ids_to_test, get_text_token_prompts
@@ -136,6 +138,7 @@ def create_batched_mm_kwargs(
)


# TODO(Isotr0py): Don't initialize model during test
@contextmanager
def initialize_dummy_model(
model_cls: type[nn.Module],
@@ -150,16 +153,21 @@ def initialize_dummy_model(
backend="nccl",
)
initialize_model_parallel(tensor_model_parallel_size=1)

current_device = torch.get_default_device()
vllm_config = VllmConfig(model_config=model_config)
with set_current_vllm_config(vllm_config=vllm_config):
with set_default_torch_dtype(model_config.dtype):
torch.set_default_device(current_platform.device_type)
model = model_cls(vllm_config=vllm_config)
torch.set_default_device(current_device)
yield model

del model
cleanup_dist_env_and_memory()


@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
def test_model_tensor_schema(model_id: str):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)


+ 13
- 7
tests/models/registry.py View File

@@ -356,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
"MistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4"
),
"MixtralForCausalLM": _HfExamplesInfo(
"mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -573,12 +573,17 @@ _AUTOMATIC_CONVERTED_MODELS = {
"Qwen3ForSequenceClassification": _HfExamplesInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
),
"Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
}

_MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only]
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
"BeeForConditionalGeneration": _HfExamplesInfo(
"Open-Bee/Bee-8B-RL",
trust_remote_code=True,
@@ -635,7 +640,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"HunYuanVLForConditionalGeneration": _HfExamplesInfo(
"tencent/HunyuanOCR",
is_available_online=False,
hf_overrides={"num_experts": 0},
),
"Idefics3ForConditionalGeneration": _HfExamplesInfo(
"HuggingFaceM4/Idefics3-8B-Llama3",
@@ -674,8 +679,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
"lightonai/LightOnOCR-1B",
is_available_online=False,
"lightonai/LightOnOCR-1B-1025"
),
"Llama4ForConditionalGeneration": _HfExamplesInfo(
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
@@ -779,8 +783,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"ministral-3": "mistralai/Ministral-3-3B-Instruct-2512",
},
tokenizer_mode="mistral",
# TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
is_available_online=False,
),
"QwenVLForConditionalGeneration": _HfExamplesInfo(
"Qwen/Qwen-VL",
@@ -843,7 +845,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online=False,
),
# [Encoder-decoder]
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),
"WhisperForConditionalGeneration": _HfExamplesInfo(
"openai/whisper-large-v3-turbo",
extras={"v3": "openai/whisper-large-v3"},
),
# [Cross-encoder]
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),
}
@@ -886,6 +891,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EagleMistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512",
speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle",
# TODO: revert once figuring out OOM in CI
is_available_online=False,
),
"LlamaForCausalLMEagle3": _HfExamplesInfo(


+ 134
- 0
tests/multimodal/test_sparse_tensor_validation_unit.py View File

@@ -0,0 +1,134 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for sparse tensor validation.

Simple, fast unit tests that can run without server fixtures.
Run with: pytest tests/multimodal/test_sparse_tensor_validation_unit.py -v
"""

import io

import pytest
import torch


class TestSparseTensorValidationContextManager:
"""Test that torch.sparse.check_sparse_tensor_invariants() works as expected."""

def test_valid_sparse_tensor_passes(self):
"""Valid sparse tensors should pass validation."""
indices = torch.tensor([[0, 1], [0, 1]])
values = torch.tensor([1.0, 2.0])
shape = (2, 2)

with torch.sparse.check_sparse_tensor_invariants():
tensor = torch.sparse_coo_tensor(indices, values, shape)
dense = tensor.to_dense()

assert dense.shape == shape

def test_out_of_bounds_indices_rejected(self):
"""Sparse tensors with out-of-bounds indices should be rejected."""
indices = torch.tensor([[5], [5]]) # Out of bounds for 2x2
values = torch.tensor([1.0])
shape = (2, 2)

with pytest.raises(RuntimeError) as exc_info: # noqa: SIM117
with torch.sparse.check_sparse_tensor_invariants():
tensor = torch.sparse_coo_tensor(indices, values, shape)
tensor.to_dense()

assert (
"index" in str(exc_info.value).lower()
or "bound" in str(exc_info.value).lower()
)

def test_negative_indices_rejected(self):
"""Sparse tensors with negative indices should be rejected."""
indices = torch.tensor([[-1], [0]])
values = torch.tensor([1.0])
shape = (2, 2)

with pytest.raises(RuntimeError): # noqa: SIM117
with torch.sparse.check_sparse_tensor_invariants():
tensor = torch.sparse_coo_tensor(indices, values, shape)
tensor.to_dense()

def test_without_context_manager_allows_invalid(self):
"""
WITHOUT validation, invalid tensors may not immediately error.

This demonstrates the vulnerability: PyTorch 2.8.0+ doesn't validate
by default, which can lead to memory corruption.
"""
indices = torch.tensor([[100], [100]]) # Way out of bounds
values = torch.tensor([1.0])
shape = (2, 2)

# Without validation context, this might create an invalid tensor
# (actual behavior depends on PyTorch version)
tensor = torch.sparse_coo_tensor(indices, values, shape)

# The tensor object is created, but it's invalid
assert tensor.is_sparse


class TestTorchLoadWithValidation:
"""Test torch.load() with sparse tensor validation."""

def test_load_valid_sparse_tensor_with_validation(self):
"""Valid sparse tensors should load successfully with validation."""
# Create and save a valid sparse tensor
indices = torch.tensor([[0, 1], [0, 1]])
values = torch.tensor([1.0, 2.0])
tensor = torch.sparse_coo_tensor(indices, values, (2, 2))

buffer = io.BytesIO()
torch.save(tensor, buffer)
buffer.seek(0)

# Load with validation
with torch.sparse.check_sparse_tensor_invariants():
loaded = torch.load(buffer, weights_only=True)
dense = loaded.to_dense()

assert dense.shape == (2, 2)

def test_load_invalid_sparse_tensor_rejected(self):
"""Invalid sparse tensors should be caught when loaded with validation."""
# Create an invalid sparse tensor (out of bounds)
indices = torch.tensor([[10], [10]])
values = torch.tensor([1.0])
tensor = torch.sparse_coo_tensor(indices, values, (2, 2))

buffer = io.BytesIO()
torch.save(tensor, buffer)
buffer.seek(0)

# Load with validation - should fail on to_dense()
with pytest.raises(RuntimeError): # noqa: SIM117
with torch.sparse.check_sparse_tensor_invariants():
loaded = torch.load(buffer, weights_only=True)
loaded.to_dense()

def test_load_dense_tensor_unaffected(self):
"""Dense tensors should work normally with the validation context."""
# Create and save a dense tensor
tensor = torch.randn(10, 20)

buffer = io.BytesIO()
torch.save(tensor, buffer)
buffer.seek(0)

# Load with validation (should have no effect on dense tensors)
with torch.sparse.check_sparse_tensor_invariants():
loaded = torch.load(buffer, weights_only=True)

assert loaded.shape == (10, 20)
assert not loaded.is_sparse


if __name__ == "__main__":
    # Allow running this file directly (outside the test suite) for quick testing
    pytest.main([__file__, "-v", "--tb=short"])

+ 2
- 2
tests/quantization/test_blackwell_moe.py View File

@@ -10,9 +10,9 @@ import pytest
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform

if not current_platform.is_device_capability(100):
if not current_platform.is_device_capability_family(100):
pytest.skip(
"This test only runs on Blackwell GPUs (SM100).", allow_module_level=True
"This test only runs on Blackwell GPUs (SM10x).", allow_module_level=True
)




+ 1
- 1
tests/reasoning/test_mistral_reasoning_parser.py View File

@@ -5,7 +5,7 @@ import pytest

from tests.reasoning.utils import run_reasoning_extraction_mistral
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer

parser_name = "mistral"



+ 1
- 1
tests/reasoning/utils.py View File

@@ -4,7 +4,7 @@

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.reasoning import ReasoningParser
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer


class StreamingReasoningReconstructor:


+ 9
- 2
tests/test_inputs.py View File

@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config

pytestmark = pytest.mark.cpu_test

@@ -34,6 +34,13 @@ INPUTS_SLICES = [
]


# Test that a nested mixed-type list of lists raises a TypeError.
@pytest.mark.parametrize("invalid_input", [[[1, 2], ["foo", "bar"]]])
def test_invalid_input_raise_type_error(invalid_input):
with pytest.raises(TypeError):
parse_raw_prompts(invalid_input)


def test_parse_raw_single_batch_empty():
with pytest.raises(ValueError, match="at least one prompt"):
parse_raw_prompts([])
@@ -108,7 +115,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_config(model_config)
tokenizer = cached_tokenizer_from_config(model_config)
input_preprocessor = InputPreprocessor(model_config, tokenizer)

# HF processor adds sep token


+ 24
- 23
tests/tokenizers_/test_basic.py View File

@@ -3,38 +3,39 @@
from typing import _get_protocol_attrs # type: ignore

import pytest
from transformers import PreTrainedTokenizerBase
from transformers import (
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)

from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer


def _get_missing_attrs(obj: object, target: type):
return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)]


def _assert_tokenizer_like(tokenizer: object):
missing_attrs = _get_missing_attrs(tokenizer, TokenizerLike)
assert not missing_attrs, f"Missing attrs: {missing_attrs}"


def test_tokenizer_like_protocol():
assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer("gpt2", use_fast=False),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"

assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer("gpt2", use_fast=True),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"

assert not (
missing_attrs := _get_missing_attrs(
get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
),
TokenizerLike,
)
), f"Missing attrs: {missing_attrs}"
tokenizer = get_tokenizer("gpt2", use_fast=False)
assert isinstance(tokenizer, PreTrainedTokenizer)
_assert_tokenizer_like(tokenizer)

tokenizer = get_tokenizer("gpt2", use_fast=True)
assert isinstance(tokenizer, PreTrainedTokenizerFast)
_assert_tokenizer_like(tokenizer)

tokenizer = get_tokenizer(
"mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
)
assert isinstance(tokenizer, MistralTokenizer)
_assert_tokenizer_like(tokenizer)


@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])


+ 1
- 1
tests/tokenizers_/test_detokenize.py View File

@@ -8,7 +8,7 @@ import pytest
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import (
FastIncrementalDetokenizer,


+ 21
- 2
tests/tokenizers_/test_registry.py View File

@@ -2,7 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer
import pytest

from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.registry import (
TokenizerRegistry,
get_tokenizer,
resolve_tokenizer_args,
)


class TestTokenizer(TokenizerLike):
@@ -40,10 +47,22 @@ class TestTokenizer(TokenizerLike):
return True


@pytest.mark.parametrize("runner_type", ["generate", "pooling"])
def test_resolve_tokenizer_args_idempotent(runner_type):
tokenizer_mode, tokenizer_name, args, kwargs = resolve_tokenizer_args(
"facebook/opt-125m",
runner_type=runner_type,
)

assert (tokenizer_mode, tokenizer_name, args, kwargs) == resolve_tokenizer_args(
tokenizer_name, *args, **kwargs
)


def test_customized_tokenizer():
TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)

tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
tokenizer = TokenizerRegistry.load_tokenizer("test_tokenizer", "abc")
assert isinstance(tokenizer, TestTokenizer)
assert tokenizer.path_or_repo_id == "abc"
assert tokenizer.bos_token_id == 0


+ 0
- 0
tests/tool_parsers/__init__.py View File


tests/tool_use/test_deepseekv31_tool_parser.py → tests/tool_parsers/test_deepseekv31_tool_parser.py View File

@@ -3,10 +3,10 @@

import pytest

from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import (
from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers.deepseekv31_tool_parser import (
DeepSeekV31ToolParser,
)
from vllm.tokenizers import get_tokenizer

MODEL = "deepseek-ai/DeepSeek-V3.1"


tests/tool_use/test_ernie45_moe_tool_parser.py → tests/tool_parsers/test_ernie45_moe_tool_parser.py View File

@@ -13,9 +13,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
from vllm.tool_parsers.ernie45_tool_parser import Ernie45ToolParser

# Use a common model that is likely to be available
MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking"

tests/tool_use/test_glm4_moe_tool_parser.py → tests/tool_parsers/test_glm4_moe_tool_parser.py View File

@@ -7,12 +7,10 @@ import json
import pytest

from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import (
from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers.glm4_moe_tool_parser import (
Glm4MoeModelToolParser,
)
from vllm.tokenizers import get_tokenizer

pytestmark = pytest.mark.cpu_test

pytest.skip("skip glm4_moe parser test", allow_module_level=True)
# Use a common model that is likely to be available

tests/tool_use/test_jamba_tool_parser.py → tests/tool_parsers/test_jamba_tool_parser.py View File

@@ -9,11 +9,9 @@ import pytest
from partial_json_parser.core.options import Allow

from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.jamba_tool_parser import JambaToolParser

MODEL = "ai21labs/Jamba-tiny-dev"


tests/tool_use/test_kimi_k2_tool_parser.py → tests/tool_parsers/test_kimi_k2_tool_parser.py View File

@@ -7,10 +7,8 @@ import json
import pytest

from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser
from vllm.tokenizers import get_tokenizer

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser

# Use a common model that is likely to be available
MODEL = "moonshotai/Kimi-K2-Instruct"

tests/tool_use/test_minimax_tool_parser.py → tests/tool_parsers/test_minimax_tool_parser.py View File

@@ -12,10 +12,8 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.minimax_tool_parser import MinimaxToolParser
from vllm.tokenizers import get_tokenizer

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.minimax_tool_parser import MinimaxToolParser

# Use a common model that is likely to be available
MODEL = "MiniMaxAi/MiniMax-M1-40k"

tests/tool_use/test_mistral_tool_parser.py → tests/tool_parsers/test_mistral_tool_parser.py View File

@@ -12,13 +12,10 @@ from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall
from partial_json_parser.core.options import Allow

from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolParser
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
get_tokenizer,
)
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers.mistral_tool_parser import MistralToolParser


@pytest.fixture(scope="module")

tests/tool_use/test_openai_tool_parser.py → tests/tool_parsers/test_openai_tool_parser.py View File

@@ -15,8 +15,8 @@ from openai_harmony import (
)

from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.openai_tool_parser import OpenAIToolParser
from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers.openai_tool_parser import OpenAIToolParser

MODEL = "gpt2"


tests/tool_use/test_qwen3coder_tool_parser.py → tests/tool_parsers/test_qwen3coder_tool_parser.py View File

@@ -13,14 +13,12 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import (
Qwen3CoderToolParser,
)
from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.qwen3coder_tool_parser import (
Qwen3CoderToolParser,
)
from vllm.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser

MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"


tests/tool_use/test_seed_oss_tool_parser.py → tests/tool_parsers/test_seed_oss_tool_parser.py View File

@@ -14,11 +14,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.seed_oss_tool_parser import SeedOssToolParser

# Use a common model that is likely to be available
MODEL = "ByteDance-Seed/Seed-OSS-36B-Instruct"

tests/tool_use/test_xlam_tool_parser.py → tests/tool_parsers/test_xlam_tool_parser.py View File

@@ -12,11 +12,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.xlam_tool_parser import xLAMToolParser

# Use a common model that is likely to be available
MODEL = "Salesforce/Llama-xLAM-2-8B-fc-r"

+ 1
- 1
tests/tool_use/test_tool_choice_required.py View File

@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
ChatCompletionToolsParam,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
from vllm.tool_parsers.utils import get_json_schema_from_tools

pytestmark = pytest.mark.cpu_test



+ 5
- 0
tests/v1/entrypoints/conftest.py View File

@@ -76,6 +76,8 @@ def sample_json_schema():
},
"required": ["name", "age", "skills", "grade", "email", "work_history"],
"additionalProperties": False,
"minProperties": 1,
"maxProperties": 10,
}


@@ -96,6 +98,9 @@ def unsupported_json_schema():
},
"required": ["score", "tags"],
"additionalProperties": False,
"patternProperties": {
"^score$": {"type": "integer"},
},
}




+ 6
- 6
tests/v1/kv_connector/unit/test_nixl_connector.py View File

@@ -461,7 +461,7 @@ class TestNixlHandshake:
metadata = NixlConnectorMetadata()
if num_xfers > 0:
num_xfers -= 1
metadata.add_new_req(
metadata.add_new_req_to_recv(
request_id=request_id,
local_block_ids=[num_xfers + 1, num_xfers + 2, num_xfers + 3],
kv_transfer_params={
@@ -532,7 +532,7 @@ class TestNixlHandshake:
vllm_config, connector.engine_id
)
metadata = NixlConnectorMetadata()
metadata.add_new_req(
metadata.add_new_req_to_recv(
request_id="id",
local_block_ids=[1, 2, 3],
kv_transfer_params={
@@ -588,7 +588,7 @@ class TestNixlHandshake:
metadata = NixlConnectorMetadata()
total_reqs = 5
for i in range(total_reqs):
metadata.add_new_req(
metadata.add_new_req_to_recv(
request_id=f"id_{i}",
local_block_ids=[1, 2, 3],
kv_transfer_params={
@@ -752,7 +752,7 @@ def test_kv_connector_stats(dist_init):
# Create transfer metadata
request_id = "test_req_for_stats"
metadata = NixlConnectorMetadata()
metadata.add_new_req(
metadata.add_new_req_to_recv(
request_id=request_id,
local_block_ids=[1, 2, 3],
kv_transfer_params={
@@ -1515,7 +1515,7 @@ def test_handshake_failure_returns_finished(dist_init):

request_id = "test_handshake_fail"
metadata = NixlConnectorMetadata()
metadata.add_new_req(
metadata.add_new_req_to_recv(
request_id=request_id,
local_block_ids=[1, 2, 3],
kv_transfer_params={
@@ -1565,7 +1565,7 @@ def test_transfer_setup_failure_returns_finished(dist_init):

request_id = "test_transfer_fail"
metadata = NixlConnectorMetadata()
metadata.add_new_req(
metadata.add_new_req_to_recv(
request_id=request_id,
local_block_ids=[7, 8, 9],
kv_transfer_params={


+ 10
- 12
tests/v1/kv_offload/test_cpu_gpu.py View File

@@ -9,7 +9,7 @@ import torch
from vllm.platforms import current_platform
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers

BACKENDS_TO_TEST = [FlashAttentionBackend]

@@ -82,7 +82,7 @@ def test_transfer(

# create handler
cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size
handler = CpuGpuOffloadingHandler(
handlers = CpuGpuOffloadingHandlers(
attn_backends=attn_backends,
gpu_block_size=gpu_block_size,
cpu_block_size=cpu_block_size,
@@ -112,8 +112,7 @@ def test_transfer(

# set transfer direction
if gpu_to_cpu:
src_kv_caches = handler.gpu_tensors
dst_kv_caches = handler.cpu_tensors
handler = handlers.gpu_to_cpu_handler
src_spec_class = GPULoadStoreSpec
dst_spec_class = CPULoadStoreSpec
src_blocks = gpu_blocks
@@ -122,8 +121,7 @@ def test_transfer(
dst_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
dst_size_in_gpu_blocks = num_cpu_blocks * gpu_blocks_per_cpu_block
else:
src_kv_caches = handler.cpu_tensors
dst_kv_caches = handler.gpu_tensors
handler = handlers.cpu_to_gpu_handler
src_spec_class = CPULoadStoreSpec
dst_spec_class = GPULoadStoreSpec
src_blocks = cpu_blocks
@@ -144,12 +142,12 @@ def test_transfer(
dst_spec = dst_spec_class(dst_blocks)

# clone src and dst tensors before transfer
orig_src_caches = [x.clone() for x in src_kv_caches]
orig_dst_caches = [x.clone() for x in dst_kv_caches]
orig_src_caches = [x.clone() for x in handler.src_tensors]
orig_dst_caches = [x.clone() for x in handler.dst_tensors]

# call transfer function
assert handler.transfer_async(1, (src_spec, dst_spec))
assert set(handler.transfer_events.keys()) == {1}
assert set({x[0] for x in handler._transfers}) == {1}

# wait for transfer to complete
end_time = time.time() + 10
@@ -161,15 +159,15 @@ def test_transfer(
time.sleep(0.1)

# verify src tensors did not change
for orig_tensor, tensor in zip(orig_src_caches, src_kv_caches):
for orig_tensor, tensor in zip(orig_src_caches, handler.src_tensors):
assert torch.equal(orig_tensor, tensor)

# verify dst tensors
for dst_block in range(dst_size_in_gpu_blocks):
src_block_candidate = dst_to_src.get(dst_block)
for src_cache, dst_cache, orig_dst_cache, kv_dim in zip(
src_kv_caches,
dst_kv_caches,
handler.src_tensors,
handler.dst_tensors,
orig_dst_caches,
handler.kv_dim_before_num_blocks,
):


+ 3
- 1
tests/v1/sample/test_logprobs.py View File

@@ -528,9 +528,11 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
),
],
)
@pytest.mark.parametrize("top_logprobs", [0, 3])
def test_spec_decode_logprobs(
logprobs_mode: LogprobsMode,
model_setup: tuple[str, str, str],
top_logprobs: int,
):
"""Spec decode logprobs should match those of the base model.

@@ -543,7 +545,7 @@ def test_spec_decode_logprobs(

prompt = "Hello world " * 50
sampling_params = SamplingParams(
temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False
)
method, model_name, spec_model_name = model_setup
max_model_len = 256


+ 1
- 1
tests/v1/sample/test_rejection_sampler.py View File

@@ -111,7 +111,7 @@ def create_sampling_metadata(
top_p=top_p,
top_k=top_k,
generators=generators,
max_num_logprobs=0,
max_num_logprobs=None,
no_penalties=no_penalties,
prompt_token_ids=prompt_token_ids,
frequency_penalties=frequency_penalties,


+ 2
- 2
tests/v1/structured_output/test_utils.py View File

@@ -44,8 +44,6 @@ def unsupported_array_schemas():
@pytest.fixture
def unsupported_object_schemas():
return [
{"type": "object", "minProperties": 1},
{"type": "object", "maxProperties": 5},
{"type": "object", "propertyNames": {"pattern": "^[a-z]+$"}},
{"type": "object", "patternProperties": {"^S": {"type": "string"}}},
]
@@ -79,6 +77,8 @@ def supported_schema():
},
},
},
"minProperties": 1,
"maxProperties": 100,
}




+ 1
- 1
tools/pre_commit/mypy.py View File

@@ -43,6 +43,7 @@ FILES = [
"vllm/worker",
"vllm/v1/core",
"vllm/v1/engine",
"vllm/v1/executor",
"vllm/v1/metrics",
"vllm/v1/pool",
"vllm/v1/sample",
@@ -60,7 +61,6 @@ SEPARATE_GROUPS = [
"vllm/model_executor",
# v1 related
"vllm/v1/attention",
"vllm/v1/executor",
"vllm/v1/kv_offload",
"vllm/v1/spec_decode",
"vllm/v1/structured_output",


+ 4
- 4
vllm/_custom_ops.py View File

@@ -498,15 +498,15 @@ def awq_dequantize(
def awq_gemm(
input: torch.Tensor,
qweight: torch.Tensor,
qzeros: torch.Tensor,
scales: torch.Tensor,
qzeros: torch.Tensor,
split_k_iters: int,
) -> torch.Tensor:
if envs.VLLM_USE_TRITON_AWQ:
from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton

return awq_gemm_triton(input, qweight, qzeros, scales, split_k_iters)
return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
return awq_gemm_triton(input, qweight, scales, qzeros, split_k_iters)
return torch.ops._C.awq_gemm(input, qweight, scales, qzeros, split_k_iters)


# gptq
@@ -632,8 +632,8 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
def _awq_gemm_fake(
input: torch.Tensor,
qweight: torch.Tensor,
qzeros: torch.Tensor,
scales: torch.Tensor,
qzeros: torch.Tensor,
split_k_iters: torch.SymInt,
) -> torch.Tensor:
num_in_feats = input.size(0)


+ 15
- 68
vllm/attention/layer.py View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer."""

from collections.abc import Callable
import functools
from typing import cast

import torch
@@ -16,7 +16,9 @@ from vllm.attention.backends.abstract import (
MLAAttentionImpl,
)
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layers.mm_encoder_attention import maybe_get_vit_flash_attn_backend
from vllm.attention.selector import get_attn_backend
from vllm.attention.utils.fa_utils import get_flash_attn_version
from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer
from vllm.config import CacheConfig, get_current_vllm_config
@@ -47,58 +49,9 @@ from vllm.v1.kv_cache_interface import (
SlidingWindowSpec,
)

if current_platform.is_rocm():
from vllm.platforms.rocm import on_gfx9
else:
on_gfx9 = lambda *args, **kwargs: False


FP8_DTYPE = current_platform.fp8_dtype()
logger = init_logger(__name__)


def maybe_get_vit_flash_attn_backend(
attn_backend: AttentionBackendEnum,
attn_backend_override: AttentionBackendEnum | None = None,
) -> tuple[AttentionBackendEnum, Callable | None]:
if current_platform.is_rocm():
if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9():
attn_backend = AttentionBackendEnum.ROCM_AITER_FA
elif (
attn_backend_override is None
and on_gfx9()
and attn_backend == AttentionBackendEnum.FLASH_ATTN
):
pass
else:
return AttentionBackendEnum.TORCH_SDPA, None
elif current_platform.is_cuda():
pass
elif current_platform.is_xpu():
assert attn_backend == AttentionBackendEnum.FLASH_ATTN, (
"XPU platform only supports FLASH_ATTN as vision attention backend."
)
pass
else:
return AttentionBackendEnum.TORCH_SDPA, None

if attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
}:
if attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
from aiter import flash_attn_varlen_func
else:
try:
from vllm.attention.utils.fa_utils import flash_attn_varlen_func
except ImportError:
flash_attn_varlen_func = None
else:
flash_attn_varlen_func = None

return attn_backend, flash_attn_varlen_func


def _init_kv_cache_quant(
layer: nn.Module,
quant_config: QuantizationConfig | None,
@@ -494,29 +447,15 @@ class MultiHeadAttention(nn.Module):
attn_backend_override = None
if multimodal_config is not None:
attn_backend_override = multimodal_config.mm_encoder_attn_backend
backend = get_vit_attn_backend(

self.attn_backend = get_vit_attn_backend(
head_size=head_size,
dtype=dtype,
attn_backend_override=attn_backend_override,
)

self.attn_backend = (
backend
if backend
in {
AttentionBackendEnum.TORCH_SDPA,
AttentionBackendEnum.PALLAS,
AttentionBackendEnum.ROCM_AITER_FA,
AttentionBackendEnum.FLASH_ATTN,
}
else AttentionBackendEnum.TORCH_SDPA
)

self.attn_backend, self._flash_attn_varlen_func = (
maybe_get_vit_flash_attn_backend(
self.attn_backend,
attn_backend_override=attn_backend_override,
)
self._flash_attn_varlen_func = maybe_get_vit_flash_attn_backend(
self.attn_backend,
)

self.is_flash_attn_backend = self.attn_backend in {
@@ -524,6 +463,14 @@ class MultiHeadAttention(nn.Module):
AttentionBackendEnum.ROCM_AITER_FA,
}

self.fa_version = None
if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
self.fa_version = get_flash_attn_version()
assert self._flash_attn_varlen_func is not None
self._flash_attn_varlen_func = functools.partial(
self._flash_attn_varlen_func, fa_version=self.fa_version
)

logger.info_once(
f"Using {self.attn_backend} for MultiHeadAttention in multimodal encoder."
)


+ 284
- 0
vllm/attention/layers/mm_encoder_attention.py View File

@@ -0,0 +1,284 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Callable

import torch

from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.ops.vit_attn_wrappers import (
vit_flash_attn_wrapper,
vit_torch_sdpa_wrapper,
)
from vllm.config import MultiModalConfig
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.models.vision import get_vit_attn_backend

logger = init_logger(__name__)


def maybe_get_vit_flash_attn_backend(
attn_backend: AttentionBackendEnum | None,
) -> Callable | None:
# At this point,
# we already have the attn_backend,
# overriding logic is done in the platform-specific implementation.
# so we don't need to override backend here.
# Just return the attn_backend and flash_attn_varlen_func.

if attn_backend == AttentionBackendEnum.FLASH_ATTN:
from vllm.attention.utils.fa_utils import flash_attn_varlen_func
elif attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
from aiter import flash_attn_varlen_func
else:
flash_attn_varlen_func = None

# if attn_backend is TORCH_SDPA,
# it will reach here and the flash_attn_varlen_func will be None.
return flash_attn_varlen_func


@CustomOp.register("mm_encoder_attn")
class MMEncoderAttention(CustomOp):
"""Multi-headed attention without any cache, used for multimodal encoder."""

def __init__(
self,
num_heads: int,
head_size: int,
scale: float | None = None,
num_kv_heads: int | None = None,
prefix: str = "",
multimodal_config: MultiModalConfig | None = None,
) -> None:
"""
Args:
num_heads: number of attention heads per partition.
head_size: hidden_size per attention head.
scale: scale factor.
num_kv_heads: number of kv heads.
prefix: This has no effect, it is only here to make it easier to
swap between Attention and MultiHeadAttention
multimodal_config: configs for multi-modal.
"""
super().__init__()

self.num_heads = num_heads
self.head_size = head_size
self.scale = scale
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
self.layer_name = prefix

assert self.num_heads % self.num_kv_heads == 0, (
f"num_heads ({self.num_heads}) is not "
f"divisible by num_kv_heads ({self.num_kv_heads})"
)
self.num_queries_per_kv = self.num_heads // self.num_kv_heads

# During model initialization, the default dtype is set as the model
# weight and activation dtype.
dtype = torch.get_default_dtype()

# Try to get vision attention backend from multimodal_config.
attn_backend_override = None
if multimodal_config is not None:
attn_backend_override = multimodal_config.mm_encoder_attn_backend

# Get device-specific vision attention backend.
self.attn_backend = get_vit_attn_backend(
head_size=head_size,
dtype=dtype,
attn_backend_override=attn_backend_override,
)

self.is_flash_attn_backend = self.attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
}

self.flash_attn_varlen_func = maybe_get_vit_flash_attn_backend(
self.attn_backend,
)

logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")

@classmethod
def enabled(cls) -> bool:
return True

def reshape_qkv_to_4d(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
bsz: int,
q_len: int,
kv_len: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Reshape query, key, value to 4D tensors:
(batch_size, seq_len, num_heads, head_size)
"""
query = query.view(bsz, q_len, self.num_heads, self.head_size)
key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)

if (num_repeat := self.num_queries_per_kv) > 1:
# Handle MQA and GQA
key = torch.repeat_interleave(key, num_repeat, dim=2)
value = torch.repeat_interleave(value, num_repeat, dim=2)

return query, key, value

def reshape_qkv_to_3d(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
bsz: int,
q_len: int,
kv_len: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Reshape query, key, value to 3D tensors:
(batch_size * seq_len, num_heads, head_size)
"""
query = query.view(bsz * q_len, self.num_heads, self.head_size)
key = key.view(bsz * kv_len, self.num_kv_heads, self.head_size)
value = value.view(bsz * kv_len, self.num_kv_heads, self.head_size)

if (num_repeat := self.num_queries_per_kv) > 1:
# Handle MQA and GQA
key = torch.repeat_interleave(key, num_repeat, dim=1)
value = torch.repeat_interleave(value, num_repeat, dim=1)

return query, key, value

def _forward_sdpa(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
) -> torch.Tensor:
# TODO(Isotr0py): Migrate MultiHeadAttention
assert cu_seqlens is not None

bsz, q_len = query.size()[:2]
kv_len = key.size(1)

query, key, value = self.reshape_qkv_to_4d(
query, key, value, bsz, q_len, kv_len
)

output = vit_torch_sdpa_wrapper(
q=query,
k=key,
v=value,
cu_seqlens=cu_seqlens,
)
return output

def _forward_fa(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
assert self.flash_attn_varlen_func is not None, (
"Flash attention function is not set."
)
# # TODO(Isotr0py): Migrate MultiHeadAttention
assert cu_seqlens is not None and max_seqlen is not None

bsz = query.shape[0]

output = vit_flash_attn_wrapper(
q=query,
k=key,
v=value,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
batch_size=bsz,
is_rocm_aiter=(self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA),
)
return output

def forward_native(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
return self._forward_sdpa(query, key, value, cu_seqlens)

def forward_cuda(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
if self.is_flash_attn_backend:
return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)
elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
return self._forward_sdpa(query, key, value, cu_seqlens)
else:
raise ValueError(
f"Unsupported multi-modal encoder attention backend for CUDA: "
f"{self.attn_backend}."
)

def forward_cpu(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
return self._forward_sdpa(query, key, value, cu_seqlens)

def forward_xpu(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
assert self.is_flash_attn_backend, (
"XPU only supports FLASH_ATTN for vision attention."
)
return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)

def forward_tpu(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
assert self.attn_backend == AttentionBackendEnum.PALLAS, (
f"MMEncoderAttention on TPU only supports PALLAS backend, "
f"but got {self.attn_backend}."
)
if cu_seqlens is None:
query, key, value = (x.transpose(1, 2) for x in (query, key, value))
from torch_xla.experimental.custom_kernel import flash_attention

out = flash_attention(query, key, value, sm_scale=self.scale)
out = out.transpose(1, 2)
return out
logger.warning_once(
"PALLAS backend with cu_seqlens is not supported for ViT yet. ",
"Falling back to SDPA implementation.",
)
return self._forward_sdpa(query, key, value, cu_seqlens)

+ 30
- 39
vllm/attention/ops/triton_unified_attention.py View File

@@ -355,7 +355,7 @@ def kernel_unified_attention_2d(
@triton.jit
def kernel_unified_attention_3d(
segm_output_ptr,
# [num_tokens, num_query_heads, num_segments, head_size]
# [num_tokens, num_query_heads, num_segments, head_size_padded]
segm_max_ptr, # [num_tokens, num_query_heads, num_segments]
segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments]
query_ptr, # [num_tokens, num_query_heads, head_size]
@@ -749,6 +749,11 @@ def unified_attention(
q_descale,
k_descale,
v_descale,
seq_threshold_3D=None,
num_par_softmax_segments=None,
softmax_segm_output=None,
softmax_segm_max=None,
softmax_segm_expsum=None,
alibi_slopes=None,
output_scale=None,
qq_bias=None,
@@ -793,8 +798,19 @@ def unified_attention(
TILE_SIZE_PREFILL = 32
TILE_SIZE_DECODE = 16 if q.element_size() >= 2 else 32

# if batch contains a prefill
if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128:
# Launch the 2D kernel if
# 1. No intermediate tiled softmax buffers for the 3D kernel have been allocated, or
# 2. The batch includes at least one prefill request, or
# 3. The number of sequences exceeds the configured threshold
if (
seq_threshold_3D is None
or num_par_softmax_segments is None
or softmax_segm_output is None
or softmax_segm_max is None
or softmax_segm_expsum is None
or max_seqlen_q > 1
or num_seqs > seq_threshold_3D
):
kernel_unified_attention_2d[
(
total_num_q_blocks,
@@ -847,37 +863,12 @@ def unified_attention(
USE_FP8=output_scale is not None,
)
else:
# for initial version, NUM_SEGMENTS = 16 is chosen as a default
# value that showed good performance in tests
NUM_SEGMENTS = 16

segm_output = torch.empty(
q.shape[0],
num_query_heads,
NUM_SEGMENTS,
triton.next_power_of_2(head_size),
dtype=torch.float32,
device=q.device,
)
segm_max = torch.empty(
q.shape[0],
num_query_heads,
NUM_SEGMENTS,
dtype=torch.float32,
device=q.device,
)
segm_expsum = torch.empty(
q.shape[0],
num_query_heads,
NUM_SEGMENTS,
dtype=torch.float32,
device=q.device,
)

kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)](
segm_output_ptr=segm_output,
segm_max_ptr=segm_max,
segm_expsum_ptr=segm_expsum,
kernel_unified_attention_3d[
(total_num_q_blocks, num_kv_heads, num_par_softmax_segments)
](
segm_output_ptr=softmax_segm_output,
segm_max_ptr=softmax_segm_max,
segm_expsum_ptr=softmax_segm_expsum,
query_ptr=q,
key_cache_ptr=k,
value_cache_ptr=v,
@@ -917,13 +908,13 @@ def unified_attention(
BLOCK_Q=BLOCK_Q,
num_seqs=num_seqs,
BLOCK_M=BLOCK_M,
NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
NUM_SEGMENTS_PER_SEQ=num_par_softmax_segments,
)
reduce_segments[(q.shape[0], num_query_heads)](
output_ptr=out,
segm_output_ptr=segm_output,
segm_max_ptr=segm_max,
segm_expsum_ptr=segm_expsum,
segm_output_ptr=softmax_segm_output,
segm_max_ptr=softmax_segm_max,
segm_expsum_ptr=softmax_segm_expsum,
seq_lens_ptr=seqused_k,
num_seqs=num_seqs,
num_query_heads=num_query_heads,
@@ -936,6 +927,6 @@ def unified_attention(
HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
query_start_len_ptr=cu_seqlens_q,
BLOCK_Q=BLOCK_Q,
NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
NUM_SEGMENTS_PER_SEQ=num_par_softmax_segments,
USE_FP8=output_scale is not None,
)

+ 3
- 8
vllm/attention/ops/vit_attn_wrappers.py View File

@@ -44,9 +44,7 @@ def flash_attn_maxseqlen_wrapper(
dropout_p=0.0,
causal=False,
)
context_layer = einops.rearrange(
output, "(b s) h d -> s b (h d)", b=batch_size
).contiguous()
context_layer = einops.rearrange(output, "(b s) h d -> b s h d", b=batch_size)
return context_layer


@@ -59,8 +57,7 @@ def flash_attn_maxseqlen_wrapper_fake(
batch_size: int,
is_rocm_aiter: bool,
) -> torch.Tensor:
b, s, h, d = q.shape
return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
return torch.empty_like(q)


direct_register_custom_op(
@@ -106,7 +103,6 @@ def torch_sdpa_wrapper(
output_i = einops.rearrange(output_i, "b h s d -> b s h d ")
outputs.append(output_i)
context_layer = torch.cat(outputs, dim=1)
context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
return context_layer


@@ -116,8 +112,7 @@ def torch_sdpa_wrapper_fake(
v: torch.Tensor,
cu_seqlens: torch.Tensor,
) -> torch.Tensor:
b, s, h, d = q.shape
return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
return torch.empty_like(q)


direct_register_custom_op(


+ 3
- 1
vllm/benchmarks/serve.py View File

@@ -235,7 +235,9 @@ async def get_request(


def calculate_metrics_for_embeddings(
outputs: list[RequestFuncOutput], dur_s: float, selected_percentiles: list[float]
outputs: list[RequestFuncOutput],
dur_s: float,
selected_percentiles: list[float],
) -> EmbedBenchmarkMetrics:
"""Calculate the metrics for the embedding requests.



Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save
Baidu
map