Merge branch 'main' into wentao-optimize-startup-logs-4

[bug] Fix "Current vLLM config is not set." warnings when FlashInfer attention is used (#30241)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,24 @@
 # CI configuration for the "vllm_ci" pipeline.
 name: vllm_ci
 # Directories that hold the step definitions (test areas and image builds).
 job_dirs:
  - ".buildkite/test_areas"
  - ".buildkite/image_build"
 # NOTE(review): presumably a change touching any of these paths runs every
 # job regardless of per-step source_file_dependencies -- confirm against the
 # pipeline generator.
 run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
  - "requirements/common.txt"
  - "requirements/cuda.txt"
  - "requirements/build.txt"
  - "requirements/test.txt"
  - "setup.py"
  - "csrc/"
  - "cmake/"
 # NOTE(review): presumably exceptions carved out of run_all_patterns (e.g.
 # "docker/Dockerfile." would cover the suffixed Dockerfile.* variants while
 # leaving "docker/Dockerfile" itself in effect) -- confirm matching semantics.
 run_all_exclude_patterns:
  - "docker/Dockerfile."
  - "csrc/cpu/"
  - "csrc/rocm/"
  - "cmake/hipify.py"
  - "cmake/cpu_extension.cmake"
 # NOTE(review): presumably the source of the registry/repo values handed to
 # the image build scripts, with one repository per branch kind -- confirm.
 registries: public.ecr.aws/q9t5s3a7
 repositories:
  main: "vllm-ci-postmerge-repo"
  premerge: "vllm-ci-test-repo"
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -0,0 +1,56 @@
 #!/bin/bash
 # Build the vLLM test image from docker/Dockerfile with buildx and push it as
 # <registry>/<repo>:<commit> (plus :latest when <branch> is "main").
 # Exits 0 without building when the commit tag already exists in the registry.
 #
 # Arguments:
 #   <registry> <repo> <commit> <branch> <vllm_use_precompiled>
 #   <vllm_merge_base_commit> <cache_from> <cache_to>
 set -e

 if [[ $# -lt 8 ]]; then
  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
  exit 1
 fi

 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
 VLLM_USE_PRECOMPILED=$5
 VLLM_MERGE_BASE_COMMIT=$6
 CACHE_FROM=$7
 CACHE_TO=$8

 image_ref="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}"

 # authenticate with AWS ECR
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${REGISTRY}"
 aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

 # docker buildx
 # Reuse the builder when it already exists: `buildx create` fails on a
 # duplicate name, which would abort the script under `set -e`.
 if docker buildx inspect vllm-builder > /dev/null 2>&1; then
  docker buildx use vllm-builder
 else
  docker buildx create --name vllm-builder --driver docker-container --use
 fi
 docker buildx inspect --bootstrap
 docker buildx ls

 # skip build if image already exists
 # The lookup runs inside the `if` condition, so a missing manifest yields
 # empty output instead of aborting under `set -e`.
 if [[ -z "$(docker manifest inspect "${image_ref}")" ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
  exit 0
 fi

 # Optional flags are collected in arrays so each value stays a single word
 # even if it contains whitespace or glob characters.
 precompiled_args=()
 if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
  precompiled_args+=(--build-arg "VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}")
 fi

 # Builds from main additionally move the :latest tag.
 latest_tag_args=()
 if [[ "${BRANCH}" == "main" ]]; then
  latest_tag_args+=(--tag "${REGISTRY}/${REPO}:latest")
 fi

 # build
 docker buildx build --file docker/Dockerfile \
  --build-arg max_jobs=16 \
  --build-arg "buildkite_commit=${BUILDKITE_COMMIT}" \
  --build-arg USE_SCCACHE=1 \
  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
  "${precompiled_args[@]}" \
  --cache-from "type=registry,ref=${CACHE_FROM},mode=max" \
  --cache-to "type=registry,ref=${CACHE_TO},mode=max" \
  --tag "${image_ref}" \
  "${latest_tag_args[@]}" \
  --push \
  --target test \
  --progress plain .
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -0,0 +1,57 @@
 # Image build step group: one step per image flavour (default, CPU, HPU,
 # CPU arm64). Each step runs its script from .buildkite/image_build/ and is
 # retried automatically (limit 2) on exit statuses -1 and -10.
 group: Abuild
 steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
    commands:
    # NOTE(review): image_build.sh exits with its usage message when it gets
    # fewer than 8 arguments; if any of these variables reaches the shell empty
    # and unquoted the count drops -- confirm how the values are interpolated.
    - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - label: ":docker: Build CPU image"
    key: image-build-cpu
    depends_on: []
    commands:
    - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  # The HPU build is the only step marked soft_fail.
  - label: ":docker: Build HPU image"
    soft_fail: true
    depends_on: []
    key: image-build-hpu
    commands:
    - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
  
  # The arm64 CPU build is the only step marked optional.
  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
    optional: true
    commands:
    - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -0,0 +1,36 @@
 #!/bin/bash
 # Build the vLLM CPU test image from docker/Dockerfile.cpu and push it as
 # <registry>/<repo>:<commit>-cpu.
 # Exits 0 without building when that tag already exists in the registry.
 #
 # Arguments: <registry> <repo> <commit>
 set -e

 if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
 fi

 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3

 # Single definition of the tag used for the lookup, the build and the push.
 image_ref="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

 # authenticate with AWS ECR
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${REGISTRY}"

 # skip build if image already exists
 # The lookup runs inside the `if` condition, so a missing manifest yields
 # empty output instead of aborting under `set -e`.
 if [[ -z "$(docker manifest inspect "${image_ref}")" ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
  exit 0
 fi

 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg "buildkite_commit=${BUILDKITE_COMMIT}" \
  --build-arg VLLM_CPU_AVX512BF16=true \
  --build-arg VLLM_CPU_AVX512VNNI=true \
  --build-arg VLLM_CPU_AMXBF16=true \
  --tag "${image_ref}" \
  --target vllm-test \
  --progress plain .

 # push
 docker push "${image_ref}"
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -0,0 +1,33 @@
 #!/bin/bash
 # Build the vLLM CPU test image for arm64 from docker/Dockerfile.cpu and push
 # it as <registry>/<repo>:<commit>-cpu.
 # Exits 0 without building when that tag already exists in the registry.
 #
 # Arguments: <registry> <repo> <commit>
 set -e

 if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
 fi

 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3

 # NOTE(review): this is the same "-cpu" tag that image_build_cpu.sh checks and
 # pushes. When both steps target the same registry/repo, whichever finishes
 # first makes the other one hit "Image found" and skip its build -- confirm
 # whether an arch-specific suffix is intended before changing the tag, since
 # consumers of the tag are not visible here.
 image_ref="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

 # authenticate with AWS ECR
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${REGISTRY}"

 # skip build if image already exists
 # The lookup runs inside the `if` condition, so a missing manifest yields
 # empty output instead of aborting under `set -e`.
 if [[ -z "$(docker manifest inspect "${image_ref}")" ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
  exit 0
 fi

 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg "buildkite_commit=${BUILDKITE_COMMIT}" \
  --tag "${image_ref}" \
  --target vllm-test \
  --progress plain .

 # push
 docker push "${image_ref}"
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -0,0 +1,34 @@
 #!/bin/bash
 # Build the HPU test image and push it as <registry>/<repo>:<commit>-hpu.
 # The build context is the remote vllm-gaudi git repository, using the
 # Dockerfile at tests/pytorch_ci_hud_benchmark/Dockerfile.hpu.
 # Exits 0 without building when that tag already exists in the registry.
 #
 # Arguments: <registry> <repo> <commit>
 set -e

 if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
 fi

 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3

 # Single definition of the tag used for the lookup, the build and the push.
 image_ref="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-hpu"

 # authenticate with AWS ECR
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${REGISTRY}"

 # skip build if image already exists
 # The lookup runs inside the `if` condition, so a missing manifest yields
 # empty output instead of aborting under `set -e`.
 if [[ -z "$(docker manifest inspect "${image_ref}")" ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
  exit 0
 fi

 # build
 docker build \
  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
  --build-arg max_jobs=16 \
  --build-arg "buildkite_commit=${BUILDKITE_COMMIT}" \
  --tag "${image_ref}" \
  --progress plain \
  https://github.com/vllm-project/vllm-gaudi.git

 # push
 docker push "${image_ref}"
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -36,11 +36,17 @@ function cpu_tests() {
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run model tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"

  # Run kernel tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
    pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"

  # basic online serving
  docker exec cpu-test bash -c '
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -47,6 +47,6 @@ docker run \
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
 PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
 PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

 if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
    echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
    exit 0
 fi

 echo "Setting up Prime-RL integration test environment..."

 # Clean up any existing Prime-RL directory
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -398,7 +398,8 @@ steps:
  timeout_in_minutes: 25
  gpu: h100
  source_file_dependencies:
    - vllm/
    - vllm/v1/attention
    - vllm/model_executor/layers
    - tests/v1/determinism/
  commands:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -440,23 +441,29 @@ steps:
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - vllm/multimodal
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
    # for basic
    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_pooling.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    # for pooling models
    - python3 pooling/pooling/vision_language_pooling.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -718,6 +725,18 @@ steps:
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  autorun_on_main: true
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

 - label: OpenAI API correctness # 10min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
@@ -727,7 +746,7 @@ steps:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands: # LMEval
  commands: # LMEval+Transcription WER check
  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
  - pytest -s entrypoints/openai/correctness/

@@ -963,6 +982,19 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

 - label: Multi-Modal Models Test (Extended) 1 # 60min
  timeout_in_minutes: 120
  mirror_hardwares: [amdexperimental]
@@ -1098,7 +1130,6 @@ steps:
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - vllm/model_executor/layers/fused_moe/layer.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
@@ -1132,12 +1163,25 @@ steps:
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
    # Run all e2e fusion tests
    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 - label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -1155,6 +1199,16 @@ steps:
  commands:
    - pytest -s -v tests/quantization/test_blackwell_moe.py

 - label: Blackwell LM Eval Small Models
  timeout_in_minutes: 120
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1

 #####  1 GPU test  #####
 #####  multi gpus test  #####

@@ -1397,6 +1451,39 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py


 - label: LM Eval Large Models # optional
  gpu: a100
  optional: true
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 ##### H100 test #####
 - label: LM Eval Large Models (H100) # optional
  gpu: h100
  optional: true
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4


 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
  mirror_hardwares: [amdexperimental]
@@ -1440,29 +1527,6 @@ steps:
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

 - label: Blackwell LM Eval Small Models
  timeout_in_minutes: 120
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1

 - label: Multi-Modal Accuracy Eval (Small Models) # 10min
  timeout_in_minutes: 70
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

 - label: LM Eval Large Models (4 Card)
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
@@ -1478,21 +1542,6 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 - label: LM Eval Large Models (H100) # optional
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

 - label: ROCm LM Eval Large Models (8 Card)
  mirror_hardwares: [amdproduction]
  agent_pool: mi325_8
@@ -1517,6 +1566,20 @@ steps:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  timeout_in_minutes: 30
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh
 - label: DeepSeek V2-Lite Accuracy
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
@@ -1550,17 +1613,26 @@ steps:
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
 - label: DeepSeek V2-Lite Async EPLB Accuracy
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 30
  gpu: h100
  optional: true
  num_gpus: 2
  num_gpus: 4
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030

 - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -468,7 +468,9 @@ steps:
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
  # However, find does not normally propagate error codes, so we combine it with xargs
  # (using -0 for proper path handling)
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@@ -482,7 +484,9 @@ steps:
  # as it is a heavy test that is covered in other steps.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
  # However, find does not normally propagate error codes, so we combine it with xargs
  # (using -0 for proper path handling)
  - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

 - label: PyTorch Fullgraph Test # 27min
  timeout_in_minutes: 40
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@@ -0,0 +1,21 @@
 # Attention test area: runs the v1/attention pytest suite on H100 and B200
 # agents. The group depends on the image-build step.
 group: Attention
 depends_on: 
  - image-build
 steps:
 - label: V1 attention (H100)
  timeout_in_minutes: 30
  gpu: h100
  source_file_dependencies:
    - vllm/v1/attention
    - tests/v1/attention
  commands:
    - pytest -v -s v1/attention

 # Same suite on B200, with VLLM_DISABLE_FLASHINFER_PREFILL=1 set (see the TODO
 # on the command).
 - label: V1 attention (B200)
  timeout_in_minutes: 30
  gpu: b200
  source_file_dependencies:
    - vllm/v1/attention
    - tests/v1/attention
  commands:
    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -0,0 +1,16 @@
 # Basic Correctness test area: a single step running three pytest files from
 # basic_correctness/ with the spawn multiprocessing method. The group depends
 # on the image-build step.
 group: Basic Correctness
 depends_on: 
  - image-build
 steps:
 - label: Basic Correctness
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  # NOTE(review): the next two paths omit the ".py" suffix, unlike
  # test_cumem.py below -- presumably they are matched as path prefixes; confirm.
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
--- a/.buildkite/test_areas/benchmarks.yaml
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -0,0 +1,19 @@
 # Benchmarks test area: one step runs scripts/run-benchmarks.sh, the other
 # runs the pytest suite under benchmarks/. The group depends on the
 # image-build step.
 group: Benchmarks
 depends_on: 
  - image-build
 steps:
 # Runs from /vllm-workspace/.buildkite, so the script path is relative to it.
 - label: Benchmarks
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash scripts/run-benchmarks.sh

 - label: Benchmarks CLI Test
  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -0,0 +1,57 @@
 # Compile test area: fusion and compile tests on B200 agents. The group
 # depends on the image-build step.
 group: Compile
 depends_on: 
  - image-build
 steps:
 # Runs a selected subset of the fusion/compile tests from /vllm-workspace/.
 - label: Fusion and Compile Tests (B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
    # Wrap with quotes to escape yaml
    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

 # Optional step that runs the whole e2e fusion test file with num_gpus: 2.
 - label: Fusion E2E (2 GPUs)(B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
    - nvidia-smi
    # Run all e2e fusion tests
    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

--- a/.buildkite/test_areas/cuda.yaml
+++ b/.buildkite/test_areas/cuda.yaml
@@ -0,0 +1,22 @@
 # CUDA test area: a CUDA context test and the v1 cudagraph tests. The group
 # depends on the image-build step.
 group: CUDA
 depends_on: 
  - image-build
 steps:
 - label: Platform Tests (CUDA)
  timeout_in_minutes: 15
  source_file_dependencies:
  - vllm/
  - tests/cuda
  commands:
    - pytest -v -s cuda/test_cuda_context.py

 # Runs the cudagraph dispatch and cudagraph mode tests.
 - label: Cudagraph
  timeout_in_minutes: 20
  source_file_dependencies:
  - tests/v1/cudagraph
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/config/compilation.py
  - vllm/compilation
  commands:
    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -0,0 +1,199 @@
 # Test area "Distributed"; depends_on lists the image-build step.
 group: Distributed
 depends_on: 
  - image-build
 steps:
 # Communication-op and shared-memory (shm_*) tests; 2 GPUs, run from tests/.
 - label: Distributed Comm Ops
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
  - pytest -v -s distributed/test_shm_buffer.py
  - pytest -v -s distributed/test_shm_storage.py

 # 2-GPU distributed step: data-parallel (DP_SIZE=2) tests, collective RPC,
 # compile tests, same-node checks, sequence parallel, shutdown and worker
 # memory snapshot tests. Commands run from /vllm-workspace/tests.
 - label: Distributed (2 GPUs)
  timeout_in_minutes: 90
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
  - tests/v1/distributed
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  # The grep makes the command succeed only if the pass message is printed.
  # NOTE(review): torchrun starts 4 processes here although this step sets
  # num_gpus: 2 - confirm this is intended.
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

 # 4-GPU distributed step: torchrun examples under several TP/PP/DP/EP
 # combinations, data-parallel tests, utility/pynccl/event tests, and the
 # RLHF example scripts. Commands start in /vllm-workspace/tests.
 - label: Distributed Tests (4 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=4 and dp=1
  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2, pp=2 and dp=1
  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=1 and dp=4 with ep
  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2 and dp=2 with ep
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  # The cd below changes the directory for the remaining commands.
  - cd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py

 # 8-GPU H100 step: runs the torchrun data-parallel example with
 # tp=2, pp=1, dp=4 and expert parallelism enabled.
 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
  gpu: h100
  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 # Optional A100 step (4 GPUs): custom all-reduce, CA buffer sharing,
 # 2-GPU basic correctness with TARGET_TEST_SUITE=A100, and Mixtral LoRA.
 # No working_dir is set here; the relative paths assume the default one.
 - label: Distributed Tests (4 GPUs)(A100)
  gpu: a100
  optional: true
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

 # Optional H200 step (2 GPUs), run from the repository root: compile
 # distributed tests, sequence/context parallel tests, a data-parallel MoE
 # example, and the DBO test. No source_file_dependencies are listed.
 - label: Distributed Tests (2 GPUs)(H200)
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    # The -k filter deselects tests whose id contains "Llama-4"
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    # NOTE(review): CUDA_VISIBLE_DEVICES=1,2 is used on a step with
    # num_gpus: 2 - confirm these device indices exist on the runner.
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
    - pytest -v -s tests/v1/distributed/test_dbo.py

 # Optional B200 step (2 GPUs): context parallel, NCCL symmetric-memory
 # all-reduce, and DBO tests, run from the repository root.
 - label: Distributed Tests (2 GPUs)(B200)
  gpu: b200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
    - pytest -v -s tests/v1/distributed/test_dbo.py

 # Multi-node step: 2 nodes with 2 GPUs each (num_nodes: 2, num_gpus: 2).
 # The single command calls run-multi-node-test.sh with the working
 # directory, "2 2", a pinned image tag, and two quoted command strings
 # (they differ in --node-rank=0 vs --node-rank=1; only the first also runs
 # the VLLM_MULTI_NODE pytest files).
 # NOTE(review): the image tag is a hard-coded commit SHA - confirm it is
 # still the intended image.
 - label: 2 Node Test (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"

 # 4-GPU step: installs the kv_connectors requirements, then runs the
 # NIXL integration TP-config sweep accuracy script.
 - label: Distributed NixlConnector PD accuracy (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh

 # 4-GPU step: pipeline-parallel tests (with and without CUDA graphs),
 # run from /vllm-workspace/tests.
 - label: Pipeline + Context Parallelism (4 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -0,0 +1,59 @@
 # Test area "E2E Integration"; depends_on lists the image-build step.
 group: E2E Integration
 depends_on: 
  - image-build
 steps:
 # Optional H100 step (4 GPUs): runs the DeepSeek V2-Lite EP/EPLB script.
 # NOTE(review): the positional arguments (0.25 200 8010) are defined by the
 # script, which is not visible here - confirm their meaning there.
 - label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

 # Optional H100 step (4 GPUs): runs the Qwen3-30B-A3B FP8-block EP/EPLB
 # script with three positional arguments.
 - label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

 # Optional B200 step (2 GPUs): same script as the H100 variant, called
 # with two extra positional arguments ("2 1").
 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
  gpu: b200
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

 # Optional 2-GPU step: runs the Prime-RL integration script.
 - label: Prime-RL Integration (2 GPUs)
  timeout_in_minutes: 30
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh

 # Optional H100 step (4 GPUs): runs the DeepSeek V2-Lite async EPLB script.
 - label: DeepSeek V2-Lite Async EPLB Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030

 # Optional H100 step (4 GPUs): runs the Qwen3-Next MTP async EPLB script.
 - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -0,0 +1,26 @@
 # Test area "Engine"; depends_on lists the image-build step.
 group: Engine
 depends_on: 
  - image-build
 steps:
 # Runs the engine test directory plus the sequence, config, logger and
 # vllm_port test files in a single pytest invocation.
 - label: Engine
  timeout_in_minutes: 15
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

 # Runs the v1/e2e and v1/engine test directories as separate commands.
 - label: V1 e2e + engine
  timeout_in_minutes: 45
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
    - pytest -v -s v1/engine
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -0,0 +1,68 @@
 # Test area "Entrypoints"; depends_on lists the image-build step.
 group: Entrypoints
 depends_on: 
  - image-build
 steps:
 # Entrypoints unit tests: the tool parser tests first, then everything under
 # entrypoints/ except the paths excluded with --ignore.
 - label: Entrypoints Unit Tests
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/entrypoints
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling

 # LLM entrypoint integration tests. test_generate.py and offline_mode run
 # as separate pytest invocations; test_collective_rpc.py is excluded here.
 - label: Entrypoints Integration (LLM)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 # OpenAI API server integration tests. The collective RPC test runs on its
 # own first; the main run excludes it along with the other --ignore paths.
 - label: Entrypoints Integration (API Server)
  timeout_in_minutes: 130
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py


 # Pooling entrypoint tests, run with the spawn multiprocessing method.
 - label: Entrypoints Integration (Pooling)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling


 # Runs the v1/entrypoints test directory.
 - label: Entrypoints V1
  timeout_in_minutes: 50
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    - pytest -v -s v1/entrypoints

 # Runs the entrypoints/openai/correctness test directory.
 - label: OpenAI API Correctness
  timeout_in_minutes: 30
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -0,0 +1,23 @@
 # Test area "Expert Parallelism"; depends_on lists the image-build step.
 group: Expert Parallelism
 depends_on: 
  - image-build
 steps:
 # Runs the EPLB algorithm test file; no num_gpus is set on this step.
 - label: EPLB Algorithm
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_algo.py
  commands:
  - pytest -v -s distributed/test_eplb_algo.py

 # 4-GPU step: runs the EPLB execution and EPLB spec-decode test files.
 - label: EPLB Execution
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  # Listed because this step also runs the spec-decode test file below.
  - tests/distributed/test_eplb_spec_decode.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
  - pytest -v -s distributed/test_eplb_spec_decode.py
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -0,0 +1,117 @@
 # Test area "Kernels"; depends_on lists the image-build step.
 group: Kernels
 depends_on: 
  - image-build
 steps:
 # Runs the kernels/core directory and the top-k-per-row kernel test.
 - label: Kernels Core Operation Test
  timeout_in_minutes: 75
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  commands:
    - pytest -v -s kernels/core kernels/test_top_k_per_row.py

 # Attention kernel tests split into 2 parallel jobs; each job passes its
 # Buildkite parallel job index and count as --shard-id / --num-shards.
 - label: Kernels Attention Test %N
  timeout_in_minutes: 35
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

 # Quantization kernel tests split into 2 parallel jobs via
 # --shard-id / --num-shards.
 - label: Kernels Quantization Test %N
  timeout_in_minutes: 90
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
    - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

 # MoE kernel tests split into 2 parallel jobs via --shard-id / --num-shards.
 - label: Kernels MoE Test %N
  timeout_in_minutes: 60
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

 # Runs the kernels/mamba test directory.
 - label: Kernels Mamba Test
  timeout_in_minutes: 45
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  - vllm/model_executor/layers/mamba/ops
  commands:
    - pytest -v -s kernels/mamba

 # H100 step (1 GPU): DeepGEMM-related quantization, MoE and attention
 # kernel tests. The block-fp8 file is filtered with -k deep_gemm.
 - label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
  gpu: h100
  num_gpus: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization/test_block_fp8.py
  - tests/kernels/moe/test_deepgemm.py
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
    - pytest -v -s kernels/moe/test_deepgemm.py
    - pytest -v -s kernels/moe/test_batched_deepgemm.py
    - pytest -v -s kernels/attention/test_deepgemm_attention.py

 # B200 step, run from the repository root: a chat example as a smoke run,
 # then attention, quantization and MoE kernel tests.
 # The "optional: true" line below is commented out in this step.
 - label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - csrc/attention/mla/
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
  - vllm/platforms/cuda.py
  - vllm/attention/selector.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_attention_selector.py
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
    # Quantization
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
    # MoE
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -0,0 +1,46 @@
 # Test area "LM Eval"; depends_on lists the image-build step.
 group: LM Eval
 depends_on: 
  - image-build
 steps:
 # GSM8K correctness test over the models-small config list with tp-size=1.
 - label: LM Eval Small Models
  timeout_in_minutes: 75
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  autorun_on_main: true
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

 # Optional A100 step (4 GPUs): lm-eval correctness over the models-large
 # config list with tp-size=4, run from the lm-eval-harness directory.
 - label: LM Eval Large Models (4 GPUs)(A100)
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 # Optional H100 step (4 GPUs): lm-eval correctness over the
 # models-large-hopper config list with tp-size=4 and DeepGEMM disabled.
 - label: LM Eval Large Models (4 GPUs)(H100)
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

 # Optional B200 step: GSM8K correctness over the models-blackwell config
 # list with tp-size=1.
 - label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
  gpu: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -0,0 +1,31 @@
 # Test area "LoRA"; depends_on lists the image-build step.
 group: LoRA
 depends_on: 
  - image-build
 steps:
 # LoRA tests split into 4 parallel jobs via --shard-id / --num-shards.
 # The *_tp.py and multi-LoRA files passed to --ignore are excluded here.
 - label: LoRA %N
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
    - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4


 # 4-GPU step: runs the tensor-parallel LoRA test files one by one, each
 # stopping at the first failure (-x).
 # NOTE(review): test_deepseekv2_tp.py and test_qwen3moe_tp.py are ignored
 # by the sharded LoRA step but are not run here - confirm this is intended.
 - label: LoRA TP (Distributed)
  timeout_in_minutes: 30
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
    # FIXME: find out which code initializes CUDA before running the test;
    # until that is fixed, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py
    - pytest -v -s -x lora/test_gptoss_tp.py
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -0,0 +1,163 @@
 # Test area "Miscellaneous"; depends_on lists the image-build step.
 group: Miscellaneous
 depends_on: 
  - image-build
 steps:
 # Remaining v1 test directories, one pytest invocation each. Directories
 # selected with -m 'not cpu_test' skip tests carrying the cpu_test marker.
 - label: V1 Others
  timeout_in_minutes: 60
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    # split the test to avoid interference
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
    - pytest -v -s v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_request.py
    - pytest -v -s v1/test_outputs.py
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

 # CPU-only counterpart (no_gpu: true): runs tests selected by the cpu_test
 # marker plus structured output and serial utils tests.
 # depends_on is set to null (~) on this step, unlike the group-level value.
 - label: V1 Others (CPU)
  depends_on: ~
  source_file_dependencies:
    - vllm/
    - tests/v1
  no_gpu: true
  commands:
    # split the test to avoid interference
    - pytest -v -s -m 'cpu_test' v1/core
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'cpu_test' v1/metrics

 # Installs modelscope, then runs test_regression.py from the tests dir.
 - label: Regression
  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

 # Runs the example scripts as smoke tests from /vllm-workspace/examples:
 # basic, multi-modal, pooling, feature demos, tensorizer and spec decode.
 - label: Examples
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - vllm/multimodal
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
    - python3 offline_inference/basic/chat.py # for basic
    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    # for pooling models
    - python3 pooling/pooling/vision_language_pooling.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
    # serialize the model with tensorizer, then deserialize it again
    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

 # 2-GPU step: installs the OpenTelemetry packages, then runs v1/tracing.
 - label: Metrics, Tracing (2 GPUs)
  timeout_in_minutes: 20
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
  commands:
  # One double-quoted scalar; each trailing backslash joins the next line,
  # so this is a single pip install command.
  - "pip install \
      'opentelemetry-sdk>=1.26.0' \
      'opentelemetry-api>=1.26.0' \
      'opentelemetry-exporter-otlp>=1.26.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1'"
  - pytest -v -s v1/tracing

 # Runs the python-only compile script; depends_on is set to null (~).
 - label: Python-only Installation
  depends_on: ~
  timeout_in_minutes: 20
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
  commands:
  - bash standalone_tests/python_only_compile.sh

 # Runs multimodal tests without the cpu_test marker, plus the utils_ tests.
 - label: Async Engine, Inputs, Utils, Worker
  timeout_in_minutes: 50
  source_file_dependencies:
  - vllm/
  - tests/multimodal
  - tests/utils_
  commands:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

 # CPU-only step (no_gpu: true): lazy-import check, input/output tests,
 # cpu_test-marked multimodal tests, tokenizer, transformers_utils and
 # config tests. depends_on is set to null (~).
 - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
  depends_on: ~
  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/transformers_utils
  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s tokenizers_
  - pytest -v -s transformers_utils
  - pytest -v -s config

 # Optional B200 step: installs gpt-oss[eval] 0.0.5 and runs the GPQA
 # correctness test for openai/gpt-oss-20b with --metric 0.58.
 - label: GPT-OSS Eval (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 # H100 step: installs pytest-timeout and pytest-forked, then runs the
 # batch invariance and RMS-norm batch-invariant determinism tests.
 - label: Batch Invariance (H100)
  timeout_in_minutes: 25
  gpu: h100
  source_file_dependencies:
    - vllm/v1/attention
    - vllm/model_executor/layers
    - tests/v1/determinism/
  commands:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pip install pytest-timeout pytest-forked
    - pytest -v -s v1/determinism/test_batch_invariance.py
    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
--- a/.buildkite/test_areas/model_executor.yaml
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -0,0 +1,17 @@
 # Test area "Model Executor"; depends_on lists the image-build step.
 group: Model Executor
 depends_on: 
  - image-build
 steps:
 # Installs curl and libsodium23 via apt, then runs the model_executor
 # tests and the tensorizer entrypoint test with the spawn method.
 - label: Model Executor
  timeout_in_minutes: 35
  source_file_dependencies:
  - vllm/engine/arg_utils.py
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -0,0 +1,62 @@
 # Test area "Models - Basic"; depends_on lists the image-build step.
 group: Models - Basic
 depends_on: 
  - image-build
 steps:
 # Runs only test_can_initialize_small_subset from test_initialization.py.
 - label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
  commands:
    # Run a subset of model initialization tests
    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset

 # Runs the rest of test_initialization.py (everything except the small
 # subset), split into 2 parallel jobs via --num-shards / --shard-id.
 - label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/test_initialization.py
  commands:
    # Only when vLLM model source is modified - test initialization of a large
    # subset of supported models (the complement of the small subset in the above
    # test.) Also run if model initialization test file is modified
    - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

 # Runs the transformers and registry model test files together.
 - label: Basic Models Tests (Other)
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
    - pytest -v -s models/test_transformers.py models/test_registry.py

 # CPU-only step (no_gpu: true): runs the model utils and vision test files.
 # NOTE(review): unlike the other no_gpu steps visible in this patch, this
 # one does not set "depends_on: ~" - confirm whether that is intended.
 - label: Basic Models Test (Other CPU) # 5min
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
  no_gpu: true
  commands:
    - pytest -v -s models/test_utils.py models/test_vision.py

 # Optional, soft-failing step: upgrades transformers from its git
 # repository, then runs model tests and example scripts against it.
 - label: Transformers Nightly Models
  working_dir: "/vllm-workspace/"
  optional: true
  soft_fail: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s tests/models/test_initialization.py
    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
--- a/.buildkite/test_areas/models_distributed.yaml
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -0,0 +1,22 @@
 group: Models - Distributed
 depends_on: 
  - image-build
 steps:
 # Two-GPU step: runs the tests selected by the `distributed(num_gpus=2)`
 # marker plus the sharded-state loader test.
 - label: Distributed Model Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  # Whisper is excluded here and run separately below with the spawn
  # multiprocessing method.
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@@ -0,0 +1,91 @@
 group: Models - Language
 depends_on: 
  - image-build
 steps:
 # Runs the language-model tests marked core_model and not slow_test.
 - label: Language Models Tests (Standard)
  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
    # Test standard language models, excluding a subset of slow tests
    # Print the installed torch-related packages into the job log first
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/language -m 'core_model and (not slow_test)'

 # Runs the language-model tests marked both core_model and slow_test,
 # sharded via --num-shards/--shard-id (parallelism is 2).
 - label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/language/pooling/test_embedding.py
  - tests/models/language/generation/test_common.py
  - tests/models/language/pooling/test_classification.py
  commands:
    # Shard slow subset of standard language models tests. Only run when model
    # source is modified, or when specified test files are modified
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

 # Runs the generation tests marked hybrid_model, sharded via
 # --num-shards/--shard-id (parallelism is 2), after installing pinned mamba
 # and causal-conv1d builds from source.
 - label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    # Shard hybrid language model tests
    - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

 # Optional step covering the generation tests that are neither core_model
 # nor hybrid_model.
 - label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

 # Optional step running everything under models/language/generation_ppl_test.
 - label: Language Models Test (PPL)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation_ppl_test
  commands:
    - pytest -v -s models/language/generation_ppl_test

 # Optional step running the pooling tests that are not marked core_model.
 - label: Language Models Test (Extended Pooling)  # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'

 # Optional step running everything under models/language/pooling_mteb_test.
 - label: Language Models Test (MTEB)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling_mteb_test
  commands:
    - pytest -v -s models/language/pooling_mteb_test
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -0,0 +1,79 @@
 group: Models - Multimodal
 depends_on: 
  - image-build
 steps:
 # Runs the multimodal tests marked core_model. The processing directory and
 # the whisper test are excluded from the main run; whisper is then run on its
 # own from the parent directory with the spawn multiprocessing method.
 - label: Multi-Modal Models (Standard) # 60min
  timeout_in_minutes: 80
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

 # GPU-free step (no_gpu: true) running models/multimodal/processing except
 # test_tensor_schema.py.
 - label: Multi-Modal Processor Test (CPU)
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  no_gpu: true
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

 # Runs only models/multimodal/processing/test_tensor_schema.py; this step
 # does not set no_gpu.
 - label: Multi-Modal Processor # 44min
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing/test_tensor_schema.py

 # Runs test_lm_eval_correctness.py from the lm-eval-harness directory against
 # the model list in configs/models-mm-small.txt with --tp-size=1.
 - label: Multi-Modal Accuracy Eval (Small Models) # 50min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

 # Optional step: multimodal tests not marked core_model, excluding
 # generation/test_common.py and the processing directory.
 # No timeout_in_minutes is set on this step.
 - label: Multi-Modal Models (Extended) 1
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

 # Optional step: the split(group=0), non-core_model portion of
 # generation/test_common.py.
 - label: Multi-Modal Models (Extended) 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

 # Optional step: the split(group=1), non-core_model portion of
 # generation/test_common.py.
 - label: Multi-Modal Models (Extended) 3
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

 # This test is used only in PR development phase to test individual models and should never run on main
 # As committed it is a placeholder: the only command is an echo.
 - label: Custom Models
  optional: true
  commands:
    - echo 'Testing custom models...'
    # PR authors can temporarily add commands below to test individual models
    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -0,0 +1,34 @@
 group: Plugins
 depends_on: 
  - image-build
 steps:
 # Two-GPU step that exercises the test plugins one section at a time: each
 # section installs a plugin in editable mode, runs its tests, and uninstalls
 # it again before the next section starts.
 # NOTE(review): source_file_dependencies lists tests/plugins/ while several
 # commands run tests from plugins_tests/, distributed/, entrypoints/ and
 # models/ - confirm the dependency list covers what is intended.
 - label: Plugin Tests (2 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  # NOTE(review): uninstalled under the name dummy_stat_logger although it is
  # installed from the vllm_add_dummy_stat_logger directory - presumably the
  # distribution name differs from the directory name; confirm.
  - pip uninstall dummy_stat_logger -y
  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -0,0 +1,50 @@
 group: PyTorch
 depends_on: 
  - image-build
 steps:
 # Runs every test_*.py directly under compile/ (no subdirectories), one
 # pytest process per file.
 - label: PyTorch Compilation Unit Tests
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # Run unit tests defined directly under compile/,
  # not including subdirectories, which are usually heavier
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  # The file list is piped to `xargs -n1` instead of using `find -exec ... \;`
  # because find does not propagate the exit status of the command it executes,
  # so a failing test file would not fail this step. xargs still starts one
  # pytest per file and exits non-zero when any invocation fails; `-r` keeps
  # the "no files matched" case a no-op.
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -r -n1 pytest -s -v"

 # Runs every test_*.py under compile/fullgraph/ except test_full_graph.py,
 # one pytest process per file.
 - label: PyTorch Fullgraph Smoke Test
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # Run smoke tests under fullgraph directory, except test_full_graph.py
  # as it is a heavy test that is covered in other steps.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  # The file list is piped to `xargs -n1` instead of using `find -exec ... \;`
  # because find does not propagate the exit status of the command it executes,
  # so a failing test file would not fail this step. xargs still starts one
  # pytest per file and exits non-zero when any invocation fails; `-r` keeps
  # the "no files matched" case a no-op.
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -r -n1 pytest -s -v"

 # Runs the heavy compile/fullgraph/test_full_graph.py (minus the fp8 kv-scale
 # case) and a filtered subset of compile/distributed/test_fusions_e2e.py.
 - label: PyTorch Fullgraph
  timeout_in_minutes: 40
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 # Soft-failing check that runs standalone_tests/pytorch_nightly_dependency.sh;
 # it depends only on requirements/nightly_torch_test.txt.
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -0,0 +1,46 @@
 group: Quantization
 depends_on: 
  - image-build
 steps:
 # Runs everything under quantization/ except test_blackwell_moe.py, after
 # installing a pinned torchao build and conch-triton-kernels.
 - label: Quantization
  timeout_in_minutes: 90
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release, and pin a working version of torchao nightly here
  # NOTE(review): the command below pins torchao==0.13.0, so the "0.12 release"
  # wording above looks out of date - confirm and refresh.

  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 # Runs tests/quantization/test_blackwell_moe.py on the `b200` GPU queue; the
 # path is relative to the repository root because working_dir is
 # /vllm-workspace/.
 - label: Quantized MoE Test (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/models/llama4.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization/compressed_tensors
  - vllm/model_executor/layers/quantization/modelopt.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - pytest -s -v tests/quantization/test_blackwell_moe.py

 # Runs everything under models/quantization.
 - label: Quantized Models Test
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
    - pytest -v -s models/quantization
--- a/.buildkite/test_areas/samplers.yaml
+++ b/.buildkite/test_areas/samplers.yaml
@@ -0,0 +1,14 @@
 group: Samplers
 depends_on: 
  - image-build
 steps:
 # Runs the samplers test directory twice: once with the default environment
 # and once with VLLM_USE_FLASHINFER_SAMPLER=1.
 - label: Samplers Test
  timeout_in_minutes: 75
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  - tests/conftest.py
  commands:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
--- a/.buildkite/test_areas/tool_use.yaml
+++ b/.buildkite/test_areas/tool_use.yaml
@@ -0,0 +1,23 @@
 group: Tool use
 depends_on: 
  - image-build
 steps:
 # Runs the tool_use tests that are NOT marked cpu_test; the cpu_test-marked
 # ones are selected by the CPU step in this group.
 - label: OpenAI-Compatible Tool Use
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  fast_check: false
  source_file_dependencies:
    - vllm/
    - tests/tool_use
  commands:
    - pytest -v -s -m 'not cpu_test' tool_use

 # GPU-free step (no_gpu: true) running only the tool_use tests marked
 # cpu_test.
 - label: OpenAI-Compatible Tool Use (CPU)
  # `~` is YAML null. NOTE(review): presumably this clears the group-level
  # dependency on image-build for this step - confirm against the pipeline
  # generator.
  depends_on: ~
  timeout_in_minutes: 10
  source_file_dependencies:
    - vllm/
    - tests/tool_use
  no_gpu: true
  commands:
    - pytest -v -s -m 'cpu_test' tool_use
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -0,0 +1,25 @@
 group: Weight Loading
 depends_on: 
  - image-build
 steps:
 # Optional two-GPU step: runs run_model_weight_loading_test.sh with the model
 # list in weight_loading/models.txt.
 - label: Weight Loading Multiple GPU  # 33min
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

 # Optional two-GPU step on the `a100` queue: same script as the step above but
 # with the model list in weight_loading/models-large.txt.
 # No timeout_in_minutes is set on this step.
 - label: Weight Loading Multiple GPU - Large Models # optional
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -35,6 +35,20 @@ pull_request_rules:

        For future commits, `pre-commit` will run automatically on changed files before each commit.

        > [!TIP]
        > <details>
        > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
        > <br/>
        > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
        >
        > ```bash
        > # For mypy (substitute "3.10" with the failing version if needed)
        > pre-commit run --hook-stage manual mypy-3.10
        > # For markdownlint
        > pre-commit run --hook-stage manual markdownlint
        > ```
        > </details>

 - name: comment-dco-failure
  description: Comment on PR when DCO check fails
  conditions:
@@ -172,7 +186,7 @@ pull_request_rules:
      - files~=^tests/entrypoints/test_context.py
      - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
      - files~=^vllm/entrypoints/harmony_utils.py
      - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
      - files~=^vllm/entrypoints/tool_server.py
      - files~=^vllm/entrypoints/tool.py
      - files~=^vllm/entrypoints/context.py
@@ -390,4 +404,4 @@ pull_request_rules:
  actions:
    label:
      add:
        - kv-connector
        - kv-connector
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,7 +13,7 @@ jobs:

    steps:
      - name: Checkout repository
        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

      - name: Set up Python
        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -12,7 +12,7 @@ jobs:
    timeout-minutes: 30

    steps:
      - uses: actions/checkout@v6
      - uses: actions/checkout@v6.0.1

      - uses: astral-sh/setup-uv@v7
        with:
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,7 +16,7 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
    - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
      with:
        python-version: "3.12"
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -7,13 +7,15 @@ on:

 jobs:
  close-issues-and-pull-requests:
    # Prevents triggering on forks or other repos
    if: github.repository == 'vllm-project/vllm'
    permissions:
      issues: write
      pull-requests: write
      actions: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
    set(SRCS
       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
       "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
       "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
       )

    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/moe_lora_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -96,8 +96,9 @@ start_server() {
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
        local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
        VLLM_SERVER_DEV_MODE=1 \
            vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    else
        # Start server without profiling
        VLLM_SERVER_DEV_MODE=1 \
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -574,7 +574,7 @@ async def benchmark(
    )
    print(
        "{:<40} {:<10.2f}".format(
            "Total Token throughput (tok/s):", metrics.total_token_throughput
            "Total token throughput (tok/s):", metrics.total_token_throughput
        )
    )

@@ -963,8 +963,7 @@ def create_argument_parser():
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Use Torch Profiler. The endpoint must be launched with "
        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
        help="Use vLLM Profiling. --profiler-config must be provided on the server.",
    )
    parser.add_argument(
        "--result-dir",
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -14,6 +14,9 @@ from tqdm import tqdm

 import vllm._custom_ops as ops
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8,
 )


@dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
    hidden_size: int
    add_residual: bool
    dtype: torch.dtype
    group_size: list[int]

    def description(self):
        return (
@@ -29,6 +33,7 @@ class bench_params_t:
            f"x D {self.hidden_size} "
            f"x R {self.add_residual} "
            f"x DT {self.dtype}"
            f"x GS {self.group_size}"
        )


@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
    HIDDEN_SIZES = list(range(1024, 8129, 1024))
    ADD_RESIDUAL = [True, False]
    DTYPES = [torch.bfloat16, torch.float]
    GROUP_SIZES = [[1, 64], [1, 128]]

    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
    bench_params = list(
        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
    )
    return bench_params

@@ -52,6 +58,7 @@ def unfused_int8_impl(
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
    group_size: list[int],
 ):
    # Norm
    torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
    group_size: list[int],
 ):
    # Norm
    torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
    torch_out, _ = ops.scaled_fp8_quant(torch_out)


 def unfused_groupwise_fp8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
    group_size: list[int],
 ):
    """Unfused baseline: RMSNorm followed by a separate group-wise FP8 quant.

    ``quant_dtype`` is not read here; it is accepted so that every benchmarked
    implementation can be called with the same argument list. The quantization
    group width is ``group_size[1]``. Nothing is returned - the function exists
    only to be timed.
    """
    # Norm: when a residual is supplied, forward_cuda's result is unpacked as a
    # pair and only its first element is carried forward.
    normed = rms_norm_layer.forward_cuda(x, residual)
    if residual is not None:
        normed, _ = normed

    # Quant
    quantized, _ = per_token_group_quant_fp8(
        normed, use_ue8m0=False, group_size=group_size[1]
    )


 def fused_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
    group_size: list[int],
 ):
    """Fused path: a single ``ops.rms_norm_dynamic_per_token_quant`` call.

    The layer is used only as the holder of the norm weight, and the epsilon
    is fixed at 1e-6. ``group_size`` is not read here; it is accepted so that
    every benchmarked implementation shares one argument list. Nothing is
    returned - the function exists only to be timed.
    """
    norm_weight = rms_norm_layer.weight
    epsilon = 1e-6
    fused_result = ops.rms_norm_dynamic_per_token_quant(
        x, norm_weight, epsilon, quant_dtype, residual=residual
    )
    # The op's result is unpacked as a pair, exactly as a caller would do.
    out, _ = fused_result


 def fused_groupwise_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
    group_size: list[int],
 ):
    """Fused group-wise path: a single ``ops.rms_norm_per_block_quant`` call.

    The layer is used only as the holder of the norm weight, the epsilon is
    fixed at 1e-6, and the whole ``group_size`` list is forwarded to the op
    together with ``is_scale_transposed=True``. Nothing is returned - the
    function exists only to be timed.
    """
    norm_weight = rms_norm_layer.weight
    epsilon = 1e-6
    fused_result = ops.rms_norm_per_block_quant(
        x,
        norm_weight,
        epsilon,
        quant_dtype,
        group_size,
        residual=residual,
        is_scale_transposed=True,
    )
    # The op's result is unpacked as a pair, exactly as a caller would do.
    out, _ = fused_result


 # Bench functions
 def bench_fn(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: torch.Tensor,
    quant_dtype: torch.dtype,
    group_size: list[int],
    label: str,
    sub_label: str,
    fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
        "x": x,
        "residual": residual,
        "quant_dtype": quant_dtype,
        "group_size": group_size,
        "fn": fn,
    }
    return TBenchmark.Timer(
        stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
        stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
        globals=globals,
        label=label,
        sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.int8,
            params.group_size,
            label,
            sub_label,
            unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.float8_e4m3fn,
            params.group_size,
            label,
            sub_label,
            unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.int8,
            params.group_size,
            label,
            sub_label,
            fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.float8_e4m3fn,
            params.group_size,
            label,
            sub_label,
            fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
        )
    )

    # unfused groupwise fp8 impl.
    timers.append(
        bench_fn(
            layer,
            x,
            residual,
            torch.float8_e4m3fn,
            params.group_size,
            label,
            sub_label,
            unfused_groupwise_fp8_impl,
            "unfused_groupwise_fp8_impl",
        )
    )

    # fused groupwise fp8 impl.
    timers.append(
        bench_fn(
            layer,
            x,
            residual,
            torch.float8_e4m3fn,
            params.group_size,
            label,
            sub_label,
            fused_groupwise_impl,
            "fused_groupwise_fp8_impl",
        )
    )

    print_timers(timers)

    return timers
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
 topk_range = [1, 2, 8]
 configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
 ep_size_range = [1, 8]
 configs = list(
    itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
 )


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["num_tokens", "num_experts", "topk"],
        x_names=["num_tokens", "num_experts", "topk", "ep_size"],
        x_vals=configs,
        line_arg="provider",
        line_vals=["vllm"],
@@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
        args={},
    )
 )
 def benchmark(num_tokens, num_experts, topk, provider):
 def benchmark(num_tokens, num_experts, topk, ep_size, provider):
    """Benchmark function for Triton."""
    block_size = 256
    torch.cuda.manual_seed_all(0)
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)

    e_map = None
    if ep_size != 1:
        local_e = num_experts // ep_size
        e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
        e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
        e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)

    quantiles = [0.5, 0.2, 0.8]

    if provider == "vllm":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
            lambda: moe_align_block_size(
                topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
            ),
            quantiles=quantiles,
        )

--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        endif()

        # Build ACL with CMake
        set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
        set(CMAKE_BUILD_TYPE "Release")
        set(ARM_COMPUTE_ARCH "armv8.2-a")
        set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
        set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
        set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
        set(ARM_COMPUTE_ENABLE_OPENMP "ON")
        set(ARM_COMPUTE_ENABLE_WERROR "OFF")
        set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
        set(ARM_COMPUTE_BUILD_TESTING "OFF")

        set(_cmake_config_cmd
             ${CMAKE_COMMAND} -G Ninja -B build 
            -DARM_COMPUTE_BUILD_SHARED_LIB=OFF 
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
  input.casual = casual;
  input.isa = isa;
  input.enable_kv_split = enable_kv_split;
  TORCH_CHECK(casual, "Only supports casual mask for now.");

  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -186,7 +186,7 @@ struct AttentionMetadata {
 //  - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
 //  * q_tile_size * 4, partial output, max + sum (float)
 // Reduction scratchpad contains:
 //  - flags: bool array to indicate wether the split is finished
 //  - flags: bool array to indicate whether the split is finished
 //  - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
 //  - max, sum: 2 * split_num * q_tile_size * 4
 class AttentionScratchPad {
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -118,6 +118,24 @@
    }                                         \
  }

 // Turns the runtime boolean `expr` into a compile-time constant: the callable
 // passed as __VA_ARGS__ is invoked in whichever branch matches, with
 // `const_expr` declared in that branch as a constexpr bool holding the same
 // value. Expands to a bare if/else statement (not wrapped in do { } while).
 #define VLLM_DISPATCH_BOOL(expr, const_expr, ...) \
  if (expr) {                                     \
    constexpr bool const_expr = true;             \
    __VA_ARGS__();                                \
  } else {                                        \
    constexpr bool const_expr = false;            \
    __VA_ARGS__();                                \
  }

 // Turns the runtime `group_size` into a compile-time constant: the callable
 // passed as __VA_ARGS__ is invoked with `const_group_size` declared as a
 // constexpr int equal to the matched size. Only 128 and 64 are handled; any
 // other value fails a TORCH_CHECK instead of silently skipping the callable.
 // The macro argument is parenthesised so that expression arguments compare
 // as intended.
 #define VLLM_DISPATCH_GROUP_SIZE(group_size, const_group_size, ...) \
  if ((group_size) == 128) {                                        \
    constexpr int const_group_size = 128;                           \
    __VA_ARGS__();                                                  \
  } else if ((group_size) == 64) {                                  \
    constexpr int const_group_size = 64;                            \
    __VA_ARGS__();                                                  \
  } else {                                                          \
    TORCH_CHECK(false, "Unsupported group size: ", (group_size));   \
  }

 #define VLLM_DISPATCH_RANK234(NUM_DIMS, ...)                                   \
  switch (NUM_DIMS) {                                                          \
    case 2: {                                                                  \
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -444,23 +444,27 @@ __device__ inline T apply_sigmoid(T val) {
  return cuda_cast<T, float>(sigmoid_accurate(f));
 }

 template <typename T>
 // Applies the scoring function selected at compile time by SF: when SF is
 // SCORING_SIGMOID the value goes through apply_sigmoid, otherwise it is
 // returned unchanged. `if constexpr` means only the chosen branch is
 // instantiated for a given SF.
 template <ScoringFunc SF, typename T>
 __device__ inline T apply_scoring(T val) {
  if constexpr (SF == SCORING_SIGMOID) {
    return apply_sigmoid(val);
  } else {
    return val;
  }
 }

 template <typename T, ScoringFunc SF>
 __device__ void topk_with_k2(T* output, T const* input, T const* bias,
                             cg::thread_block_tile<32> const& tile,
                             int32_t const lane_id,
                             int const num_experts_per_group,
                             int const scoring_func) {
                             int const num_experts_per_group) {
  // Get the top2 per thread
  T largest = neg_inf<T>();
  T second_largest = neg_inf<T>();

  if (num_experts_per_group > WARP_SIZE) {
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
      T value = input[i];
      // Apply scoring function if needed
      if (scoring_func == SCORING_SIGMOID) {
        value = apply_sigmoid(value);
      }
      T value = apply_scoring<SF>(input[i]);
      value = value + bias[i];

      if (value > largest) {
@@ -472,11 +476,7 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
    }
  } else {
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
      T value = input[i];
      // Apply scoring function if needed
      if (scoring_func == SCORING_SIGMOID) {
        value = apply_sigmoid(value);
      }
      T value = apply_scoring<SF>(input[i]);
      value = value + bias[i];
      largest = value;
    }
@@ -501,13 +501,12 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
  }
 }

 template <typename T>
 template <typename T, ScoringFunc SF>
 __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
                                    int64_t const num_tokens,
                                    int64_t const num_cases,
                                    int64_t const n_group,
                                    int64_t const num_experts_per_group,
                                    int const scoring_func) {
                                    int64_t const num_experts_per_group) {
  int32_t warp_id = threadIdx.x / WARP_SIZE;
  int32_t lane_id = threadIdx.x % WARP_SIZE;

@@ -525,21 +524,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    asm volatile("griddepcontrol.wait;");
 #endif
    topk_with_k2(output, input, group_bias, tile, lane_id,
                 num_experts_per_group, scoring_func);
    topk_with_k2<T, SF>(output, input, group_bias, tile, lane_id,
                        num_experts_per_group);
  }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }

 template <typename T, typename IdxT>
 template <typename T, typename IdxT, ScoringFunc SF, int NGroup = -1>
 __global__ void group_idx_and_topk_idx_kernel(
    T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
    T const* bias, int64_t const num_tokens, int64_t const n_group,
    int64_t const topk_group, int64_t const topk, int64_t const num_experts,
    int64_t const num_experts_per_group, bool renormalize,
    double routed_scaling_factor, int scoring_func) {
    double routed_scaling_factor) {
  int32_t warp_id = threadIdx.x / WARP_SIZE;
  int32_t lane_id = threadIdx.x % WARP_SIZE;
  int32_t case_id =
@@ -549,6 +548,11 @@ __global__ void group_idx_and_topk_idx_kernel(
  topk_values += case_id * topk;
  topk_indices += case_id * topk;

  constexpr bool kUseStaticNGroup = (NGroup > 0);
  // use int32 to avoid implicit conversion
  int32_t const n_group_i32 =
      kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);

  int32_t align_num_experts_per_group =
      warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);

@@ -574,13 +578,14 @@ __global__ void group_idx_and_topk_idx_kernel(

  if (case_id < num_tokens) {
    // calculate group_idx
    int32_t target_num_min = WARP_SIZE - n_group + topk_group;
    int32_t target_num_min =
        WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
    // The check is necessary to avoid abnormal input
    if (lane_id < n_group && is_finite(group_scores[lane_id])) {
    if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
      value = group_scores[lane_id];
    }

    int count_equal_to_top_value = WARP_SIZE - n_group;
    int count_equal_to_top_value = WARP_SIZE - n_group_i32;
    int pre_count_equal_to_top_value = 0;
    // Use loop to find the largest top_group
    while (count_equal_to_top_value < target_num_min) {
@@ -604,7 +609,7 @@ __global__ void group_idx_and_topk_idx_kernel(
  int count_equalto_topkth_group = 0;
  bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
  if (case_id < num_tokens && if_proceed_next_topk) {
    for (int i_group = 0; i_group < n_group; i_group++) {
    auto process_group = [&](int i_group) {
      if ((group_scores[i_group] > topk_group_value) ||
          ((group_scores[i_group] == topk_group_value) &&
           (count_equalto_topkth_group < num_equalto_topkth_group))) {
@@ -613,11 +618,10 @@ __global__ void group_idx_and_topk_idx_kernel(
             i += WARP_SIZE) {
          T candidates = neg_inf<T>();
          if (i < num_experts_per_group) {
            // Apply scoring function (if any) and add bias
            // apply scoring function (if any) and add bias
            T input = scores[offset + i];
            if (is_finite(input)) {
              T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
                                                          : input;
              T score = apply_scoring<SF>(input);
              candidates = score + bias[offset + i];
            }
          }
@@ -627,6 +631,17 @@ __global__ void group_idx_and_topk_idx_kernel(
          count_equalto_topkth_group++;
        }
      }
    };

    if constexpr (kUseStaticNGroup) {
 #pragma unroll
      for (int i_group = 0; i_group < NGroup; ++i_group) {
        process_group(i_group);
      }
    } else {
      for (int i_group = 0; i_group < n_group_i32; ++i_group) {
        process_group(i_group);
      }
    }
    queue.done();
    __syncwarp();
@@ -646,12 +661,13 @@ __global__ void group_idx_and_topk_idx_kernel(
      if (i < topk) {
        // Load the score value (without bias) for normalization
        T input = scores[s_topk_idx[i]];
        value =
            (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
        value = apply_scoring<SF>(input);
        s_topk_value[i] = value;
      }
      topk_sum +=
          cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
      if (renormalize) {
        topk_sum +=
            cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
      }
    }
  }

@@ -660,13 +676,9 @@ __global__ void group_idx_and_topk_idx_kernel(
  if (case_id < num_tokens) {
    if (if_proceed_next_topk) {
      for (int i = lane_id; i < topk; i += WARP_SIZE) {
        float value;
        if (renormalize) {
          value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
                  routed_scaling_factor;
        } else {
          value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
        }
        float base = cuda_cast<float, T>(s_topk_value[i]);
        float value = renormalize ? (base / topk_sum * routed_scaling_factor)
                                  : (base * routed_scaling_factor);
        topk_indices[i] = s_topk_idx[i];
        topk_values[i] = value;
      }
@@ -684,6 +696,45 @@ __global__ void group_idx_and_topk_idx_kernel(
 #endif
 }

 // Launches group_idx_and_topk_idx_kernel with `config`. When n_group is 4, 8,
 // 16 or 32 the instantiation whose NGroup template argument equals n_group is
 // used; any other n_group uses the instantiation with the default NGroup.
 // All remaining arguments are passed through to the kernel unchanged.
 template <typename T, typename IdxT, ScoringFunc SF>
 inline void launch_group_idx_and_topk_kernel(
    cudaLaunchConfig_t const& config, T* scores, T* group_scores,
    float* topk_values, IdxT* topk_indices, T const* bias,
    int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
    int64_t const topk, int64_t const num_experts,
    int64_t const num_experts_per_group, bool const renormalize,
    double const routed_scaling_factor) {
  // Every instantiation takes the same argument list, so bind it once and
  // only vary the kernel pointer.
  auto const dispatch = [&](auto* kernel) {
    cudaLaunchKernelEx(&config, kernel, scores, group_scores, topk_values,
                       topk_indices, bias, num_tokens, n_group, topk_group,
                       topk, num_experts, num_experts_per_group, renormalize,
                       routed_scaling_factor);
  };

  if (n_group == 4) {
    dispatch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 4>);
  } else if (n_group == 8) {
    dispatch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 8>);
  } else if (n_group == 16) {
    dispatch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 16>);
  } else if (n_group == 32) {
    dispatch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 32>);
  } else {
    dispatch(&group_idx_and_topk_idx_kernel<T, IdxT, SF>);
  }
 }

 template <typename T, typename IdxT>
 void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
                   IdxT* topk_indices, T const* bias, int64_t const num_tokens,
@@ -694,7 +745,6 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
                   cudaStream_t const stream = 0) {
  int64_t num_cases = num_tokens * n_group;
  int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
  cudaLaunchConfig_t config;
  config.gridDim = topk_with_k2_num_blocks;
  config.blockDim = BLOCK_SIZE;
@@ -705,16 +755,33 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
  config.numAttrs = 1;
  config.attrs = attrs;
  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
                     num_tokens, num_cases, n_group, num_experts / n_group,
                     scoring_func);
  auto const sf = static_cast<ScoringFunc>(scoring_func);
  int64_t const num_experts_per_group = num_experts / n_group;
  auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
    cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
                       num_tokens, num_cases, n_group, num_experts_per_group);
  };
  switch (sf) {
    case SCORING_NONE: {
      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_NONE>;
      launch_topk_with_k2(kernel_instance1);
      break;
    }
    case SCORING_SIGMOID: {
      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_SIGMOID>;
      launch_topk_with_k2(kernel_instance1);
      break;
    }
    default:
      // should be guarded by higher level checks.
      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
  }

  int64_t topk_with_k_group_num_blocks =
      (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
  size_t dynamic_smem_in_bytes =
      warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
                                                           topk);
  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
  config.gridDim = topk_with_k_group_num_blocks;
  config.blockDim = BLOCK_SIZE;
  config.dynamicSmemBytes = dynamic_smem_in_bytes;
@@ -723,10 +790,24 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
  config.numAttrs = 1;
  config.attrs = attrs;
  cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
                     topk_values, topk_indices, bias, num_tokens, n_group,
                     topk_group, topk, num_experts, num_experts / n_group,
                     renormalize, routed_scaling_factor, scoring_func);
  switch (sf) {
    case SCORING_NONE: {
      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_NONE>(
          config, scores, group_scores, topk_values, topk_indices, bias,
          num_tokens, n_group, topk_group, topk, num_experts,
          num_experts_per_group, renormalize, routed_scaling_factor);
      break;
    }
    case SCORING_SIGMOID: {
      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_SIGMOID>(
          config, scores, group_scores, topk_values, topk_indices, bias,
          num_tokens, n_group, topk_group, topk, num_experts,
          num_experts_per_group, renormalize, routed_scaling_factor);
      break;
    }
    default:
      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
  }
 }

 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -14,7 +14,6 @@

 namespace vllm {
 namespace moe {

 namespace batched_moe_align_block_size {

 // Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
@@ -80,17 +79,32 @@ __global__ void batched_moe_align_block_size_kernel(
 }  // namespace batched_moe_align_block_size

 template <typename scalar_t>
 __global__ void moe_align_block_size_kernel(
 __device__ void _moe_align_block_size(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
    int32_t* __restrict__ total_tokens_post_pad,
    int32_t* __restrict__ expert_map, int32_t num_experts,
    int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) {
    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
    int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id,
    int32_t topk_num, int32_t* token_mask, bool has_expert_map) {
  extern __shared__ int32_t shared_counts[];

  // Initialize sorted_token_ids with numel
  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
    sorted_token_ids[it] = numel;
  // Compute input buffer offsets. Typically these will all be 0, except when
  // using Multi LoRA.
  int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
  int expert_ids_offset = max_num_m_blocks * model_offset;
  int cumsum_offset = (num_experts + 1) * model_offset;

  // Use separate threadblocks to fill sorted_token_ids.
  // This is safe since the current kernel does not use sorted_token_ids.
  if (blockIdx.x % 2) {
    // Initialize sorted_token_ids with numel
    for (size_t it = threadIdx.x; it < max_num_tokens_padded;
         it += blockDim.x) {
      sorted_token_ids[sorted_token_ids_offset + it] = numel;
    }
    return;
  }

  const int warp_id = threadIdx.x / WARP_SIZE;
@@ -112,9 +126,16 @@ __global__ void moe_align_block_size_kernel(
    if (expert_id >= num_experts) {
      continue;
    }
    if (has_expert_map) {
      expert_id = expert_map[expert_id];
      // filter invalid experts
      if (expert_id == -1) continue;
    }
    int warp_idx = expert_id / experts_per_warp;
    int expert_offset = expert_id % experts_per_warp;
    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset],
              mask);
  }

  __syncthreads();
@@ -135,48 +156,196 @@ __global__ void moe_align_block_size_kernel(
  int cumsum_val;
  BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
  if (expert_id <= num_experts) {
    cumsum[expert_id] = cumsum_val;
    cumsum[cumsum_offset + expert_id] = cumsum_val;
  }

  if (expert_id == num_experts) {
    *total_tokens_post_pad = cumsum_val;
    total_tokens_post_pad[model_offset] = cumsum_val;
  }

  __syncthreads();

  if (threadIdx.x < num_experts) {
    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
         i += block_size) {
      expert_ids[i / block_size] = threadIdx.x;
    for (int i = cumsum[cumsum_offset + threadIdx.x];
         i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) {
      expert_ids[expert_ids_offset + i / block_size] = threadIdx.x;
    }
  }

  // Fill remaining expert_ids with 0
  const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
  for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
    expert_ids[i] = 0;
  const size_t fill_start_idx =
      cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x;
  for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) {
    expert_ids[expert_ids_offset + i] = inactive_expert_id;
  }
 }

 // Aligns and sorts tokens by expert inside one thread block (only
 // threadIdx.x / blockDim.x are consulted).
 //
 // Threads with threadIdx.x < fill_threads only pre-fill sorted_token_ids with
 // `numel`. The remaining blockDim.x - fill_threads "worker" threads count the
 // entries per expert, build block_size-padded cumulative offsets, write
 // expert_ids for every block, and scatter the flat indices of topk_ids into
 // sorted_token_ids.
 //
 // Parameters, as used below:
 //   topk_ids:              `numel` expert ids, read by flat index.
 //   sorted_token_ids:      output, written from max_num_tokens_padded *
 //                          model_offset onwards.
 //   expert_ids:            output, written from max_num_m_blocks *
 //                          model_offset onwards.
 //   total_tokens_post_pad: output, entry [model_offset] gets the padded total.
 //   expert_map:            read only if has_expert_map; remaps an expert id,
 //                          and a mapped value of -1 drops the entry.
 //   token_mask:            may be nullptr; otherwise entry [i / topk_num]
 //                          decides whether flat index i is counted and placed.
 //   inactive_expert_id:    written to expert_ids for blocks past the padded
 //                          total.
 //
 // Dynamic shared memory layout: `cumsum` (num_experts + 1 ints) followed by
 // `tokens_cnts`, addressed as rows of num_experts ints (row 0 plus one row
 // per worker thread).
 template <typename scalar_t, int32_t fill_threads>
 __device__ void _moe_align_block_size_small_batch_expert(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int32_t* __restrict__ total_tokens_post_pad,
    int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
    size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks,
    int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num,
    int32_t* token_mask, bool has_expert_map) {
  // Compute input buffer offsets. Typically these will all be 0, except when
  // using Multi LoRA.
  int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
  int expert_ids_offset = max_num_m_blocks * model_offset;

  // Use an additional group of threads to fill sorted_token_ids.
  // Since the current kernel will use sorted_token_ids afterward,
  // we fill sorted_token_ids within the same threadblock to make
  // synchronization easier.
  if (threadIdx.x < fill_threads) {
    // Initialize sorted_token_ids with numel
    for (size_t it = threadIdx.x; it < max_num_tokens_padded;
         it += fill_threads) {
      sorted_token_ids[sorted_token_ids_offset + it] = numel;
    }
    // Three __syncthreads() corresponding to the other threads
    __syncthreads();
    __syncthreads();
    __syncthreads();
    return;
  }

  // Worker threads are renumbered from 0 so the fill threads do not leave
  // gaps in the per-thread counter rows.
  const size_t tid = threadIdx.x - fill_threads;
  const size_t stride = blockDim.x - fill_threads;

  extern __shared__ int32_t shared_mem[];
  int32_t* cumsum = shared_mem;
  int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);

  // Zero this worker's private counter row (row tid + 1).
  for (int i = 0; i < num_experts; ++i) {
    tokens_cnts[(tid + 1) * num_experts + i] = 0;
  }

  // Each worker strides over topk_ids and tallies into its own row. With a
  // null token_mask every entry adds 1; otherwise the mask value of token
  // i / topk_num is added.
  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
    if (has_expert_map) {
      expert_id = expert_map[expert_id];
      // filter invalid expert
      if (expert_id == -1) continue;
    }
    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
    tokens_cnts[(tid + 1) * num_experts + expert_id] += mask;
  }

  __syncthreads();

  // One worker per expert column: zero row 0, then accumulate down the rows
  // so that row r holds the count contributed by workers 0..r-1 and row
  // `stride` holds the column total.
  if (tid < num_experts) {
    tokens_cnts[tid] = 0;
    for (int i = 1; i <= stride; ++i) {
      tokens_cnts[i * num_experts + tid] +=
          tokens_cnts[(i - 1) * num_experts + tid];
    }
  }

  __syncthreads();

  // Worker 0 builds the running sum of per-expert totals, each rounded up to
  // a multiple of block_size, and publishes the padded grand total.
  if (tid == 0) {
    cumsum[0] = 0;
    for (int i = 1; i <= num_experts; ++i) {
      cumsum[i] =
          cumsum[i - 1] +
          CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) *
              block_size;
    }
    total_tokens_post_pad[model_offset] =
        static_cast<int32_t>(cumsum[num_experts]);
  }

  __syncthreads();

  // One worker per expert labels every block inside that expert's padded
  // range [cumsum[tid], cumsum[tid + 1]).
  if (tid < num_experts) {
    for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) {
      expert_ids[expert_ids_offset + i / block_size] = tid;
    }
  }

  // Fill remaining expert_ids with inactive_expert_id
  const size_t fill_start_idx = cumsum[num_experts] / block_size + tid;
  for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) {
    expert_ids[expert_ids_offset + i] = inactive_expert_id;
  }

  // Scatter pass: the destination is the start of the expert's padded range
  // plus the running count in row `tid`, which begins as the number of
  // entries counted by lower-numbered workers and is bumped after each
  // placement by this worker.
  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
    if (has_expert_map) {
      expert_id = expert_map[expert_id];
      // filter invalid expert
      if (expert_id == -1) continue;
    }
    int32_t rank_post_pad =
        tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id];

    if (token_mask == nullptr || token_mask[i / topk_num]) {
      sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;
      ++tokens_cnts[tid * num_experts + expert_id];
    }
  }
 }

 template <typename scalar_t>
 __global__ void count_and_sort_expert_tokens_kernel(
 __device__ void _count_and_sort_expert_tokens(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
    size_t numel, int32_t num_experts) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = blockDim.x * gridDim.x;
    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
    int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask,
    int32_t model_offset, int32_t topk_num, bool has_expert_map) {
  const size_t tid = blockIdx.y * blockDim.x + threadIdx.x;
  const size_t stride = blockDim.x * gridDim.y;

  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
    if (expert_id >= num_experts) {
      continue;
    }
    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
    sorted_token_ids[rank_post_pad] = i;

    if (has_expert_map) {
      expert_id = expert_map[expert_id];
      // filter invalid experts
      if (expert_id == -1) continue;
    }

    if (token_mask == nullptr || token_mask[i / topk_num]) {
      int32_t rank_post_pad = atomicAdd(
          &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);
      sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =
          i;
    }
  }
 }

 // Kernel entry point that forwards to _moe_align_block_size with
 // model_offset 0, inactive_expert_id 0, no token mask, and
 // max_num_m_blocks = CEILDIV(max_num_tokens_padded, block_size).
 template <typename scalar_t>
 __global__ void moe_align_block_size_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int32_t* __restrict__ total_tokens_post_pad,
    int32_t* __restrict__ expert_map, int32_t num_experts,
    int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
    int32_t topk_num, bool has_expert_map) {
  const int32_t max_num_m_blocks = CEILDIV(max_num_tokens_padded, block_size);
  constexpr int32_t kModelOffset = 0;
  constexpr int32_t kInactiveExpertId = 0;
  int32_t* const no_token_mask = nullptr;

  _moe_align_block_size(topk_ids, sorted_token_ids, expert_ids,
                        total_tokens_post_pad, expert_map, num_experts,
                        padded_num_experts, experts_per_warp, block_size, numel,
                        cumsum, max_num_tokens_padded, max_num_m_blocks,
                        kModelOffset, kInactiveExpertId, topk_num,
                        no_token_mask, has_expert_map);
 }

 // Kernel entry point that forwards to _count_and_sort_expert_tokens with no
 // token mask and model_offset 0.
 template <typename scalar_t>
 __global__ void count_and_sort_expert_tokens_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
    int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) {
  int32_t* const no_token_mask = nullptr;
  constexpr int32_t kModelOffset = 0;

  _count_and_sort_expert_tokens(topk_ids, sorted_token_ids, cumsum_buffer,
                                expert_map, numel, num_experts,
                                max_num_tokens_padded, no_token_mask,
                                kModelOffset, topk_num, has_expert_map);
 }

 template <typename scalar_t, int TOPK>
 __global__ void moe_sum_kernel(
    scalar_t* __restrict__ out,          // [..., d]
@@ -193,78 +362,111 @@ __global__ void moe_sum_kernel(
  }
 }

 template <typename scalar_t>
 template <typename scalar_t, int32_t fill_threads>
 __global__ void moe_align_block_size_small_batch_expert_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
    int32_t block_size, size_t numel, int32_t max_num_tokens_padded) {
  // Initialize sorted_token_ids with numel
  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
    sorted_token_ids[it] = numel;
  }

  const size_t tid = threadIdx.x;
  const size_t stride = blockDim.x;

  extern __shared__ int32_t shared_mem[];
  int32_t* cumsum = shared_mem;
  int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
    int32_t* __restrict__ total_tokens_post_pad,
    int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
    size_t numel, int32_t max_num_tokens_padded, int32_t topk_num,
    bool has_expert_map) {
  _moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
      num_experts, block_size, numel, max_num_tokens_padded,
      CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr,
      has_expert_map);
 }

  for (int i = 0; i < num_experts; ++i) {
    tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
 template <typename scalar_t>
 __global__ void moe_lora_align_block_size_kernel(
    scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping,
    int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
    int max_loras, size_t numel, int max_num_tokens_padded,
    int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
    int32_t* __restrict__ expert_ids, int32_t topk_num,
    int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
    int32_t* __restrict__ cumsum, int32_t experts_per_warp,
    int32_t padded_num_experts, int32_t* lora_ids,
    int32_t* __restrict__ token_mask, bool has_expert_map) {
  int lora_idx = blockIdx.x / 2;
  int lora_id = lora_ids[lora_idx];
  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
    return;
  }

  for (size_t i = tid; i < numel; i += stride) {
    ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
  // Populate the token_mask based on the token-LoRA mapping
  int num_tokens = numel / topk_num;
  if (threadIdx.x == 0) {
    total_tokens_post_pad[lora_id] = 0;

    for (int i = 0; i < num_tokens; i++) {
      token_mask[(lora_id * num_tokens) + i] =
          (int)token_lora_mapping[i] == lora_id;
    }
  }

  __syncthreads();

  if (threadIdx.x < num_experts) {
    tokens_cnts[threadIdx.x] = 0;
    for (int i = 1; i <= blockDim.x; ++i) {
      tokens_cnts[i * num_experts + threadIdx.x] +=
          tokens_cnts[(i - 1) * num_experts + threadIdx.x];
    }
  _moe_align_block_size(
      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
      num_experts, padded_num_experts, experts_per_warp, block_size, numel,
      cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num,
      &token_mask[(lora_id * num_tokens)], has_expert_map);
 }

 template <typename scalar_t>
 __global__ void lora_count_and_sort_expert_tokens_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
    int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask,
    int32_t* lora_ids, bool has_expert_map) {
  int lora_idx = blockIdx.x;
  int lora_id = lora_ids[lora_idx];
  if (lora_id == -1) {
    return;
  }

  __syncthreads();
  int num_tokens = numel / topk_num;

  if (threadIdx.x == 0) {
    cumsum[0] = 0;
    for (int i = 1; i <= num_experts; ++i) {
      cumsum[i] =
          cumsum[i - 1] +
          CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
              block_size;
    }
    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
  _count_and_sort_expert_tokens(
      topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
      max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id,
      topk_num, has_expert_map);
 }

 template <typename scalar_t, int32_t fill_threads>
 __global__ void moe_lora_align_block_size_small_batch_expert_kernel(
    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
    int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
    int max_loras, size_t numel, int max_num_tokens_padded,
    int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
    int32_t* __restrict__ expert_ids, int topk_num,
    int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids,
    int32_t* token_mask, bool has_expert_map) {
  int lora_idx = blockIdx.x;
  int lora_id = lora_ids[lora_idx];
  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
    return;
  }

  __syncthreads();
  int num_tokens = numel / topk_num;
  if (threadIdx.x == 0) {
    total_tokens_post_pad[lora_id] = 0;

  if (threadIdx.x < num_experts) {
    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
         i += block_size) {
      expert_ids[i / block_size] = threadIdx.x;
    for (int i = 0; i < num_tokens; i++) {
      token_mask[(lora_id * num_tokens) + i] =
          (int)token_lora_mapping[i] == lora_id;
    }
  }

  // Fill remaining expert_ids with 0
  const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
  for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
    expert_ids[i] = 0;
  }
  __syncthreads();

  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
    int32_t rank_post_pad =
        tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
    sorted_token_ids[rank_post_pad] = i;
    ++tokens_cnts[threadIdx.x * num_experts + expert_id];
  }
  _moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
      num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks,
      -1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)],
      has_expert_map);
 }

 }  // namespace moe
@@ -275,7 +477,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad) {
                          torch::Tensor num_tokens_post_pad,
                          std::optional<torch::Tensor> maybe_expert_map) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int64_t padded_num_experts =
@@ -287,14 +490,19 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
  // BlockScan uses 1024 threads and assigns one thread per expert.
  TORCH_CHECK(padded_num_experts < 1024,
              "padded_num_experts must be less than 1024");
  auto options_int =
      torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
  bool has_expert_map = maybe_expert_map.has_value();
  torch::Tensor expert_map;
  if (has_expert_map) {
    expert_map = maybe_expert_map.value();
  } else {
    expert_map = torch::empty({0}, options_int);
  }

  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
        // calc needed amount of shared mem for `cumsum` tensors
        auto options_int =
            torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
        torch::Tensor cumsum_buffer =
            torch::empty({num_experts + 1}, options_int);
        bool small_batch_expert_mode =
            (topk_ids.numel() < 1024) && (num_experts <= 64);

@@ -304,43 +512,58 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
              ((threads + 1) * num_experts + (num_experts + 1)) *
              sizeof(int32_t);

          // threadIdx.x >= fill_threads: counting experts and aligning
          // threadIdx.x < fill_threads: filling sorted_token_ids
          constexpr int32_t fill_threads = 256;
          auto small_batch_expert_kernel =
              vllm::moe::moe_align_block_size_small_batch_expert_kernel<
                  scalar_t>;
          small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
                  scalar_t, fill_threads>;
          small_batch_expert_kernel<<<1, fill_threads + threads,
                                      shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
              topk_ids.numel(), sorted_token_ids.size(0));
              num_tokens_post_pad.data_ptr<int32_t>(),
              expert_map.data_ptr<int32_t>(), num_experts, block_size,
              topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1),
              has_expert_map);
        } else {
          torch::Tensor cumsum_buffer =
              torch::empty({num_experts + 1}, options_int);
          auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;

          size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
          size_t shared_mem_size =
              num_warps * experts_per_warp * sizeof(int32_t);

          align_kernel<<<1, threads, shared_mem_size, stream>>>(
          // launch two threadblocks
          // blockIdx.x == 0: counting experts and aligning
          // blockIdx.x == 1: filling sorted_token_ids
          align_kernel<<<2, threads, shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
              padded_num_experts, experts_per_warp, block_size,
              topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>(),
              sorted_token_ids.size(0));
              num_tokens_post_pad.data_ptr<int32_t>(),
              expert_map.data_ptr<int32_t>(), num_experts, padded_num_experts,
              experts_per_warp, block_size, topk_ids.numel(),
              cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0),
              topk_ids.size(1), has_expert_map);

          const int block_threads = std::min(256, (int)threads);
          const int num_blocks =
              (topk_ids.numel() + block_threads - 1) / block_threads;
          const int max_blocks = 65535;
          const int actual_blocks = std::min(num_blocks, max_blocks);
          dim3 gridDims(1, actual_blocks);

          auto sort_kernel =
              vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
          sort_kernel<<<gridDims, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
              cumsum_buffer.data_ptr<int32_t>(), expert_map.data_ptr<int32_t>(),
              topk_ids.numel(), num_experts, sorted_token_ids.size(0),
              topk_ids.size(1), has_expert_map);
        }
      });
 }
@@ -414,3 +637,123 @@ void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
      break;
  }
 }

 // Host-side launcher for the LoRA-aware MoE block-alignment kernels.
 //
 // The work is dispatched on the element type of `topk_ids` and takes one of
 // two paths:
 //   * small-batch mode (topk_ids.numel() < 1024 and num_experts <= 64): a
 //     single kernel, one thread block per LoRA slot (grid = max_loras);
 //   * otherwise: an align kernel launched with two thread blocks per LoRA
 //     slot, followed by a count-and-sort kernel on a
 //     (max_loras, actual_blocks) grid.
 //
 // Parameters:
 //   topk_ids            integral tensor; size(1) is passed on as `topk_num`.
 //   token_lora_mapping  int32 tensor handed to the kernels unchanged.
 //   num_experts, block_size, max_loras, max_num_tokens_padded,
 //   max_num_m_blocks    scalar sizes forwarded to the kernels; block_size
 //                       must be > 0.
 //   sorted_token_ids, expert_ids, num_tokens_post_pad
 //                       int32 tensors passed as raw pointers
 //                       (presumably the kernel outputs - confirm against the
 //                       kernel definitions).
 //   adapter_enabled, lora_ids
 //                       int32 tensors handed to the kernels unchanged.
 //   maybe_expert_map    optional int32 tensor; when absent an empty tensor is
 //                       substituted and `has_expert_map` is passed as false.
 //
 // Raises (via TORCH_CHECK / AT_CUDA_CHECK) when block_size <= 0, when the
 // padded expert count is not below 1024, when the device attribute query
 // fails, or when the small-batch kernel needs more dynamic shared memory than
 // the device allows.
 void moe_lora_align_block_size(
    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
    int64_t num_experts, int64_t block_size, int64_t max_loras,
    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
    torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map) {
  const int topk_num = topk_ids.size(1);

  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");

  // The attribute query returns an error code; check it so that a failed
  // query is reported instead of leaving the limit uninitialized.
  int device_max_shared_mem = 0;
  auto dev = topk_ids.get_device();
  AT_CUDA_CHECK(cudaDeviceGetAttribute(
      &device_max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int64_t padded_num_experts =
      ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;

  // BlockScan uses 1024 threads and assigns one thread per expert.
  TORCH_CHECK(padded_num_experts < 1024,
              "padded_num_experts must be less than 1024");

  auto options_int =
      torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
  // Scratch buffer with one int32 per (LoRA slot, row of topk_ids).
  torch::Tensor token_mask =
      torch::empty({max_loras * topk_ids.size(0)}, options_int);
  // The kernels always receive an expert_map pointer; an empty tensor stands
  // in when the caller supplied none, and has_expert_map records which case
  // applies.
  bool has_expert_map = maybe_expert_map.has_value();
  torch::Tensor expert_map;
  if (has_expert_map) {
    expert_map = maybe_expert_map.value();
  } else {
    expert_map = torch::empty({0}, options_int);
  }

  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
        bool small_batch_expert_mode =
            (topk_ids.numel() < 1024) && (num_experts <= 64);

        if (small_batch_expert_mode) {
          const int32_t num_thread = max((int32_t)num_experts, 128);
          // (num_thread + 1) rows of num_experts counters plus a
          // (num_experts + 1)-entry cumsum, all int32.
          const int32_t shared_mem =
              (num_thread + 1) * num_experts * sizeof(int32_t) +
              (num_experts + 1) * sizeof(int32_t);
          TORCH_CHECK(shared_mem <= device_max_shared_mem,
                      "Shared memory usage exceeds device limit.");

          // threadIdx.x >= fill_threads: counting experts and aligning
          // threadIdx.x < fill_threads: filling sorted_token_ids
          constexpr int32_t fill_threads = 256;

          dim3 blockDim(num_thread + fill_threads);
          auto kernel =
              vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel<
                  scalar_t, fill_threads>;
          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
              (void*)kernel, shared_mem));
          kernel<<<max_loras, blockDim, shared_mem, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              token_lora_mapping.data_ptr<int32_t>(), block_size,
              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
              sorted_token_ids.data_ptr<int32_t>(),
              expert_ids.data_ptr<int32_t>(), topk_num,
              num_tokens_post_pad.data_ptr<int32_t>(),
              adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>(),
              token_mask.data_ptr<int32_t>(), has_expert_map);
        } else {
          int num_thread = 1024;
          dim3 blockDim(num_thread);
          size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE);

          size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t);

          // cumsum buffer: (num_experts + 1) zero-initialized int32 entries
          // per LoRA slot, shared between the two kernels below.
          torch::Tensor cumsum =
              torch::zeros({max_loras * (num_experts + 1)}, options_int);

          auto align_kernel =
              vllm::moe::moe_lora_align_block_size_kernel<scalar_t>;

          // launch two threadblocks for each lora
          // blockIdx.x % 2 == 0: counting experts and aligning
          // blockIdx.x % 2 == 1: filling sorted_token_ids
          align_kernel<<<max_loras * 2, blockDim, shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              token_lora_mapping.data_ptr<int32_t>(), block_size,
              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
              sorted_token_ids.data_ptr<int32_t>(),
              expert_ids.data_ptr<int32_t>(), topk_num,
              num_tokens_post_pad.data_ptr<int32_t>(),
              adapter_enabled.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
              WARP_SIZE, padded_num_experts, lora_ids.data_ptr<int32_t>(),
              token_mask.data_ptr<int32_t>(), has_expert_map);

          const int block_threads = std::min(256, (int)num_thread);
          const int num_blocks =
              (topk_ids.numel() + block_threads - 1) / block_threads;

          // Cap the second grid dimension at 65535 blocks.
          const int max_blocks = 65535;
          const int actual_blocks = std::min(num_blocks, max_blocks);

          dim3 gridDims(max_loras, actual_blocks);
          auto sort_kernel =
              vllm::moe::lora_count_and_sort_expert_tokens_kernel<scalar_t>;

          sort_kernel<<<gridDims, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
              expert_map.data_ptr<int32_t>(), topk_ids.numel(), num_experts,
              max_num_tokens_padded, topk_num, token_mask.data_ptr<int32_t>(),
              lora_ids.data_ptr<int32_t>(), has_expert_map);
        }
      });
 }
--- a/csrc/moe/moe_lora_align_sum_kernels.cu
+++ b/csrc/moe/moe_lora_align_sum_kernels.cu
@@ -1,174 +0,0 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

 #include <ATen/ATen.h>
 #include <ATen/cuda/Atomic.cuh>

 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"
 #include "core/math.hpp"

 namespace {

 // Maps a (row, col) coordinate of a row-major table with `total_col` columns
 // to its flat offset.
 __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
                                         int32_t col) {
  const int32_t row_start = row * total_col;
  return row_start + col;
 }

 }  // namespace

 // TODO: Refactor common parts with moe_align_sum_kernels
 //
 // Groups the entries of `topk_ids` that belong to one LoRA adapter by expert
 // and pads every expert's group to a multiple of `block_size`.
 //
 // Launch layout: one thread block per entry of `lora_ids` (blockIdx.x selects
 // the entry). A block whose entry is -1, or whose adapter is flagged 0 in
 // `adapter_enabled`, returns immediately without writing anything.
 //
 // Dynamic shared memory, as carved up below:
 //   cumsum      : num_experts + 1 int32 values
 //   tokens_cnts : token_cnts_t table with num_experts columns, addressed
 //                 through index(); rows 0..blockDim.x are used
 //
 // Outputs, all addressed by lora_id:
 //   sorted_token_ids[lora_id * max_num_tokens_padded + ...]
 //       flat indices into topk_ids grouped by expert; slots that receive no
 //       entry keep the initial value `numel`
 //   expert_ids[lora_id * max_num_m_blocks + ...]
 //       expert owning each block of `block_size` slots; blocks that are not
 //       assigned keep the initial value -1
 //   total_tokens_post_pad[lora_id]
 //       padded slot count summed over all experts
 //
 // NOTE(review): values read from topk_ids are used directly as column indices
 // into tokens_cnts/cumsum without a range check - presumably callers
 // guarantee 0 <= id < num_experts; confirm.
 template <typename scalar_t, typename token_cnts_t>
 __global__ void moe_lora_align_sum_kernel(
    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
    int64_t block_size, int num_experts, int max_loras, size_t numel,
    int max_num_tokens_padded, int max_num_m_blocks,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
    int32_t* lora_ids) {
  // Each thread owns a contiguous slice of topk_ids that is tokens_per_thread
  // entries long and starts at start_idx.
  const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
  const size_t start_idx = threadIdx.x * tokens_per_thread;

  int lora_idx = blockIdx.x;
  int lora_id = lora_ids[lora_idx];
  // The condition depends only on blockIdx.x, so either every thread of the
  // block returns here or none does.
  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
    return;
  }
  extern __shared__ int32_t shared_mem[];
  int32_t* cumsum = shared_mem;
  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);

  // Initialize sorted_token_ids with numel
  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
    sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
  }

  // Initialize expert_ids with -1
  for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
    expert_ids[lora_id * max_num_m_blocks + it] = -1;
  }

  // Initialize total_tokens_post_pad with 0
  if (threadIdx.x == 0) {
    total_tokens_post_pad[lora_id] = 0;
  }

  // Zero this thread's own row (row threadIdx.x + 1) of the counter table.
  for (int i = 0; i < num_experts; ++i) {
    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
  }

  // Count, per expert, the entries of this thread's slice whose token is
  // mapped to this LoRA. `mask` is 1 for a match and 0 otherwise, so entries
  // of other adapters add nothing.
  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
    int mask = token_lora_mapping[i / topk_num] == lora_id;
    int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
    tokens_cnts[idx] += mask;
  }

  // All per-thread rows must be complete before they are summed across rows.
  __syncthreads();

  // For each expert we accumulate the token counts from the different threads.
  // After this loop row i of column e holds the number of matching entries in
  // the slices of threads 0..i-1, and row blockDim.x holds the expert's total.
  if (threadIdx.x < num_experts) {
    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
    for (int i = 1; i <= blockDim.x; ++i) {
      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
    }
  }

  __syncthreads();

  // We accumulate the token counts of all experts in thread 0.
  // Each expert's total is rounded up to a multiple of block_size first, so
  // cumsum[e] is the first padded slot of expert e.
  if (threadIdx.x == 0) {
    cumsum[0] = 0;
    for (int i = 1; i <= num_experts; ++i) {
      cumsum[i] = cumsum[i - 1] +
                  div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
                           block_size) *
                      block_size;
    }
    total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
  }

  // cumsum is read by every thread below.
  __syncthreads();

  /**
   * For each expert, each thread processes the tokens of the corresponding
   * blocks and stores the corresponding expert_id for each block.
   */
  if (threadIdx.x < num_experts) {
    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
         i += block_size) {
      expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
          threadIdx.x;
    }
  }

  // Scatter pass: walk the same slice again and place every matching entry
  // into its expert's range of sorted_token_ids.
  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
    int32_t expert_id = topk_ids[i];
    /** The cumsum[expert_id] stores the starting index of the tokens that the
     * expert with expert_id needs to process, and
     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
     * processed by the expert with expert_id within the current thread's token
     * shard.
     */
    int32_t rank_post_pad =
        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
        cumsum[expert_id];

    // The slot was initialized to numel, so adding (i - numel) leaves exactly
    // i when mask is 1; when mask is 0 the add contributes 0 and the running
    // offset below does not advance.
    int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
    atomicAdd(
        &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
        (i - numel) * mask);
    tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
  }
 }

 // Host entry point that launches moe_lora_align_sum_kernel with one thread
 // block per LoRA slot (grid = max_loras) on the current CUDA stream.
 //
 // The block size is max(num_experts, 128) threads and must not exceed 1024.
 // Dynamic shared memory holds (num_thread + 1) rows of num_experts int32
 // counters plus a (num_experts + 1)-entry int32 cumsum.
 //
 // Parameters:
 //   topk_ids            integral tensor; size(1) is passed on as `topk_num`.
 //   token_lora_mapping  int32 tensor handed to the kernel unchanged.
 //   num_experts, block_size, max_loras, max_num_tokens_padded,
 //   max_num_m_blocks    scalar sizes forwarded to the kernel; block_size must
 //                       be > 0.
 //   sorted_token_ids, expert_ids, num_tokens_post_pad
 //                       int32 tensors written by the kernel.
 //   adapter_enabled, lora_ids
 //                       int32 tensors read by the kernel.
 //
 // Raises (via TORCH_CHECK / AT_CUDA_CHECK) when block_size <= 0, when more
 // than 1024 threads would be required, when the device attribute query fails,
 // or when the required shared memory exceeds the device's opt-in limit.
 void moe_lora_align_block_size(
    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
    int64_t num_experts, int64_t block_size, int64_t max_loras,
    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
    torch::Tensor lora_ids) {
  const int topk_num = topk_ids.size(1);

  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");

  // The attribute query returns an error code; check it so that a failed
  // query is reported instead of leaving the limit uninitialized.
  int device_max_shared_mem = 0;
  auto dev = topk_ids.get_device();
  AT_CUDA_CHECK(cudaDeviceGetAttribute(
      &device_max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
  TORCH_CHECK(num_thread <= 1024,
              "num_thread must be less than 1024, "
              "and fallback is not implemented yet.");
  const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
                             (num_experts + 1) * sizeof(int32_t);

  TORCH_CHECK(shared_mem <= device_max_shared_mem,
              "Shared memory usage exceeds device limit, and global memory "
              "fallback is not implemented yet.");

  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
        dim3 blockDim(num_thread);
        auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
        // Raise the kernel's dynamic shared memory cap to the requested size
        // before launching.
        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
            (void*)kernel, shared_mem));
        kernel<<<max_loras, blockDim, shared_mem, stream>>>(
            topk_ids.data_ptr<scalar_t>(),
            token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
            max_loras, topk_ids.numel(), max_num_tokens_padded,
            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
            expert_ids.data_ptr<int32_t>(), topk_num,
            num_tokens_post_pad.data_ptr<int32_t>(),
            adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
      });
 }
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -11,7 +11,8 @@ void moe_sum(torch::Tensor& input, torch::Tensor& output);
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);
                          torch::Tensor num_tokens_post_pad,
                          std::optional<torch::Tensor> maybe_expert_map);

 void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                  int64_t block_size,
@@ -26,7 +27,7 @@ void moe_lora_align_block_size(
    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
    torch::Tensor lora_ids);
    torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -19,7 +19,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "moe_align_block_size(Tensor topk_ids, int num_experts,"
      "                     int block_size, Tensor! sorted_token_ids,"
      "                     Tensor! experts_ids,"
      "                     Tensor! num_tokens_post_pad) -> ()");
      "                     Tensor! num_tokens_post_pad,"
      "                     Tensor? maybe_expert_map) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

  // Aligning the number of tokens to be processed by each expert such
@@ -46,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "                     Tensor !experts_ids,"
      "                     Tensor !num_tokens_post_pad,"
      "                     Tensor !adapter_enabled,"
      "                     Tensor !lora_ids) -> () ");
      "                     Tensor !lora_ids,"
      "                     Tensor? maybe_expert_map) -> () ");
  m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);

 #ifndef USE_ROCM
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -102,13 +102,16 @@ void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& output_mask,
                                 const torch::Tensor& repetition_penalties);

 void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                   const torch::Tensor& rowEnds, torch::Tensor& indices,
                   int64_t numRows, int64_t stride0, int64_t stride1);
 void top_k_per_row_prefill(const torch::Tensor& logits,
                           const torch::Tensor& rowStarts,
                           const torch::Tensor& rowEnds, torch::Tensor& indices,
                           int64_t numRows, int64_t stride0, int64_t stride1,
                           int64_t topK);

 void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
                          const torch::Tensor& seq_lens, torch::Tensor& indices,
                          int64_t numRows, int64_t stride0, int64_t stride1);
                          const torch::Tensor& seqLens, torch::Tensor& indices,
                          int64_t numRows, int64_t stride0, int64_t stride1,
                          int64_t topK);

 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                               torch::Tensor& weight, torch::Tensor& scale,
@@ -128,6 +131,13 @@ void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
                                      std::optional<torch::Tensor> scale_ub,
                                      std::optional<torch::Tensor> residual);

 void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
                              torch::Tensor const& weight,
                              torch::Tensor& scales, double const epsilon,
                              std::optional<torch::Tensor> scale_ub,
                              std::optional<torch::Tensor> residual,
                              int64_t group_size, bool is_scale_transposed);

 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      std::optional<torch::Tensor> key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);
@@ -252,7 +262,8 @@ void get_cutlass_moe_mm_data(
 void get_cutlass_moe_mm_problem_sizes(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
    std::optional<bool> force_swap_ab = std::nullopt);

 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                  torch::Tensor& problem_sizes1,
@@ -299,6 +310,14 @@ void per_token_group_quant_int8(const torch::Tensor& input,
                                torch::Tensor& output_q,
                                torch::Tensor& output_s, int64_t group_size,
                                double eps, double int8_min, double int8_max);

 // Fused activation quantisation + DeepGEMM-compatible UE8M0-packed scales.
 void per_token_group_quant_8bit_packed(const torch::Tensor& input,
                                       torch::Tensor& output_q,
                                       torch::Tensor& output_s_packed,
                                       int64_t group_size, double eps,
                                       double min_8bit, double max_8bit);

 #endif

 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
--- a/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
+++ b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
@@ -0,0 +1,104 @@
 // see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
 #pragma once

 #include <cuda.h>
 #include <torch/all.h>
 #include <c10/cuda/CUDAStream.h>

 #include "core/scalar_type.hpp"
 #include "cutlass/bfloat16.h"
 #include "cutlass/float8.h"

 // Fills the per-expert pointer tables consumed by the grouped GEMM: for each
 // expert, where its A rows, B weights, output rows and scales begin.
 //
 // Each thread handles the expert whose index equals its threadIdx.x.
 // expert_offsets[e] is scaled by k for A and by n for the output.
 //
 // ElementB is int32 (packed int4)
 // ElementGroupScale is cutlass::Array<cutlass::float_e4m3_t, 8> (packed fp8)
 template <typename ElementA, typename ElementB, typename ElementC,
          typename ElementAccumulator, typename ElementGroupScale>
 __global__ void get_group_gemm_starts(
    int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets,
    ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
    ElementAccumulator** b_scales_offsets,
    ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int,
    ElementB* b_base_as_int, ElementC* out_base_as_int,
    ElementAccumulator* a_scales_base_as_int,
    ElementAccumulator* b_scales_base_as_int,
    ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k,
    int64_t scale_k) {
  constexpr int pack_factor = 8;  // pack 8 int4 into int32

  const int expert_id = threadIdx.x;
  const int64_t first_row = expert_offsets[expert_id];

  // w4a8 specific: packed int4 weights and packed fp8 group scales
  b_group_scales_offsets[expert_id] =
      b_group_scales_base_as_int + (expert_id * scale_k * n);
  b_offsets[expert_id] = b_base_as_int + (expert_id * k * n / pack_factor);

  // same as w8a8: activations, output and the remaining scales
  b_scales_offsets[expert_id] = b_scales_base_as_int + (n * expert_id);
  a_scales_offsets[expert_id] = a_scales_base_as_int + first_row;
  out_offsets[expert_id] = out_base_as_int + first_row * n;
  a_offsets[expert_id] = a_base_as_int + first_row * k;
 }

 // Expands to one `else if` branch of an output-dtype dispatch chain, so it
 // must directly follow an `if`/`else if` block.
 //
 //   TENSOR_C_TYPE  torch dtype compared against out_tensors.dtype()
 //   C_TYPE         element type used for the output pointers in the kernel
 //
 // When the dtype matches, get_group_gemm_starts is launched as a single block
 // of `num_experts` threads on `stream`. The expansion refers to names that
 // must exist at the use site: expert_offsets, a_ptrs, b_ptrs, out_ptrs,
 // a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs, a_tensors, b_tensors,
 // out_tensors, a_scales, b_scales, b_group_scales, num_experts, stream, n, k
 // and scale_k.
 #define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE)                  \
  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                       \
    get_group_gemm_starts<cutlass::float_e4m3_t, int32_t, C_TYPE, float, \
                          cutlass::Array<cutlass::float_e4m3_t, 8>>      \
        <<<1, num_experts, 0, stream>>>(                                 \
            static_cast<int64_t*>(expert_offsets.data_ptr()),            \
            static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),     \
            static_cast<int32_t**>(b_ptrs.data_ptr()),                   \
            static_cast<C_TYPE**>(out_ptrs.data_ptr()),                  \
            static_cast<float**>(a_scales_ptrs.data_ptr()),              \
            static_cast<float**>(b_scales_ptrs.data_ptr()),              \
            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>**>(     \
                b_group_scales_ptrs.data_ptr()),                         \
            static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),   \
            static_cast<int32_t*>(b_tensors.data_ptr()),                 \
            static_cast<C_TYPE*>(out_tensors.data_ptr()),                \
            static_cast<float*>(a_scales.data_ptr()),                    \
            static_cast<float*>(b_scales.data_ptr()),                    \
            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>*>(      \
                b_group_scales.data_ptr()),                              \
            n, k, scale_k);                                              \
  }

 namespace {

 // Validates the grouped-GEMM tensors and launches get_group_gemm_starts to
 // fill the per-expert pointer tables (a_ptrs, b_ptrs, out_ptrs,
 // a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs).
 //
 // Expected dtypes (enforced below): a_tensors fp8 e4m3, b_tensors int32,
 // a_scales / b_scales float32, b_group_scales fp8 e4m3, out_tensors bf16,
 // expert_offsets int64.
 //
 // n is taken from out_tensors.size(1), k from a_tensors.size(1), and the
 // number of scale groups along k is ceil(k / b_group_size).
 //
 // Raises (via TORCH_CHECK) on a dtype mismatch, a non-positive b_group_size,
 // or an expert count that cannot be used as a CUDA block size.
 void run_get_group_gemm_starts(
    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
    torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors,
    torch::Tensor const& b_tensors, torch::Tensor& out_tensors,
    torch::Tensor const& a_scales, torch::Tensor const& b_scales,
    torch::Tensor const& b_group_scales, const int64_t b_group_size) {
  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);  // int4 8x packed into int32
  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
  TORCH_CHECK(b_group_scales.dtype() ==
              torch::kFloat8_e4m3fn);  // the underlying torch type is e4m3
  TORCH_CHECK(out_tensors.dtype() ==
              torch::kBFloat16);  // only support bf16 for now
  // expect int64_t to avoid overflow during offset calculations
  TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);
  // b_group_size is the divisor of the ceil_div below; zero would be an
  // integer division by zero on the host.
  TORCH_CHECK(b_group_size > 0, "b_group_size must be positive, got ",
              b_group_size);

  int num_experts = static_cast<int>(expert_offsets.size(0));
  // The kernel runs as a single block with one thread per expert, so the
  // expert count must be a valid block size; otherwise the launch would fail
  // without reporting an error here.
  TORCH_CHECK(num_experts > 0 && num_experts <= 1024,
              "number of experts must be in [1, 1024], got ", num_experts);
  // logical k, n
  int64_t n = out_tensors.size(1);
  int64_t k = a_tensors.size(1);
  int64_t scale_k = cutlass::ceil_div(k, b_group_size);

  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

  // Output-dtype dispatch chain; each macro use adds one `else if` branch.
  // The float16 branch is currently unreachable because of the bf16-only
  // check above.
  if (false) {
  }
  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
  __CALL_GET_STARTS_KERNEL(torch::kFloat16, half)
  else {
    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
  }
 }

 }  // namespace
--- a/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
@@ -0,0 +1,483 @@
 #include <vector>
 #include <tuple>

 #include "cutlass/cutlass.h"

 #include "cute/tensor.hpp"
 #include "cutlass/gemm/dispatch_policy.hpp"
 #include "cutlass/gemm/group_array_problem_shape.hpp"
 #include "cutlass/gemm/collective/collective_builder.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"
 #include "cutlass/gemm/device/gemm_universal_adapter.h"

 #include "cutlass/util/packed_stride.hpp"
 #include "cutlass/util/mixed_dtype_utils.hpp"

 // vllm includes
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 #include "cutlass_extensions/torch_utils.hpp"
 #include "cutlass_extensions/common.hpp"

 #include "core/registration.h"
 #include "get_group_starts.cuh"
 #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 #include "w4a8_utils.cuh"

 namespace vllm::cutlass_w4a8_moe {

 using namespace cute;

 // -------------------------------------------------------------------------------------
 // Static configuration shared across all instantiations
 // -------------------------------------------------------------------------------------
 // Grouped problem shape: one <M,N,K> triple of ints per group.
 using ProblemShape =
    cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;  // <M,N,K> per
                                                             // group
 // MmaType is the fp8 (e4m3) element type used for operand A; QuantType is the
 // 4-bit integer element type used for operand B.
 using MmaType = cutlass::float_e4m3_t;
 using QuantType = cutlass::int4b_t;

 // K extent of a tile, in MmaType elements: 128 * 8 bits (i.e. 128 bytes)
 // divided by the bit width of one MmaType element.
 constexpr int TileShapeK = 128 * 8 / sizeof_bits<MmaType>::value;
 static int constexpr PackFactor = 8;  // 8 int4 packed into int32

 // A matrix configuration
 using ElementA = MmaType;
 using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
 constexpr int AlignmentA =
    128 /
    cutlass::sizeof_bits<ElementA>::value;  // Alignment of A matrix in units of
                                            // elements (up to 16 bytes)

 // B matrix configuration
 using ElementB = QuantType;  // Element type for B matrix operand
 using LayoutB =
    cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
 constexpr int AlignmentB =
    128 / cutlass::sizeof_bits<
              ElementB>::value;  // Memory access granularity/alignment of B
                                 // matrix in units of elements (up to 16 bytes)

 // The A and B operands are swapped and transposed by hand when the kernel
 // arguments are built, so keep the transposed forms of the input layouts.
 using LayoutA_Transpose =
    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
 using LayoutB_Transpose =
    typename cutlass::layout::LayoutTranspose<LayoutB>::type;

 // Need to pass a pointer type to make the 3rd dimension of Stride be _0
 using StrideA =
    cute::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutA*>>;
 using StrideB =
    cute::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;

 // Define the CuTe layout for the reordered quantized tensor B.
 // LayoutAtomQuant places values that will be read by the same thread in
 // contiguous locations in global memory. It specifies the reordering within a
 // single warp's fragment
 using LayoutAtomQuant =
    decltype(cutlass::compute_memory_reordering_atom<MmaType>());
 using LayoutB_Reordered = decltype(cute::tile_to_shape(
    LayoutAtomQuant{}, Layout<Shape<int, int, Int<1>>, StrideB>{}));

 // Group scales for B are stored as fp8 (e4m3) in a row-major layout.
 using ElementScale = cutlass::float_e4m3_t;
 using LayoutScale = cutlass::layout::RowMajor;

 // C/D matrix configuration
 using ElementC =
    cutlass::bfloat16_t;  // Element type for C and D matrix operands
 using LayoutC =
    cutlass::layout::RowMajor;  // Layout type for C and D matrix operands
 constexpr int AlignmentC =
    128 / cutlass::sizeof_bits<
              ElementC>::value;  // Memory access granularity/alignment of C
                                 // matrix in units of elements (up to 16 bytes)

 // D matrix configuration (same element type and layout as C)
 using ElementD = ElementC;
 using LayoutD = LayoutC;
 constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

 // Core kernel configurations
 using ElementAccumulator = float;     // Element type for internal accumulation
 using ArchTag = cutlass::arch::Sm90;  // Tag indicating the minimum SM that
                                       // supports the intended feature
 using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
 using StageCountType =
    cutlass::gemm::collective::StageCountAuto;  // Stage count maximized based
                                                // on the tile size

 // per-channel and per-token scales for epilogue
 using ElementSChannel = float;

 template <class TileShape_MN, class ClusterShape_MNK, class KernelSchedule,
          class EpilogueSchedule>
 struct W4A8GroupedGemmKernel {
  using TileShape =
      decltype(cute::append(TileShape_MN{}, cute::Int<TileShapeK>{}));
  using ClusterShape = ClusterShape_MNK;

  // per-channel, per-token scales epilogue
  using ChTokScalesEpilogue =
      typename vllm::c3x::ScaledEpilogueArray<ElementAccumulator, ElementD,
                                              TileShape>;
  using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, TileShape, ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
          ElementSChannel, ElementC,
          typename cutlass::layout::LayoutTranspose<LayoutC>::type*, AlignmentC,
          ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type*,
          AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp;

  // =========================================================== MIXED INPUT
  // WITH SCALES
  // ===========================================================================
  // The Scale information must get paired with the operand that will be scaled.
  // In this example, B is scaled so we make a tuple of B's information and the
  // scale information.
  using CollectiveMainloopShuffled =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass,
          cute::tuple<ElementB, cutlass::Array<ElementScale, 8>>,
          LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*,
          AlignmentA, ElementAccumulator, TileShape, ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          KernelSchedule>::CollectiveOp;

  using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
      ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>;

  using GemmShuffled =
      cutlass::gemm::device::GemmUniversalAdapter<GemmKernelShuffled>;

  using StrideC = typename GemmKernelShuffled::InternalStrideC;
  using StrideD = typename GemmKernelShuffled::InternalStrideD;

  using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
  using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;
  using StrideS = typename CollectiveMainloopShuffled::StrideScale;
  using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;

  // static asserts for passing in strides/layouts
  // pack to 2x int64
  static_assert(sizeof(StrideS) == 2 * sizeof(int64_t));
  // pack to 3xint32,
  static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0,
                "LayoutB_Reordered size must be divisible by 4 bytes");

  static void grouped_mm(
      torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
      const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
      const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
      const int64_t b_group_size, const torch::Tensor& expert_offsets,
      const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides,
      const torch::Tensor& b_strides, const torch::Tensor& c_strides,
      const torch::Tensor& group_scale_strides) {
    auto device = a_tensors.device();
    auto device_id = device.index();
    const at::cuda::OptionalCUDAGuard device_guard(device);
    auto stream = at::cuda::getCurrentCUDAStream(device_id);

    int num_experts = static_cast<int>(expert_offsets.size(0));
    int n = static_cast<int>(b_tensors.size(1));
    int k = static_cast<int>(b_tensors.size(2)) * PackFactor;

    auto options_int =
        torch::TensorOptions().dtype(torch::kInt64).device(device);
    torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
    torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
    torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
    torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
    torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
    torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int);

    // get the correct offsets to pass to gemm
    run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs,
                              a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs,
                              a_tensors, b_tensors, out_tensors, a_scales,
                              b_scales, b_group_scales, b_group_size);

    // construct args
    using Args = typename GemmShuffled::Arguments;
    using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
    using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
    Args arguments;

    ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
        static_cast<ProblemShape::UnderlyingProblemShape*>(
            problem_sizes_torch.data_ptr());
    ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};

    // SwapAB so B operands come first
    MainloopArguments mainloop_arguments{
        static_cast<const QuantType**>(b_ptrs.data_ptr()),
        static_cast<LayoutB_Reordered*>(b_strides.data_ptr()),
        static_cast<const MmaType**>(a_ptrs.data_ptr()),
        static_cast<StrideA*>(a_strides.data_ptr()),
        static_cast<const cutlass::Array<ElementScale, 8>**>(
            b_group_scales_ptrs.data_ptr()),
        static_cast<StrideS*>(group_scale_strides.data_ptr()),
        static_cast<int>(b_group_size)};

    EpilogueArguments epilogue_arguments{
        // since we are doing SwapAB the channel scales comes first, then token
        // scales
        ChTokScalesEpilogue::prepare_args(  // see ScaledEpilogueArray
            static_cast<const ElementAccumulator**>(
                b_scales_ptrs.data_ptr()),  // per-channel
            static_cast<const ElementAccumulator**>(
                a_scales_ptrs.data_ptr()),  // per-token
            true, true),
        nullptr,                                       // C
        static_cast<StrideC*>(c_strides.data_ptr()),   // C
        static_cast<ElementD**>(out_ptrs.data_ptr()),  // D
        static_cast<StrideC*>(c_strides.data_ptr())    // D
    };

    static const cutlass::KernelHardwareInfo hw_info{
        device_id,
        cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
            device_id)};

    arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape,
                     mainloop_arguments, epilogue_arguments, hw_info};

    // Allocate workspace
    size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
    torch::Tensor workspace =
        torch::empty(workspace_size,
                     torch::TensorOptions().dtype(torch::kU8).device(device));

    // Run GEMM
    GemmShuffled gemm;
    CUTLASS_CHECK(gemm.can_implement(arguments));
    CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
    CUTLASS_CHECK(gemm.run(stream));
  }
 };

 // ----------------------------------------------------------------------------
 // Kernel instantiations and dispatch logic
 // ----------------------------------------------------------------------------
 // Mainloop / epilogue schedule tags shared by every instantiation below.
 using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
 using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;

 // Naming convention: Kernel_<TileShape>_<ClusterShape>_<Schedule>.
 // <TileShape> is the two-entry tile shape handed to W4A8GroupedGemmKernel
 // (TileShapeK is appended inside the struct) and <ClusterShape> is the
 // three-entry cluster shape. mm_dispatch compares its schedule string against
 // these alias names, so keep the two in sync.
 using Kernel_128x16_1x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
 using Kernel_128x16_2x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;

 using Kernel_256x16_1x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
 using Kernel_256x16_2x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;

 using Kernel_256x32_1x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_1, _1, _1>, Coop, CoopEpi>;
 using Kernel_256x32_2x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_2, _1, _1>, Coop, CoopEpi>;

 using Kernel_256x64_1x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_1, _1, _1>, Coop, CoopEpi>;
 using Kernel_256x64_2x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_2, _1, _1>, Coop, CoopEpi>;

 using Kernel_256x128_1x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_1, _1, _1>, Coop, CoopEpi>;
 using Kernel_256x128_2x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_2, _1, _1>, Coop, CoopEpi>;

 // Only the 2x1x1 cluster variant is instantiated for the 128x256 tile.
 using Kernel_128x256_2x1x1_Coop =
    W4A8GroupedGemmKernel<Shape<_128, _256>, Shape<_2, _1, _1>, Coop, CoopEpi>;

 // Looks up the kernel instantiation whose alias name equals `schedule` and
 // forwards all tensor arguments to its grouped_mm. Raises through TORCH_CHECK
 // when the string does not name any instantiated kernel.
 void mm_dispatch(
    torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
    const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
    const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
    const int64_t b_group_size, const torch::Tensor& expert_offsets,
    const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
    const torch::Tensor& b_strides, const torch::Tensor& c_strides,
    const torch::Tensor& group_scale_strides, const std::string& schedule) {
  // Every instantiation exposes grouped_mm with the same signature, so one
  // function-pointer type describes all of them.
  using GroupedMmFn = decltype(&Kernel_128x16_1x1x1_Coop::grouped_mm);
  struct ScheduleEntry {
    const char* name;
    GroupedMmFn launch;
  };

  // Schedule name -> launcher, in the same order the names were tested before.
  static const ScheduleEntry kSchedules[] = {
      {"Kernel_128x16_1x1x1_Coop", &Kernel_128x16_1x1x1_Coop::grouped_mm},
      {"Kernel_128x16_2x1x1_Coop", &Kernel_128x16_2x1x1_Coop::grouped_mm},
      {"Kernel_256x16_1x1x1_Coop", &Kernel_256x16_1x1x1_Coop::grouped_mm},
      {"Kernel_256x16_2x1x1_Coop", &Kernel_256x16_2x1x1_Coop::grouped_mm},
      {"Kernel_256x32_1x1x1_Coop", &Kernel_256x32_1x1x1_Coop::grouped_mm},
      {"Kernel_256x32_2x1x1_Coop", &Kernel_256x32_2x1x1_Coop::grouped_mm},
      {"Kernel_256x64_1x1x1_Coop", &Kernel_256x64_1x1x1_Coop::grouped_mm},
      {"Kernel_256x64_2x1x1_Coop", &Kernel_256x64_2x1x1_Coop::grouped_mm},
      {"Kernel_256x128_1x1x1_Coop", &Kernel_256x128_1x1x1_Coop::grouped_mm},
      {"Kernel_256x128_2x1x1_Coop", &Kernel_256x128_2x1x1_Coop::grouped_mm},
      {"Kernel_128x256_2x1x1_Coop", &Kernel_128x256_2x1x1_Coop::grouped_mm},
  };

  for (const ScheduleEntry& entry : kSchedules) {
    if (schedule == entry.name) {
      entry.launch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                   b_group_scales, b_group_size, expert_offsets, problem_sizes,
                   a_strides, b_strides, c_strides, group_scale_strides);
      return;
    }
  }

  TORCH_CHECK(false,
              "cutlass_w4a8_moe_mm: unknown schedule string: ", schedule);
 }

 // Entry point of the grouped W4A8 GEMM.
 //
 // When maybe_schedule holds a value, that kernel is used as-is. Otherwise a
 // kernel is picked from the average number of rows per expert
 // (a_tensors.size(0) / b_tensors.size(0)), i.e. assuming rows are spread
 // uniformly over the experts.
 //
 // Raises through TORCH_CHECK when the heuristic path is taken with zero
 // experts, or (from mm_dispatch) when the schedule name is unknown.
 void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
        const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
        const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
        const int64_t b_group_size, const torch::Tensor& expert_offsets,
        const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
        const torch::Tensor& b_strides, const torch::Tensor& c_strides,
        const torch::Tensor& group_scale_strides,
        std::optional<std::string> maybe_schedule) {
  // user has specified a schedule
  if (maybe_schedule) {
    mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                b_group_scales, b_group_size, expert_offsets, problem_sizes,
                a_strides, b_strides, c_strides, group_scale_strides,
                *maybe_schedule);
    return;
  }

  // use heuristic
  // Keep the sizes as int64_t: torch reports them as int64_t and narrowing is
  // not needed for the comparisons below.
  const int64_t m_full = a_tensors.size(0);
  const int64_t num_experts = b_tensors.size(0);
  // Guard the division below; with zero experts it would be an integer
  // division by zero.
  TORCH_CHECK(num_experts > 0,
              "cutlass_w4a8_moe_mm: b_tensors must contain at least one "
              "expert to select a schedule");
  // per-expert batch size assuming uniform distribution
  const int64_t m_expert = m_full / num_experts;

  std::string schedule;
  if (m_expert <= 16) {
    schedule = "Kernel_128x16_2x1x1_Coop";
  } else if (m_expert <= 32) {
    schedule = "Kernel_256x32_1x1x1_Coop";
  } else if (m_expert <= 64) {
    schedule = "Kernel_256x64_1x1x1_Coop";
  } else if (m_expert <= 128) {
    schedule = "Kernel_256x128_2x1x1_Coop";
  } else {  // m_expert > 128
    schedule = "Kernel_128x256_2x1x1_Coop";
  }

  mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
              b_group_scales, b_group_size, expert_offsets, problem_sizes,
              a_strides, b_strides, c_strides, group_scale_strides, schedule);
 }

 // Encodes and reorders the packed int4 weights of every expert.
 //
 // b_tensors: contiguous CUDA int32 tensor of shape (experts, n, k / 8), each
 //            int32 holding eight int4 values. The logical k must be
 //            divisible by 256 and n by 16.
 //
 // Returns a pair:
 //   - a tensor shaped like b_tensors holding the encoded, reordered weights;
 //   - an int32 tensor of shape (experts, sizeof(LayoutB_Reordered) / 4) on
 //     b_tensors' device, each row being a byte copy of the reordered layout
 //     object (the same layout for every expert).
 //
 // Raises through TORCH_CHECK on invalid input or when the encode step fails.
 std::tuple<torch::Tensor, torch::Tensor> encode_and_reorder_int4b(
    torch::Tensor const& b_tensors) {
  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);
  TORCH_CHECK(b_tensors.dim() == 3);  // (experts, n, k)
  TORCH_CHECK(b_tensors.is_contiguous());
  TORCH_CHECK(b_tensors.is_cuda());

  // The encode and reorder steps below launch work without being told which
  // device to use, so make b_tensors' device current for the duration of
  // this call.
  const at::cuda::OptionalCUDAGuard device_guard(b_tensors.device());

  int n = static_cast<int>(b_tensors.size(1));
  int k = static_cast<int>(b_tensors.size(2)) * PackFactor;  // logical k

  // CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0.
  // These misalignments cause silent OOB unless run under Compute Sanitizer.
  TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256");
  TORCH_CHECK(n % 16 == 0, "n must be divisible by 16");

  // we will store the layout to an int32 tensor;
  // this is the number of elements we need per layout
  constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t);

  torch::Tensor b_tensors_packed = torch::empty_like(b_tensors);
  int num_experts = static_cast<int>(b_tensors.size(0));

  auto b_ptr = static_cast<QuantType const*>(b_tensors.const_data_ptr());
  auto b_packed_ptr = static_cast<QuantType*>(b_tensors_packed.data_ptr());

  // multiply by ull so result does not overflow int32
  size_t num_int4_elems = 1ull * num_experts * n * k;
  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr,
                                                           num_int4_elems);
  TORCH_CHECK(ok, "unified_encode_int4b failed");

  // construct the layout once; assumes each expert has the same layout
  using LayoutType = LayoutB_Reordered;
  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}});
  auto shape_B = cute::make_shape(n, k, Int<1>{});
  auto layout_B = make_layout(shape_B, stride_B);
  LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B);

  // reorder weights for each expert
  for (int i = 0; i < num_experts; i++) {
    // since the storage type of int4b is 1 byte but one element is 4 bits
    // we need to adjust the offset
    int64_t offset =
        1ull * i * n * k * cutlass::sizeof_bits<QuantType>::value / 8;
    cutlass::reorder_tensor(b_packed_ptr + offset, layout_B,
                            layout_B_reordered);
  }

  // save the packed layout to torch tensor so we can re-use it
  auto cpu_opts =
      torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
  torch::Tensor layout_cpu =
      torch::empty({num_experts, layout_width}, cpu_opts);

  // Every row receives a byte copy of the same layout object.
  int32_t* layout_data = layout_cpu.data_ptr<int32_t>();
  for (int i = 0; i < num_experts; ++i) {
    std::memcpy(layout_data + i * layout_width,  // dst (int32*)
                &layout_B_reordered,             // src (LayoutType*)
                sizeof(LayoutType));             // number of bytes
  }

  torch::Tensor packed_layout =
      layout_cpu.to(b_tensors.device(), /*non_blocking=*/false);

  return {b_tensors_packed, packed_layout};
 }

 // Register the CUDA implementations of the grouped W4A8 ops:
 //   cutlass_w4a8_moe_mm                      -> mm
 //   cutlass_encode_and_reorder_int4b_grouped -> encode_and_reorder_int4b
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("cutlass_w4a8_moe_mm", &mm);
  m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b);
 }

 }  // namespace vllm::cutlass_w4a8_moe
 /////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@@ -7,6 +7,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 #include "cutlass_extensions/torch_utils.hpp"
 #include "w4a8_utils.cuh"

 #include "core/registration.h"

@@ -395,71 +396,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
  return packed_scales;
 }

 /*
  GPU-accelerated implementation of cutlass::unified_encode_int4b.
  Constructs a lookup table in constant memory to map 8 bits
  (two 4-bit values) at a time. Assumes memory is contiguous
  and pointers are 16-byte aligned. The bulk of the buffer is processed in
  16-byte chunks; any remaining (nbytes % 16) bytes are processed one byte
  at a time.
 */
 __constant__ uint8_t kNibbleLUT[256];

 // in / out: input and output byte buffers, each of at least nbytes bytes.
 // nbytes:   number of bytes (two int4 values per byte) to encode.
 __global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
                                             size_t nbytes) {
  constexpr size_t V = sizeof(uint4);  // 16 bytes
  // Widen before multiplying so the global thread index is computed in
  // size_t rather than in 32-bit unsigned arithmetic.
  const size_t tid = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
  const size_t nvec = nbytes / V;

  // 1-D grid-stride loop over 16-byte chunks
  for (size_t vec = tid; vec < nvec; vec += nthreads) {
    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
 #pragma unroll
    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
    reinterpret_cast<uint4*>(out)[vec] = v;
  }

  // Tail: the bytes after the last full 16-byte chunk. Without this loop
  // they would never be written to `out`.
  for (size_t i = nvec * V + tid; i < nbytes; i += nthreads) {
    out[i] = kNibbleLUT[in[i]];
  }
 }

 static bool upload_lut() {
  std::array<uint8_t, 256> lut{};
  auto map_nib = [](uint8_t v) -> uint8_t {
    // 1..7 -> (8 - v); keep 0 and 8..15
    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
  };
  for (int b = 0; b < 256; ++b) {
    uint8_t lo = b & 0xF;
    uint8_t hi = (b >> 4) & 0xF;
    lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
  }
  cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
                                     /*offset=*/0, cudaMemcpyHostToDevice);

  return (e == cudaSuccess);
 }

 // Uploads the lookup table and launches unified_encode_int4b_device over
 // num_int4_elems int4 values (two per byte). Returns false when the table
 // upload fails or the launch reports an error.
 static bool unified_encode_int4b(cutlass::int4b_t const* in,
                                  cutlass::int4b_t* out, size_t num_int4_elems) {
  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
                "int4 storage must be 1 byte");

  // Build/upload LUT
  if (!upload_lut()) {
    return false;
  }

  // Two int4 values share one byte.
  const size_t byte_count = num_int4_elems >> 1;
  auto* src = reinterpret_cast<uint8_t const*>(in);
  auto* dst = reinterpret_cast<uint8_t*>(out);

  // kernel launch params: one thread per 16-byte vector, at least one block
  constexpr int threads_per_block = 256;
  const size_t vec_count = byte_count / sizeof(uint4);  // # of 16B vectors
  const int blocks_needed =
      int((vec_count + threads_per_block - 1) / threads_per_block);
  const int num_blocks = (blocks_needed == 0) ? 1 : blocks_needed;

  unified_encode_int4b_device<<<num_blocks, threads_per_block>>>(src, dst,
                                                                 byte_count);
  return cudaGetLastError() == cudaSuccess;
 }

 torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
  TORCH_CHECK(B.dtype() == torch::kInt32);
  TORCH_CHECK(B.dim() == 2);
@@ -477,8 +413,8 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
  LayoutB_Reordered layout_B_reordered =
      cute::tile_to_shape(LayoutAtomQuant{}, shape_B);

  bool ok =
      vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(B_ptr, B_packed_ptr,
                                                           n * k);
  TORCH_CHECK(ok, "unified_encode_int4b failed");
  cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);

--- a/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
@@ -0,0 +1,90 @@
 #include "w4a8_utils.cuh"

 #include <array>
 #include <cuda_runtime.h>
 #include <cstdio>

 namespace vllm::cutlass_w4a8_utils {

 /*
  GPU-accelerated implementation of cutlass::unified_encode_int4b.
  Constructs a lookup table in constant memory to map 8 bits
  (two 4-bit values) at a time. Assumes memory is contiguous
  and pointers are 16-byte aligned. The bulk of the buffer is processed in
  16-byte chunks; any remaining (nbytes % 16) bytes are processed one byte
  at a time.
 */
 __constant__ uint8_t kNibbleLUT[256];

 // in / out: input and output byte buffers, each of at least nbytes bytes.
 // nbytes:   number of bytes (two int4 values per byte) to encode.
 __global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
                                             size_t nbytes) {
  constexpr size_t V = sizeof(uint4);  // 16 bytes
  // Widen before multiplying so the global thread index is computed in
  // size_t rather than in 32-bit unsigned arithmetic.
  const size_t tid = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
  const size_t nvec = nbytes / V;

  // 1-D grid-stride loop over 16-byte chunks
  for (size_t vec = tid; vec < nvec; vec += nthreads) {
    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
 #pragma unroll
    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
    reinterpret_cast<uint4*>(out)[vec] = v;
  }

  // Tail: the bytes after the last full 16-byte chunk. Without this loop
  // they would never be written to `out`.
  for (size_t i = nvec * V + tid; i < nbytes; i += nthreads) {
    out[i] = kNibbleLUT[in[i]];
  }
 }

 static bool upload_lut() {
  std::array<uint8_t, 256> lut{};
  auto map_nib = [](uint8_t v) -> uint8_t {
    // 1..7 -> (8 - v); keep 0 and 8..15
    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
  };
  for (int b = 0; b < 256; ++b) {
    uint8_t lo = b & 0xF;
    uint8_t hi = (b >> 4) & 0xF;
    lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
  }
  cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
                                     /*offset=*/0, cudaMemcpyHostToDevice);

  return (e == cudaSuccess);
 }

 // Uploads the lookup table, launches unified_encode_int4b_device over
 // num_int4_elems int4 values (two per byte) and waits for the device to
 // finish. Returns false, after printing a diagnostic for launch and runtime
 // failures, when any step reports an error; true otherwise.
 bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
                           size_t num_int4_elems) {
  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
                "int4 storage must be 1 byte");

  // Build/upload LUT
  if (!upload_lut()) {
    return false;
  }

  // Two int4 values share one byte.
  const size_t byte_count = num_int4_elems >> 1;
  auto* src = reinterpret_cast<uint8_t const*>(in);
  auto* dst = reinterpret_cast<uint8_t*>(out);

  // kernel launch params: one thread per 16-byte vector, at least one block
  constexpr int threads_per_block = 256;
  const size_t vec_count = byte_count / sizeof(uint4);  // # of 16B vectors
  const int blocks_needed =
      int((vec_count + threads_per_block - 1) / threads_per_block);
  const int num_blocks = (blocks_needed == 0) ? 1 : blocks_needed;

  unified_encode_int4b_device<<<num_blocks, threads_per_block>>>(src, dst,
                                                                 byte_count);

  // launch errors
  const cudaError_t launch_status = cudaGetLastError();
  if (launch_status != cudaSuccess) {
    printf("unified_encode_int4b_device launch error: %s (%d)\n",
           cudaGetErrorString(launch_status), launch_status);
    return false;
  }

  // runtime errors
  const cudaError_t sync_status = cudaDeviceSynchronize();
  if (sync_status != cudaSuccess) {
    printf("unified_encode_int4b_device runtime error: %s (%d)\n",
           cudaGetErrorString(sync_status), sync_status);
    return false;
  }

  return true;
 }

 }  // namespace vllm::cutlass_w4a8_utils
--- a/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
+++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
@@ -0,0 +1,11 @@
 #pragma once

 #include <cstddef>
 #include "cutlass/numeric_types.h"

 namespace vllm::cutlass_w4a8_utils {

 bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
                          size_t num_int4_elems);

 }  // namespace vllm::cutlass_w4a8_utils
--- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
+++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
@@ -31,14 +31,15 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(

  // RMS Norm + Quant
  if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
    token_scale = 1.0f / token_scale;
    vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, true,
                                     has_residual>(
        out, input, weight, rms, 1.0f / token_scale, hidden_size, residual);
        out, input, weight, rms, &token_scale, hidden_size, residual);
  } else {
    // FP8 - Do not invert token_scale for exact match with FBGemm
    vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, false,
                                     has_residual>(
        out, input, weight, rms, token_scale, hidden_size, residual);
        out, input, weight, rms, &token_scale, hidden_size, residual);
  }
 }

@@ -75,14 +76,52 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(

  // RMS Norm + Quant
  if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
    token_scale = 1.0f / token_scale;
    vllm::norm_and_quant<scalar_t, scalar_out_t, true, has_residual>(
        out, input, weight, rms, 1.0f / token_scale, hidden_size, residual);
        out, input, weight, rms, &token_scale, hidden_size, residual);
  } else {
    // FP8 - Do not invert s_token_scale for exact match with FBGemm
    vllm::norm_and_quant<scalar_t, scalar_out_t, false, has_residual>(
        out, input, weight, rms, token_scale, hidden_size, residual);
        out, input, weight, rms, &token_scale, hidden_size, residual);
  }
 }

 // RMS norm + per-block (group) quantization kernel.
 //
 // Runs three steps in order: compute the RMS of the input, compute the
 // quantization scales into `scales`, then normalize and quantize into `out`.
 //
 // Template parameters:
 //   scalar_t            - input / weight / residual element type.
 //   scalar_out_t        - quantized output element type; int8_t selects the
 //                         inverted-scale path in norm_and_quant.
 //   has_residual        - forwarded to the helpers together with `residual`.
 //   is_scale_transposed - selects which of the two `scales` layouts noted
 //                         below is used.
 //   group_size          - quantization group size along hidden_size.
 template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
          bool is_scale_transposed = false, int32_t group_size = 0>
 __global__ void rms_norm_per_block_quant_kernel(
    scalar_out_t* __restrict__ out,  // [..., hidden_size]
    float* __restrict__ scales,      // [num_tokens, hidden_size / group_size]
                                     // or
                                     // [hidden_size / group_size, num_tokens]
    scalar_t const* __restrict__ input,   // [..., hidden_size]
    scalar_t const* __restrict__ weight,  // [hidden_size]
    float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
    scalar_t* __restrict__ residual = nullptr) {
  float rms;
  // Compute RMS
  // Always able to vectorize due to constraints on hidden_size
  vllm::vectorized::compute_rms<scalar_t, has_residual>(
      &rms, input, hidden_size, var_epsilon, residual);

  // Compute Scale
  // Always able to vectorize due to constraints on hidden_size and group_size
  // NOTE(review): the first argument is passed as nullptr here; presumably it
  // is the single per-token scale output that the per-block path does not
  // use -- confirm against compute_dynamic_per_token_scales.
  vllm::vectorized::compute_dynamic_per_token_scales<
      scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>(
      nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual);

  // RMS Norm + Quant
  // Always able to vectorize due to constraints on hidden_size
  // For int8, don't invert token_scale here: do it inside the norm_and_quant
  // kernel. We do it because particular elements of token_scale can be shared
  // between multiple threads, so this way, we avoid extra synchronization
  // overhead.
  vllm::vectorized::norm_and_quant<
      scalar_t, scalar_out_t, std::is_same_v<scalar_out_t, int8_t>,
      has_residual, is_scale_transposed, group_size>(
      out, input, weight, rms, scales, hidden_size, residual);
 }

 }  // namespace vllm

 // Residual add + RMS norm + dynamic per token
@@ -103,30 +142,19 @@ void rms_norm_dynamic_per_token_quant_dispatch(
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (residual.has_value()) {
  VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
    VLLM_DISPATCH_QUANT_TYPES(
        out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
          vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
                                                        true>
                                                        has_residual>
              <<<grid, block, 0, stream>>>(
                  out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
                  input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
                  scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
                  var_epsilon, hidden_size, residual->data_ptr<scalar_in_t>());
                  var_epsilon, hidden_size,
                  has_residual ? residual->data_ptr<scalar_in_t>() : nullptr);
        });

  } else {
    VLLM_DISPATCH_QUANT_TYPES(
        out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
          vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
                                                        false>
              <<<grid, block, 0, stream>>>(
                  out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
                  input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
                  scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
                  var_epsilon, hidden_size, nullptr);
        });
  }
  });
 }

 void rms_norm_dynamic_per_token_quant(
@@ -157,3 +185,79 @@ void rms_norm_dynamic_per_token_quant(
            out, input, weight, scales, var_epsilon, scale_ub, residual);
      });
 }

 // Optional residual add + RMS norm + dynamic per-block (group-wise) quant.
 //
 // Picks the launch configuration (one CUDA block per token) and dispatches to
 // the rms_norm_per_block_quant_kernel instantiation matching the input dtype,
 // group size, presence of a residual, scale layout and output quant type.
 // The launch is issued on the current CUDA stream of `input`'s device.
 void rms_norm_per_block_quant_dispatch(
    torch::Tensor& out,           // [..., hidden_size]
    torch::Tensor const& input,   // [..., hidden_size]
    torch::Tensor const& weight,  // [hidden_size]
    torch::Tensor& scales,        // [num_tokens, hidden_size / group_size] or
                                  // [hidden_size / group_size, num_tokens]
    int32_t group_size,
    double const var_epsilon,  // Variance epsilon used in norm calculation
    std::optional<at::Tensor> const& scale_ub,
    std::optional<at::Tensor>& residual, bool is_scale_transposed) {
  int32_t hidden_size = input.size(-1);
  auto num_tokens = input.numel() / hidden_size;

  dim3 grid(num_tokens);
  // Fewer tokens -> fewer blocks in flight, so allow wider blocks.
  const int max_block_size = (num_tokens <= 256) ? 512 : 256;
  dim3 block(std::min(hidden_size, max_block_size));

  // The scale computation on the device assigns blockDim.x / num_groups
  // threads to every group and then takes threadIdx.x modulo that count.
  // With zero groups, or more groups than launched threads, that is an
  // integer division by zero on the GPU, so reject such shapes here.
  TORCH_CHECK(group_size > 0, "group_size must be positive, got ",
              group_size);
  int32_t const num_groups = hidden_size / group_size;
  TORCH_CHECK(
      num_groups >= 1 && num_groups <= static_cast<int32_t>(block.x),
      "rms_norm_per_block_quant: hidden_size=", hidden_size,
      " with group_size=", group_size, " yields ", num_groups,
      " groups, which must be in [1, ", block.x,
      "] (threads launched per block)");

  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "rms_norm_per_block_quant_fp_dispatch", [&] {
        // The inner quant-type dispatch rebinds scalar_t to the output type,
        // so keep the input element type under a separate name.
        using scalar_in_t = scalar_t;
        VLLM_DISPATCH_GROUP_SIZE(group_size, gs, [&] {
          VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
            VLLM_DISPATCH_BOOL(is_scale_transposed, transpose_scale, [&] {
              VLLM_DISPATCH_QUANT_TYPES(
                  out.scalar_type(), "rms_norm_per_block_quant_kernel", [&] {
                    vllm::rms_norm_per_block_quant_kernel<scalar_in_t, scalar_t,
                                                          has_residual,
                                                          transpose_scale, gs>
                        <<<grid, block, 0, stream>>>(
                            out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
                            input.data_ptr<scalar_in_t>(),
                            weight.data_ptr<scalar_in_t>(),
                            scale_ub.has_value() ? scale_ub->data_ptr<float>()
                                                 : nullptr,
                            var_epsilon, hidden_size,
                            has_residual ? residual->data_ptr<scalar_in_t>()
                                         : nullptr);
                  });
            });
          });
        });
      });
 }

 // Host entry point for fused (optional residual add +) RMS norm + dynamic
 // per-block quantization. Validates the arguments and forwards them to
 // rms_norm_per_block_quant_dispatch.
 //
 //   out                  quantized output; must be contiguous and either the
 //                        platform FP8 e4m3 type or int8
 //   input                activations; must be contiguous
 //   weight               norm weight; same dtype as input
 //   scales               float32 buffer receiving the per-group scales
 //   var_epsilon          variance epsilon, forwarded to the kernel
 //   scale_ub             optional scale upper bound; FP8 output only
 //   residual             optional; same dtype as input
 //   group_size           64 or 128; must divide input.size(-1)
 //   is_scale_transposed  selects the layout used when storing `scales`
 void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
                              torch::Tensor const& weight,
                              torch::Tensor& scales, double const var_epsilon,
                              std::optional<torch::Tensor> scale_ub,
                              std::optional<torch::Tensor> residual,
                              int64_t group_size, bool is_scale_transposed) {
  // FP8 flavour depends on the platform; resolved once on first call.
  static c10::ScalarType kFp8Type = is_fp8_ocp()
                                        ? c10::ScalarType::Float8_e4m3fn
                                        : c10::ScalarType::Float8_e4m3fnuz;
  TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
  TORCH_CHECK(out.is_contiguous() && input.is_contiguous());

  if (scale_ub.has_value()) {
    TORCH_CHECK(out.dtype() == kFp8Type);
  }
  TORCH_CHECK(weight.dtype() == input.dtype());
  TORCH_CHECK(scales.dtype() == torch::kFloat32);
  if (residual) {
    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
  }

  TORCH_CHECK(group_size == 128 || group_size == 64,
              "Unsupported group size: ", group_size);
  // Scales are looked up by element_index / group_size on the device, with
  // hidden_size / group_size slots per token. A trailing partial group would
  // therefore have no slot of its own, so require an exact multiple.
  TORCH_CHECK(input.size(-1) % group_size == 0, "hidden_size (",
              input.size(-1), ") must be a multiple of group_size (",
              group_size, ")");

  rms_norm_per_block_quant_dispatch(out, input, weight, scales, group_size,
                                    var_epsilon, scale_ub, residual,
                                    is_scale_transposed);
 }
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@@ -9,6 +9,7 @@
 #include "quant_conversions.cuh"

 #include "../../cub_helpers.h"
 #include "../../cuda_compat.h"

 namespace vllm {

@@ -43,62 +44,150 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
  *rms = s_rms;
 }

 template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
 // In-place max reduction over a run of floats in `val`, performed by the
 // threads of one warp.
 //
 //   val            buffer being reduced; each thread reads and writes its own
 //                  slot val[tid] and reads val[tid + offset]
 //   tid            index of the calling thread's slot in `val`
 //   thread_in_warp position of the calling thread inside its warp
 //   reduced_elems  number of slots taking part; a step with a given offset is
 //                  skipped by threads for which thread_in_warp + offset would
 //                  fall outside this count
 //
 // Returns the calling thread's val[tid] after all steps. For the thread with
 // thread_in_warp == 0 this is the maximum over the participating slots,
 // provided every step observes the writes of the previous one.
 //
 // NOTE(review): there is no explicit warp-level barrier between the steps;
 // the code relies on `volatile` accesses and on the lanes executing the steps
 // in lockstep. Confirm this holds on targets with independent thread
 // scheduling.
 __device__ float warpReduceMaxSpecialized(volatile float* val, int64_t tid,
                                          int64_t thread_in_warp,
                                          int64_t reduced_elems) {
  static_assert(WARP_SIZE == 32 || WARP_SIZE == 64);
  // Extra leading step, compiled in only for 64-wide warps.
  if constexpr (WARP_SIZE == 64) {
    if (thread_in_warp + 64 < reduced_elems)
      val[tid] = fmaxf(val[tid], val[tid + 64]);
  }
  // Halving offsets: each step folds the upper part of the remaining range
  // into the lower part.
  if (thread_in_warp + 32 < reduced_elems)
    val[tid] = fmaxf(val[tid], val[tid + 32]);
  if (thread_in_warp + 16 < reduced_elems)
    val[tid] = fmaxf(val[tid], val[tid + 16]);
  if (thread_in_warp + 8 < reduced_elems)
    val[tid] = fmaxf(val[tid], val[tid + 8]);
  if (thread_in_warp + 4 < reduced_elems)
    val[tid] = fmaxf(val[tid], val[tid + 4]);
  if (thread_in_warp + 2 < reduced_elems)
    val[tid] = fmaxf(val[tid], val[tid + 2]);
  if (thread_in_warp + 1 < reduced_elems)
    val[tid] = fmaxf(val[tid], val[tid + 1]);
  return val[tid];
 }

 template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
          bool is_scale_transposed = false>
 __device__ void compute_dynamic_per_token_scales(
    float* __restrict__ token_scale, float* __restrict__ all_token_scales,
    scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
    float const rms, float const* __restrict__ scale_ub,
    int32_t const hidden_size,
    scalar_t const* __restrict__ residual = nullptr) {
  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
  ;
  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};

    int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr,
    int32_t const group_size = 0) {
  float block_absmax_val_maybe = 0.0f;
  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    float x = static_cast<float>(input[token_offset + i]);
    if constexpr (has_residual) {
      x += static_cast<float>(residual[token_offset + i]);
  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
  __syncthreads();
  if (group_size > 0) {
    __shared__ float s_max_vals[1024];
    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
    int64_t num_groups = hidden_size / group_size;
    int64_t const threads_per_group = blockDim.x / num_groups;
    int64_t const thread_in_group = threadIdx.x % threads_per_group;
    int64_t const group_offset = threadIdx.x / threads_per_group * group_size;
    int64_t const thread_offset = group_offset + thread_in_group;
    int64_t const thread_end =
        min(group_offset + group_size, static_cast<int64_t>(hidden_size));
    for (auto i = thread_offset; i < thread_end; i += threads_per_group) {
      float x = static_cast<float>(input[token_offset + i]);
      if constexpr (has_residual) {
        x += static_cast<float>(residual[token_offset + i]);
      }
      x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
      block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
    }
    s_max_vals[threadIdx.x] = block_absmax_val_maybe;
    __syncthreads();

    int64_t const warp_size = WARP_SIZE;
    int64_t const num_warps = blockDim.x / warp_size;
    int64_t const warp_id = threadIdx.x / warp_size;
    int64_t const thread_in_warp = threadIdx.x % warp_size;
    int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
    for (auto i = 0; i < groups_per_warp; ++i) {
      int64_t const group_id = i * num_warps + warp_id;
      if (group_id < num_groups) {
        int64_t warp_start = group_id * threads_per_group;
        int64_t const start = warp_start + thread_in_warp;
        int64_t const warp_end = min(warp_start + threads_per_group,
                                     static_cast<int64_t>(hidden_size));
        for (auto j = start; j + warp_size < warp_end; j += warp_size) {
          s_max_vals[start] =
              fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
        }
        warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
                                 min(warp_end - warp_start, warp_size));
      }
    }
    __syncthreads();

    if (thread_in_group == 0 && thread_offset < thread_end) {
      block_absmax_val_maybe = s_max_vals[threadIdx.x];
      float scale = 0.0f;
      if (scale_ub) {
        scale = min(block_absmax_val_maybe, *scale_ub);
      } else {
        scale = block_absmax_val_maybe;
      }
      // token scale computation
      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
      // Global output store
      if constexpr (is_scale_transposed) {
        all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
                         blockIdx.x] = scale;
      } else {
        all_token_scales[blockIdx.x * num_groups +
                         threadIdx.x / threads_per_group] = scale;
      }
    }
    __syncthreads();
  } else {
    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);

    for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
      float x = static_cast<float>(input[token_offset + i]);
      if constexpr (has_residual) {
        x += static_cast<float>(residual[token_offset + i]);
      }

    x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
    block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
  }

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
  block_absmax_val_maybe =
      BlockReduce(reduceStore)
          .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);

  __shared__ float s_token_scale;
  if (threadIdx.x == 0) {
    float scale = 0.0f;
    if (scale_ub) {
      scale = min(block_absmax_val_maybe, *scale_ub);
    } else {
      scale = block_absmax_val_maybe;
      x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
      block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
    }
    // token scale computation
    scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
    s_token_scale = scale;                 // Shared memory store
    all_token_scales[blockIdx.x] = scale;  // Global output store
  }
  __syncthreads();
    using BlockReduce = cub::BlockReduce<float, 1024>;
    __shared__ typename BlockReduce::TempStorage reduceStore;
    block_absmax_val_maybe =
        BlockReduce(reduceStore)
            .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);

    __shared__ float s_token_scale;
    if (threadIdx.x == 0) {
      float scale = 0.0f;
      if (scale_ub) {
        scale = min(block_absmax_val_maybe, *scale_ub);
      } else {
        scale = block_absmax_val_maybe;
      }
      // token scale computation
      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
      s_token_scale = scale;                 // Shared memory store
      all_token_scales[blockIdx.x] = scale;  // Global output store
    }
    __syncthreads();

  *token_scale = s_token_scale;
    *token_scale = s_token_scale;
  }
 }

 template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
          bool has_residual = false>
          bool has_residual = false, bool is_scale_transposed = false>
 __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
                               scalar_t const* __restrict__ input,
                               scalar_t const* __restrict__ weight,
                               float const rms, float const scale,
                               float const rms, float* const scale,
                               int32_t const hidden_size,
                               scalar_t* __restrict__ residual = nullptr) {
                               scalar_t* __restrict__ residual = nullptr,
                               int32_t const group_size = 0) {
  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
  ;

  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    float x = static_cast<float>(input[token_offset + i]);
@@ -109,8 +198,21 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
    // Norm
    x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
    // Quant
    // In the group-wise case the scale is looked up per group; when
    // is_scale_inverted is true, its reciprocal is taken here.
    int64_t scale_idx = 0;
    if (group_size > 0) {
      if constexpr (is_scale_transposed) {
        scale_idx = (i / group_size) * gridDim.x + blockIdx.x;
      } else {
        scale_idx = blockIdx.x * (hidden_size / group_size) + i / group_size;
      }
    }
    auto scale_val =
        (group_size > 0
             ? (is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx])
             : *scale);
    output[token_offset + i] =
        ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(x, scale);
        ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(x, scale_val);
  }
 }

@@ -178,95 +280,191 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,

 // Vectorized version of vllm::compute_dynamic_per_token_scales
 // hidden_size must be a multiple of 4
 template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
 template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
          bool is_scale_transposed = false, int32_t group_size = 0>
 __device__ void compute_dynamic_per_token_scales(
    float* __restrict__ token_scale, float* __restrict__ all_token_scales,
    scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
    float const rms, float const* __restrict__ scale_ub,
    int32_t const hidden_size,
    scalar_t const* __restrict__ residual = nullptr) {
  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
  ;

  // Vectorized input/weight/residual to better utilize memory bandwidth.
  vec4_t<scalar_t> const* vec_input =
      reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
  vec4_t<scalar_t> const* vec_weight =
      reinterpret_cast<vec4_t<scalar_t> const*>(weight);
  vec4_t<scalar_t> const* vec_residual = nullptr;
  if constexpr (has_residual) {
    vec_residual =
        reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
  }

  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};

  const int VEC_SIZE = 4;
  int32_t const num_vec_elems = hidden_size >> 2;
  float block_absmax_val_maybe = 0.0f;

  // Vectorized input/weight/residual to better utilize memory bandwidth.
  vec4_t<scalar_t> const* vec_input = nullptr;
  vec4_t<scalar_t> const* vec_weight = nullptr;
  vec4_t<scalar_t> const* vec_residual = nullptr;

  if constexpr (group_size > 0) {
    __shared__ float s_max_vals[1024];

    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
    int64_t const num_groups = hidden_size / group_size;
    int64_t const threads_per_group = blockDim.x / num_groups;
    int64_t const thread_in_group = threadIdx.x % threads_per_group;
    int64_t const group_offset =
        threadIdx.x / threads_per_group * (group_size >> 2);
    int64_t const thread_offset = group_offset + thread_in_group;
    int64_t const thread_end = min(group_offset + (group_size >> 2),
                                   static_cast<int64_t>(hidden_size >> 2));
    vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
    vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
    if constexpr (has_residual) {
      vec_residual =
          reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
    }
    int32_t const num_vec_elems = thread_end;

 #pragma unroll 4
  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
    vec4_t<scalar_t> in = vec_input[i];
    vec4_t<scalar_t> const w = vec_weight[i];
    for (auto i = thread_offset; i < num_vec_elems; i += threads_per_group) {
      vec4_t<scalar_t> in = vec_input[i];
      vec4_t<scalar_t> const w = vec_weight[i];

    vec4_t<float> x;
      vec4_t<float> x;
 #pragma unroll
    for (int j = 0; j < VEC_SIZE; ++j) {
      x.val[j] = static_cast<float>(in.val[j]);
      for (int j = 0; j < VEC_SIZE; ++j) {
        x.val[j] = static_cast<float>(in.val[j]);
      }

      if constexpr (has_residual) {
        vec4_t<scalar_t> r = vec_residual[i];
 #pragma unroll
        for (int j = 0; j < VEC_SIZE; ++j) {
          x.val[j] += static_cast<float>(r.val[j]);
        }
      }

 #pragma unroll
      for (int j = 0; j < VEC_SIZE; ++j) {
        block_absmax_val_maybe =
            fmaxf(block_absmax_val_maybe,
                  fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
      }
    }

    s_max_vals[threadIdx.x] = block_absmax_val_maybe;
    __syncthreads();

    int64_t const warp_size = WARP_SIZE;
    int64_t const num_warps = blockDim.x / warp_size;
    int64_t const warp_id = threadIdx.x / warp_size;
    int64_t const thread_in_warp = threadIdx.x % warp_size;
    int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
    for (auto i = 0; i < groups_per_warp; ++i) {
      int64_t const group_id = i * num_warps + warp_id;
      if (group_id < num_groups) {
        int64_t warp_start = group_id * threads_per_group;
        int64_t const start = warp_start + thread_in_warp;
        int64_t const warp_end = min(warp_start + threads_per_group,
                                     static_cast<int64_t>(hidden_size));
        for (auto j = start; j + warp_size < warp_end; j += warp_size) {
          s_max_vals[start] =
              fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
        }
        warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
                                 min(warp_end - warp_start, warp_size));
      }
    }
    __syncthreads();

    if (thread_in_group == 0 && thread_offset < thread_end) {
      block_absmax_val_maybe = s_max_vals[threadIdx.x];
      float scale = 0.0f;
      if (scale_ub) {
        scale = min(block_absmax_val_maybe, *scale_ub);
      } else {
        scale = block_absmax_val_maybe;
      }
      // token scale computation
      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
      // Global output store
      if constexpr (is_scale_transposed) {
        all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
                         blockIdx.x] = scale;
      } else {
        all_token_scales[blockIdx.x * num_groups +
                         threadIdx.x / threads_per_group] = scale;
      }
    }
    __syncthreads();

  } else {
    int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
    vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
    vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
    if constexpr (has_residual) {
      vec4_t<scalar_t> r = vec_residual[i];
      vec_residual =
          reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
    }

    int32_t const num_vec_elems = (hidden_size >> 2);

 #pragma unroll 4
    for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
      vec4_t<scalar_t> in = vec_input[i];
      vec4_t<scalar_t> const w = vec_weight[i];

      vec4_t<float> x;
 #pragma unroll
      for (int j = 0; j < VEC_SIZE; ++j) {
        x.val[j] += static_cast<float>(r.val[j]);
        x.val[j] = static_cast<float>(in.val[j]);
      }
    }

      if constexpr (has_residual) {
        vec4_t<scalar_t> r = vec_residual[i];
 #pragma unroll
    for (int j = 0; j < VEC_SIZE; ++j) {
      block_absmax_val_maybe =
          fmaxf(block_absmax_val_maybe,
                fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
    }
  }
        for (int j = 0; j < VEC_SIZE; ++j) {
          x.val[j] += static_cast<float>(r.val[j]);
        }
      }

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
  block_absmax_val_maybe =
      BlockReduce(reduceStore)
          .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
 #pragma unroll
      for (int j = 0; j < VEC_SIZE; ++j) {
        block_absmax_val_maybe =
            fmaxf(block_absmax_val_maybe,
                  fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
      }
    }

  __shared__ float s_token_scale;
  if (threadIdx.x == 0) {
    float scale = 0.0f;
    if (scale_ub) {
      scale = min(block_absmax_val_maybe, *scale_ub);
    } else {
      scale = block_absmax_val_maybe;
    using BlockReduce = cub::BlockReduce<float, 1024>;
    __shared__ typename BlockReduce::TempStorage reduceStore;
    block_absmax_val_maybe =
        BlockReduce(reduceStore)
            .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);

    __shared__ float s_token_scale;
    if (threadIdx.x == 0) {
      float scale = 0.0f;
      if (scale_ub) {
        scale = min(block_absmax_val_maybe, *scale_ub);
      } else {
        scale = block_absmax_val_maybe;
      }
      // token scale computation
      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
      s_token_scale = scale;                 // shared memory store
      all_token_scales[blockIdx.x] = scale;  // global output store
    }
    // token scale computation
    scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
    s_token_scale = scale;                 // shared memory store
    all_token_scales[blockIdx.x] = scale;  // global output store
  }
  __syncthreads();
    __syncthreads();

  *token_scale = s_token_scale;
    *token_scale = s_token_scale;
  }
 }

 // hidden_size must be a multiple of 4
 template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
          bool has_residual = false>
          bool has_residual = false, bool is_scale_transposed = false,
          int32_t group_size = 0>
 __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
                               scalar_t const* __restrict__ input,
                               scalar_t const* __restrict__ weight,
                               float const rms, float const scale,
                               float const rms, float* const scale,
                               int32_t const hidden_size,
                               scalar_t* __restrict__ residual = nullptr) {
  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
  ;

  // Vectorized input/output/weight/residual to better utilize memory bandwidth.
  vec4_t<scalar_t> const* vec_input =
@@ -311,10 +509,26 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
    }

    q8x4_t<scalar_out_t> out;

    float scale_val;

    if constexpr (group_size > 0) {
      int64_t const num_groups = hidden_size / group_size;
      int64_t scale_idx = 0;
      if constexpr (is_scale_transposed) {
        scale_idx = (i * VEC_SIZE / group_size) * gridDim.x + blockIdx.x;
      } else {
        scale_idx = blockIdx.x * num_groups + i * VEC_SIZE / group_size;
      }
      scale_val =
          is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx];
    } else {
      scale_val = *scale;
    }
 #pragma unroll
    for (int j = 0; j < VEC_SIZE; ++j) {
      out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale);
          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale_val);
    }
    vec_output[i] = out;
  }
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@@ -617,7 +617,7 @@ struct MacheteCollectiveMma {

  // Same as upstream, should be kept the same when possible, not formatted for
  // easier comparison
  //   with `SwapAB ? N : M -> M` since we dont support SwapAB
  //   with `SwapAB ? N : M -> M` since we don't support SwapAB
  // clang-format off
  template<class ProblemShape>
  static bool
--- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@@ -136,15 +136,17 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
 void get_cutlass_moe_mm_problem_sizes_caller(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
    std::optional<bool> force_swap_ab = std::nullopt) {
  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
  auto options_int32 =
      torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);

  // Swap-AB should be disabled for FP4 path
  bool may_swap_ab = (!blockscale_offsets.has_value()) &&
                     (topk_ids.numel() <= SWAP_AB_THRESHOLD);
  bool may_swap_ab =
      force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
                             (topk_ids.numel() <= SWAP_AB_THRESHOLD));

  launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
                               atomic_buffer, num_experts, n, k, stream,
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@@ -80,7 +80,8 @@ void get_cutlass_moe_mm_data_caller(
 void get_cutlass_moe_mm_problem_sizes_caller(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
    std::optional<bool> force_swap_ab = std::nullopt);

 void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
                                         torch::Tensor& problem_sizes1,
@@ -303,14 +304,15 @@ void get_cutlass_moe_mm_data(
 void get_cutlass_moe_mm_problem_sizes(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
    std::optional<bool> force_swap_ab = std::nullopt) {
  int32_t version_num = get_sm_version_num();
 #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) ||   \
    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
    (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
  get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
                                          problem_sizes2, num_experts, n, k,
                                          blockscale_offsets);
                                          blockscale_offsets, force_swap_ab);
  return;
 #endif
  TORCH_CHECK_NOT_IMPLEMENTED(
--- a/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
@@ -206,6 +206,191 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
 #undef LAUNCH_KERNEL
 }

 // Quantizes `input` to 8 bit in groups of `group_size` consecutive elements
 // and stores one power-of-two scale exponent per group, four exponents packed
 // into each 32-bit word of `output_s_packed`.
 //
 // Work decomposition (from the index math below): 16 threads cooperate on one
 // group, and block `blockIdx.x` handles groups
 // [blockIdx.x * groups_per_block, (blockIdx.x + 1) * groups_per_block).
 //
 // Parameters:
 //   input            source elements, indexed as group_id * group_size + i.
 //   output_q         quantized output, same flat indexing, cast to DST_DTYPE.
 //   output_s_packed  packed exponents; word index is
 //                    (sf_k_idx / 4) * tma_aligned_mn + mn_idx. Exponents are
 //                    merged with atomicOr, so the buffer must be zero before
 //                    the launch.
 //   group_size       number of elements per quantization group.
 //   num_groups       total number of groups; threads past it exit early.
 //   groups_per_block number of groups assigned to each block.
 //   groups_per_row   number of groups per row; splits the flat group id into
 //                    (mn_idx, sf_k_idx).
 //   mn               number of rows; scales are only written for mn_idx < mn.
 //   tma_aligned_mn   leading dimension (in words) of output_s_packed.
 //   eps              initial value (lower bound) of the per-group absmax.
 //   min_8bit/max_8bit clamp range applied to the scaled values.
 //
 // Dynamic shared memory: indexed as local_group_id * group_size elements of
 // T, so the launch must provide room for every group of the block.
 template <typename T, typename DST_DTYPE>
 __global__ void per_token_group_quant_8bit_packed_kernel(
    const T* __restrict__ input, void* __restrict__ output_q,
    unsigned int* __restrict__ output_s_packed, const int group_size,
    const int num_groups, const int groups_per_block, const int groups_per_row,
    const int mn, const int tma_aligned_mn, const float eps,
    const float min_8bit, const float max_8bit) {
  const int threads_per_group = 16;
  const int64_t local_group_id = threadIdx.x / threads_per_group;
  const int lane_id = threadIdx.x % threads_per_group;

  const int64_t block_group_id = blockIdx.x * groups_per_block;
  const int64_t global_group_id = block_group_id + local_group_id;
  // NOTE(review): threads leaving here never reach the __syncthreads() below.
  // That is only safe if no block is partially out of range (i.e. num_groups
  // is a multiple of groups_per_block) - confirm for every caller.
  if (global_group_id >= num_groups) {
    return;
  }

  const int64_t block_group_offset = global_group_id * group_size;

  // Seeding with eps keeps the scale strictly positive for all-zero groups.
  float local_absmax = eps;

  const T* group_input = input + block_group_offset;
  DST_DTYPE* group_output =
      static_cast<DST_DTYPE*>(output_q) + block_group_offset;

  // shared memory to cache each group's data to avoid double DRAM reads.
  extern __shared__ __align__(16) char smem_raw[];
  T* smem = reinterpret_cast<T*>(smem_raw);
  T* smem_group = smem + local_group_id * group_size;

  // 16-byte vectors: vec_size elements of T per vectorized access.
  constexpr int vec_size = 16 / sizeof(T);
  using vec_t = vllm::vec_n_t<T, vec_size>;

  // copy global -> shared & compute absmax
  auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
    float abs_v = fabsf(static_cast<float>(src));
    local_absmax = fmaxf(local_absmax, abs_v);
    dst = src;
  };

  vllm::vectorize_with_alignment<vec_size>(
      group_input,        // in
      smem_group,         // out (shared)
      group_size,         // elements per group
      lane_id,            // thread id
      threads_per_group,  // stride in group
      scalar_op_cache);   // scalar handler

  // GroupReduceMax is defined elsewhere; presumably it combines the per-lane
  // maxima of the 16 threads of this group - TODO confirm.
  local_absmax = GroupReduceMax(local_absmax);

  // Round the scale up to the next power of two so that it is fully described
  // by its exponent.
  float y_s = local_absmax / max_8bit;
  y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));

  // pack 4 scales into a uint32
  if (lane_id == 0) {
    // map flat group id to 2D indices (mn_idx, sf_k_idx)
    const int sf_k_idx = static_cast<int>(global_group_id % groups_per_row);
    const int mn_idx = static_cast<int>(global_group_id / groups_per_row);

    if (mn_idx < mn) {
      // each uint32 in output_s_packed stores 4 packed scales
      const int sf_k_pack_idx = sf_k_idx / 4;
      const int pos = sf_k_idx % 4;

      // reinterpret the UE8M0 scale y_s as IEEE bits, extract the 8-bit
      // exponent, and place it into the correct byte of the 32-bit word.
      const unsigned int bits = __float_as_uint(y_s);
      const unsigned int exponent = (bits >> 23u) & 0xffu;
      const unsigned int contrib = exponent << (pos * 8u);

      const int out_idx = sf_k_pack_idx * tma_aligned_mn + mn_idx;
      // atomically OR 8-bit exponent into the packed scales buffer
      atomicOr(output_s_packed + out_idx, contrib);
    }
  }

  // All cached elements must be visible in shared memory before any thread
  // reads them back in the quantization pass.
  __syncthreads();

  // quantize shared -> global 8-bit
  auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
    float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
    dst = DST_DTYPE(q);
  };

  vllm::vectorize_with_alignment<vec_size>(
      smem_group,         // in (shared)
      group_output,       // out (global quant tensor)
      group_size,         // elements
      lane_id,            // tid
      threads_per_group,  // stride
      scalar_op_quant);   // scalar handler
 }

 // Host launcher for per_token_group_quant_8bit_packed_kernel.
 //
 // Quantizes `input` in groups of `group_size` elements along the last
 // dimension into `output_q` (FP8 e4m3 or INT8) and writes the per-group
 // power-of-two scale exponents, four per 32-bit word, into `output_s_packed`.
 //
 // Arguments:
 //   input            contiguous floating-point tensor; last dim divisible by
 //                    group_size.
 //   output_q         contiguous tensor of dtype Float8_e4m3fn or Char.
 //   output_s_packed  2D int32 tensor of shape
 //                    [mn, ceil(groups_per_row / 4)]; it is zeroed here.
 //   group_size       elements per quantization group (must be > 0).
 //   eps              lower bound for the per-group absmax.
 //   min_8bit/max_8bit clamp range of the quantized values.
 //
 // Raises (via TORCH_CHECK) on non-contiguous tensors, a non-positive
 // group_size, a non-divisible last dimension, a wrongly shaped/typed scale
 // tensor, or an unsupported output dtype. Empty inputs are a no-op.
 void per_token_group_quant_8bit_packed(const torch::Tensor& input,
                                       torch::Tensor& output_q,
                                       torch::Tensor& output_s_packed,
                                       int64_t group_size, double eps,
                                       double min_8bit, double max_8bit) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(output_q.is_contiguous());
  // Guard the modulo below against a zero (or negative) divisor.
  TORCH_CHECK(group_size > 0, "group_size must be positive, but got ",
              group_size, ".");

  const int64_t k = input.size(-1);
  TORCH_CHECK(k % group_size == 0, "Last dimension (", k,
              ") must be divisible by group_size (", group_size, ").");

  // Nothing to quantize. Returning here also avoids dividing by k == 0 below
  // and launching a kernel with a zero-sized grid, which CUDA rejects as an
  // invalid configuration.
  if (input.numel() == 0) {
    return;
  }

  const int64_t mn = input.numel() / k;
  const int64_t groups_per_row = k / group_size;
  const int64_t num_groups = mn * groups_per_row;

  TORCH_CHECK(output_s_packed.dim() == 2,
              "output_s_packed must be 2D, got dim=", output_s_packed.dim(),
              ".");

  // Four scale exponents are packed per 32-bit word; the leading dimension
  // passed to the kernel is mn rounded up to a multiple of 4.
  const int64_t k_num_packed_sfk = (groups_per_row + 3) / 4;
  const int64_t tma_aligned_mn = ((mn + 3) / 4) * 4;

  TORCH_CHECK(output_s_packed.scalar_type() == at::ScalarType::Int,
              "output_s_packed must have dtype int32 for UE8M0-packed scales.");
  // DeepGEMM expects SFA scales in MN-major form with shape
  // [mn, ceil_div(K, 128 * 4)] and TMA-aligned stride on the last
  // dimension.
  // NOTE(review): only the shape is validated; the kernel addresses word
  // (mn_idx, pack_idx) at pack_idx * tma_aligned_mn + mn_idx, so the caller
  // presumably allocates strides (1, tma_aligned_mn) - confirm.
  TORCH_CHECK(output_s_packed.size(0) == mn &&
                  output_s_packed.size(1) == k_num_packed_sfk,
              "output_s_packed shape must be [", mn, ", ", k_num_packed_sfk,
              "], but got [", output_s_packed.size(0), ", ",
              output_s_packed.size(1), "].");

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  constexpr int THREADS_PER_GROUP = 16;

  // Pick the largest power of two (<= 16) that divides num_groups so that
  // every block is fully populated and no thread exits before the kernel's
  // block-wide barrier.
  int groups_per_block = 1;
  for (int candidate = 16; candidate > 1; candidate /= 2) {
    if (num_groups % candidate == 0) {
      groups_per_block = candidate;
      break;
    }
  }

  auto dst_type = output_q.scalar_type();
  const int num_blocks = num_groups / groups_per_block;
  const int num_threads = groups_per_block * THREADS_PER_GROUP;

  // zero-initialize packed scales, since we use atomicOr to accumulate
  // exponents from different groups.
  output_s_packed.zero_();

 #define LAUNCH_PACKED_KERNEL(T, DST_DTYPE)                                \
  do {                                                                    \
    dim3 grid(num_blocks);                                                \
    dim3 block(num_threads);                                              \
    size_t smem_bytes =                                                   \
        static_cast<size_t>(groups_per_block) * group_size * sizeof(T);   \
    per_token_group_quant_8bit_packed_kernel<T, DST_DTYPE>                \
        <<<grid, block, smem_bytes, stream>>>(                            \
            static_cast<const T*>(input.data_ptr()), output_q.data_ptr(), \
            reinterpret_cast<unsigned int*>(output_s_packed.data_ptr()),  \
            static_cast<int>(group_size), static_cast<int>(num_groups),   \
            groups_per_block, static_cast<int>(groups_per_row),           \
            static_cast<int>(mn), static_cast<int>(tma_aligned_mn),       \
            static_cast<float>(eps), static_cast<float>(min_8bit),        \
            static_cast<float>(max_8bit));                                \
  } while (0)

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "per_token_group_quant_8bit_packed", ([&] {
        if (dst_type == at::ScalarType::Float8_e4m3fn) {
          LAUNCH_PACKED_KERNEL(scalar_t, __nv_fp8_e4m3);
        } else if (dst_type == at::ScalarType::Char) {
          LAUNCH_PACKED_KERNEL(scalar_t, int8_t);
        } else {
          TORCH_CHECK(
              false,
              "per_token_group_quant_8bit_packed only supports FP8/INT8 "
              "outputs.");
        }
      }));

 #undef LAUNCH_PACKED_KERNEL
 }

 void per_token_group_quant_fp8(const torch::Tensor& input,
                               torch::Tensor& output_q, torch::Tensor& output_s,
                               int64_t group_size, double eps, double fp8_min,
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
 }
 #endif  // defined(__HIP__GFX9__) TODO: Add NAVI support

 // Find the min val of div2 that doesn't increase N/(div1*div2)
 //
 // With `div1 * div2` work items handled per round, ceil(N / (div1 * div2))
 // rounds are needed. This returns the smallest value in [div2 - 12, div2]
 // that still needs exactly that many rounds.
 //
 //   N     total number of work items.
 //   div1  fixed factor of the per-round capacity (expected > 0).
 //   div2  upper bound of the factor being minimised (expected > 0).
 //
 // Candidates whose per-round capacity would drop to zero or below are not
 // considered, so div2 <= 12 no longer divides by zero.
 int mindiv(int N, int div1, int div2) {
  int nPrRnd = div1 * div2;
  // Rounds needed with the full div2; every candidate is compared to this.
  const int baseRounds = (N + nPrRnd - 1) / nPrRnd;

  int best = div2;
  for (int i = 1; i < 13; i++) {
    nPrRnd -= div1;
    if (nPrRnd <= 0) break;
    // Keep the last (i.e. smallest) candidate that matches the base rounds.
    if ((N + nPrRnd - 1) / nPrRnd == baseRounds) best = div2 - i;
  }
  return best;
 }

 torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
@@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int max_lds_len = get_lds_size() / 2;

 #define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
                 _N)                                                          \
  {                                                                           \
    dim3 block(64, _WvPrGrp);                                                 \
    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {              \
      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp);              \
      wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N>          \
          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
                                       biasf4, c, __wvPrGrp, CuCount);        \
    } else if (K_in * N_in <= max_lds_len * 1.2) {                            \
      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp);              \
      wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N>              \
          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
                                       biasf4, c, __wvPrGrp, CuCount);        \
    } else {                                                                  \
      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp);              \
      wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N>          \
          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
                                       biasf4, c, __wvPrGrp, CuCount);        \
    }                                                                         \
 #define WVSPLITK(_YTILE, _UNRL, _N)                                        \
  {                                                                        \
    dim3 block(64, 16);                                                    \
    int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16);                    \
    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0))              \
      wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
                                       biasf4, c, __wvPrGrp, CuCount);     \
    else if (K_in * N_in <= max_lds_len * 1.2)                             \
      wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                   \
          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
                                       biasf4, c, __wvPrGrp, CuCount);     \
    else                                                                   \
      wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
                                       biasf4, c, __wvPrGrp, CuCount);     \
  }

 #define WVSPLIT_TILE(_sYT, __N)                           \
  {                                                       \
    bool fit_lds = (K_in * N_in <= max_lds_len);          \
    if (_sYT <= 1)                                        \
      WVSPLITK(1, 4, __N)                                 \
    else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
      WVSPLITK(2, 2, __N)                                 \
    else if (_sYT <= 4 * 3)                               \
      WVSPLITK(3, 2, __N)                                 \
    else if (__N == 4)                                    \
      WVSPLITK(4, 1, __N)                                 \
    else                                                  \
      WVSPLITK(4, 2, __N)                                 \
  }

  AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
@@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
            ? reinterpret_cast<const fptype*>(in_bias->data_ptr())
            : nullptr;
    fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());

    // first shoot for biggest tile-size that keeps all simd busy,
    // then cut the active waves to balance their distribution...
    int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);

    switch (N_in) {
      case 1:
        WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1)
        WVSPLIT_TILE(sYT, 1)
        break;
      case 2:
        WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2)
        WVSPLIT_TILE(sYT, 2)
        break;
      case 3:
        WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3)
        WVSPLIT_TILE(sYT, 3)
        break;
      case 4:
        WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4)
        WVSPLIT_TILE(sYT, 4)
        break;
      default:
        throw std::runtime_error(
--- a/csrc/sampler.cu
+++ b/csrc/sampler.cu
@@ -44,41 +44,300 @@ __global__ void apply_repetition_penalties_kernel(
  }
 }

 static inline __device__ uint16_t extractBinIdx(float x) {
  union {
    __half h;
    uint16_t u16;
  } tmp;
  tmp.h = __float2half_rn(x);
  tmp.u16 = (x < 0.f) ? (~tmp.u16 & 0xffff) : (tmp.u16 | 0x8000);
  return 511 - (tmp.u16 >> 7);
 // Turns a float into a 32-bit key: a bit pattern with the sign bit set is
 // returned unchanged, any other pattern is inverted with the sign bit cleared.
 __device__ __forceinline__ auto convert_to_uint32(float x) -> uint32_t {
  const uint32_t raw = __float_as_uint(x);
  if (raw & 0x80000000) {
    return raw;
  }
  return ~raw & 0x7fffffff;
 }

 template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
 __device__ void topKPerRowJob(const float* logits, const int rowStart,
                              const int rowEnd, const int rowIdx,
                              int* outIndices, int stride0, int stride1) {
  // The number of elements per thread for the final top-k sort.
  static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
  // The class to sort the elements during the final top-k sort.
  using TopKSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
                                       kNumTopKItemsPerThread, int>;
 // Returns the histogram bin of `x` for refinement pass `step`.
 //
 // The value is first turned into an unsigned key: patterns with the sign bit
 // set are kept, all others are inverted with the sign bit cleared. Larger
 // values therefore get smaller keys.
 //
 //   step 0: top 11 bits of the 16-bit key of the value converted to half.
 //   step 1: top 11 bits (31..21) of the 32-bit float key.
 //   step 2: middle 11 bits (20..10) of the 32-bit float key.
 //   step 3: low 10 bits (9..0) of the 32-bit float key.
 //
 // Any other `step` is rejected at compile time; previously such an
 // instantiation fell off the end of the function without returning a value.
 template <int step>
 static inline __device__ uint32_t extractBinIdx(float x) {
  static_assert(step >= 0 && step <= 3,
                "extractBinIdx only supports steps 0, 1, 2 and 3");
  if constexpr (step == 0) {
    __half hx = __float2half(x);
    uint16_t bits = __half_as_ushort(hx);
    bits = (bits & 0x8000) ? bits : ~bits & 0x7fff;
    return bits >> 5;
  } else {
    uint32_t bits = __float_as_uint(x);
    bits = (bits & 0x80000000) ? bits : ~bits & 0x7fffffff;

    if constexpr (step == 1) {
      return bits >> 21;
    } else if constexpr (step == 2) {
      return (bits >> 10) & 0x7ff;
    } else {
      // step == 3, guaranteed by the static_assert above.
      return bits & 0x3ff;
    }
  }
 }

 // Tells whether the 32-bit key of `x` agrees with `pattern` on every bit at
 // position `shift` and above. The key keeps bit patterns with the sign bit
 // set and inverts (sign bit cleared) all others. A shift of 0 disables the
 // filter: every value matches.
 template <int shift>
 static inline __device__ bool isPartialMatch(float x, uint32_t pattern) {
  if constexpr (shift == 0) {
    return true;
  } else {
    const uint32_t raw = __float_as_uint(x);
    const uint32_t key = (raw & 0x80000000) ? raw : (~raw & 0x7fffffff);
    const uint32_t mismatch = (key ^ pattern) >> shift;
    return mismatch == 0;
  }
 }

 /**
 * Map a Func over the input data, using vectorized load instructions if
 * possible.
 *
 * Elements are visited in three parts: an unaligned head of `skip_cnt`
 * elements (up to the first float4-aligned address), the aligned body read
 * one float4 at a time, and the tail that does not fill a whole float4.
 * Head and tail are handled without a loop, one element per thread, so at
 * least `sizeof(float4) / sizeof(T)` threads must participate.
 *
 * @tparam T element type
 * @tparam idxT indexing type
 * @tparam Func void (T x, idxT idx)
 *
 * @param thread_rank rank of the calling thread among all participating threads
 * @param num_threads number of the threads that participate in processing
 * @param in the input data
 * @param len the number of elements to read
 * @param f the lambda taking two arguments (T x, idxT idx); idx is the
 *          element's offset from `in`
 */
 template <typename T, typename idxT, typename Func>
 __device__ void vectorized_process(size_t thread_rank, size_t num_threads,
                                   const T* in, idxT len, Func f) {
  constexpr int WARP_SIZE = 32;
  using WideT = float4;
  // Elements at least as wide as a float4 cannot be packed: plain strided
  // loop.
  if constexpr (sizeof(T) >= sizeof(WideT)) {
    for (idxT i = thread_rank; i < len; i += num_threads) {
      f(in[i], i);
    }
  } else {
    static_assert(sizeof(WideT) % sizeof(T) == 0);
    constexpr int items_per_scalar = sizeof(WideT) / sizeof(T);
    // TODO: it's UB - `scalar` is written and `array` is read below, which is
    // type punning through a union.
    union {
      WideT scalar;
      T array[items_per_scalar];
    } wide;

    // Number of leading elements before the first float4-aligned address
    // (0 if `in` is already aligned), clamped to len.
    int skip_cnt =
        (reinterpret_cast<size_t>(in) % sizeof(WideT))
            ? ((sizeof(WideT) - reinterpret_cast<size_t>(in) % sizeof(WideT)) /
               sizeof(T))
            : 0;
    if (skip_cnt > len) {
      skip_cnt = len;
    }
    // Aligned body: len_cast whole float4 words starting at in + skip_cnt.
    const WideT* in_cast = reinterpret_cast<decltype(in_cast)>(in + skip_cnt);
    const idxT len_cast = (len - skip_cnt) / items_per_scalar;

    for (idxT i = thread_rank; i < len_cast; i += num_threads) {
      wide.scalar = in_cast[i];
      // Index (relative to `in`) of the first element of this word.
      const idxT real_i = skip_cnt + i * items_per_scalar;
 #pragma unroll
      for (int j = 0; j < items_per_scalar; ++j) {
        f(wide.array[j], real_i + j);
      }
    }

    static_assert(WARP_SIZE >= items_per_scalar);
    // and because items_per_scalar > skip_cnt, WARP_SIZE > skip_cnt
    // no need to use loop
    // (head: thread r handles element r for r < skip_cnt)
    if (thread_rank < skip_cnt) {
      f(in[thread_rank], thread_rank);
    }
    // because len_cast = (len - skip_cnt) / items_per_scalar,
    // len_cast * items_per_scalar + items_per_scalar > len - skip_cnt;
    // and so
    // len - (skip_cnt + len_cast * items_per_scalar) < items_per_scalar <=
    // WARP_SIZE no need to use loop
    // (tail: thread r handles the r-th element after the aligned body)
    const idxT remain_i = skip_cnt + len_cast * items_per_scalar + thread_rank;
    if (remain_i < len) {
      f(in[remain_i], remain_i);
    }
  }
 }

 // One histogram refinement pass of the block-wide top-k selection over the
 // row logits[rowStart, rowEnd).
 //
 // The pass (1) histograms the bin of every element that matches the bit
 // prefix in `logitPattern`, (2) prefix-sums the histogram, starting at the
 // number of elements already selected, to find the threshold bin in which
 // the running count reaches `topK`, and (3) re-reads the row: elements in
 // bins before the threshold are appended to `smemOutput`; elements of the
 // threshold bin are either staged in `smemFinal.items` for a final sort
 // (steps 0-2, only if the bin holds at most kNumFinalItems elements) or, at
 // step 3, appended directly until `topK` slots are filled.
 //
 // Template parameters:
 //   step                  pass number 0..3; selects the key bits being binned.
 //   kNumThreadsPerBlock   number of threads in the block.
 //   kNumBins              histogram size; processed kNumThreadsPerBlock bins
 //                         per round.
 //   kNumFinalItems        capacity of the staging area for the final sort.
 //   multipleBlocksPerRow  store `idx + rowStart` and also store the logit
 //                         itself behind the indices in smemOutput.
 //   mergeBlocks           store indices[idx] instead of idx.
 //
 // Parameters:
 //   indices               index lookup table, read only when mergeBlocks.
 //   logits                logits of the row, element i at logits[i * stride1].
 //   rowStart, rowEnd      half-open range of the row to process.
 //   logitPattern          in/out: key prefix fixed by the previous passes.
 //   thresholdBinIdx       in: threshold bin of the previous pass;
 //                         out: threshold bin found by this pass.
 //   smemOutput            selected indices (and logits, see above).
 //   smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
 //   smemFoundTopKValues   single-element shared-memory scalars.
 //   smemFinal             shared union holding the histogram/scan storage and
 //                         the staged final items.
 //   topK                  number of elements to select.
 //
 // Every index handed to the callbacks is relative to rowStart, on both the
 // vectorized and the strided path. The strided path used to pass absolute
 // indices, which shifted the emitted indices by rowStart whenever
 // stride1 != 1.
 //
 // Returns true if the threshold bin holds more than kNumFinalItems elements,
 // i.e. another refinement pass is required.
 //
 // NOTE(review): steps 0 and 1 both use patternShift == 0 (no prefix filter)
 // while the count in smemFoundTopKValues carries over between calls - confirm
 // that a step 0 -> step 1 transition cannot append an element twice.
 template <int step, int kNumThreadsPerBlock, int kNumBins, int kNumFinalItems,
          bool multipleBlocksPerRow, bool mergeBlocks, typename SmemFinalType,
          typename SmemOutputType>
 __device__ bool processHistogramStep(
    const int* indices, const float* logits, int rowEnd, uint32_t& logitPattern,
    int& thresholdBinIdx, SmemOutputType& smemOutput, int* smemThresholdBinIdx,
    int* smemFinalDstIdx, int* smemFinalBinSize, int* smemFoundTopKValues,
    SmemFinalType& smemFinal, int stride1, int rowStart, int topK) {
  // Number of elements in the row.
  const int rowLen = rowEnd - rowStart;

  // Clear the histogram.
 #pragma unroll
  for (int idx = threadIdx.x; idx < kNumBins; idx += kNumThreadsPerBlock) {
    smemFinal.histo.data[idx] = 0;
  }

  // Make sure the histogram is ready.
  __syncthreads();

  // Update pattern: steps 2 and 3 only consider elements whose key agrees
  // with the threshold bins chosen by the previous passes.
  constexpr auto patternShift = step < 2 ? 0 : step == 2 ? 21 : 10;
  if constexpr (step == 2) {
    logitPattern = static_cast<uint32_t>(thresholdBinIdx & 0x7ff)
                   << patternShift;
  } else if constexpr (step == 3) {
    logitPattern |= static_cast<uint32_t>(thresholdBinIdx & 0x7ff)
                    << patternShift;
  }

  auto distributeToBins = [&](float logit, int /* idx */ = 0) {
    if (isPartialMatch<patternShift>(logit, logitPattern)) {
      uint32_t binIdx = extractBinIdx<step>(logit);
      atomicAdd(&smemFinal.histo.data[binIdx], 1);
    }
  };

  // Distribute the elements to the histogram bins.
  if (stride1 == 1) {
    vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart,
                       rowLen, distributeToBins);
  } else {
    // Strided fallback; idx is relative to rowStart like on the vectorized
    // path.
    for (int idx = threadIdx.x; idx < rowLen; idx += kNumThreadsPerBlock) {
      float logit = logits[(rowStart + idx) * stride1];
      distributeToBins(logit, idx);
    }
  }
  // Make sure the histogram is ready.
  __syncthreads();

  // Reads the value of the starting position in the smemOutput array
  int lastValue = smemFoundTopKValues[0];

  for (int round = 0; round < kNumBins / kNumThreadsPerBlock; round++) {
    // Read the values from SMEM.
    int idx = threadIdx.x + kNumThreadsPerBlock * round;
    int binCount{0};
    binCount = smemFinal.histo.data[idx];

    // Make sure each thread has read its value.
    __syncthreads();

    // Compute the prefix sum.
    int prefixSum{0}, totalSum{0};
    using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
    Scan(smemFinal.histo.scan).ExclusiveSum(binCount, prefixSum, totalSum);

    // Update the histogram with the prefix sums.
    prefixSum += lastValue;
    totalSum += lastValue;
    smemFinal.histo.data[idx] = prefixSum;

    // Make sure the data is in shared memory.
    __syncthreads();

    // Find the last valid bin: the one where the running count crosses topK.
    bool foundThreshold = false;
    if (prefixSum < topK) {
      // The last thread of a round has no successor bin in this round; its
      // upper bound is the running total.
      int nextPrefixSum = threadIdx.x == kNumThreadsPerBlock - 1
                              ? totalSum
                              : smemFinal.histo.data[idx + 1];

      if (nextPrefixSum >= topK) {
        smemThresholdBinIdx[0] = idx;
        smemFinalBinSize[0] = nextPrefixSum - prefixSum;
        foundThreshold = true;
      }
    }

    // Early exit: if any thread found the threshold, we can skip remaining
    // rounds
    if (__syncthreads_or(foundThreshold)) {
      break;
    }

    lastValue = totalSum;
  }

  // Make sure the data is in shared memory.
  __syncthreads();

  // The threshold bin.
  thresholdBinIdx = smemThresholdBinIdx[0];

  // `idx` is the element's offset from rowStart.
  auto processBins = [&](float logit, int idx) {
    if (isPartialMatch<patternShift>(logit, logitPattern)) {
      uint32_t binIdx = extractBinIdx<step>(logit);
      if (binIdx < thresholdBinIdx) {
        // The element is part of the top-k selection
        int dstIdx = atomicAdd(&smemFoundTopKValues[0], 1);

        if constexpr (mergeBlocks) {
          smemOutput[dstIdx] = indices[idx];
        } else if constexpr (multipleBlocksPerRow) {
          smemOutput[dstIdx] = idx + rowStart;
          reinterpret_cast<float*>(smemOutput + topK)[dstIdx] = logit;
        } else {
          smemOutput[dstIdx] = idx;
        }
      }
      if constexpr (step < 3) {
        // Only fill the final items for sorting if the threshold bin fits
        if (binIdx == thresholdBinIdx &&
            smemFinalBinSize[0] <= kNumFinalItems) {
          int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
          smemFinal.items.logits[dstIdx] = logit;
          if constexpr (mergeBlocks) {
            smemFinal.items.indices[dstIdx] = indices[idx];
          } else if constexpr (multipleBlocksPerRow) {
            smemFinal.items.indices[dstIdx] = idx + rowStart;
          } else {
            smemFinal.items.indices[dstIdx] = idx;
          }
        }
      } else {
        if (binIdx == thresholdBinIdx) {
          // The elements in the threshold bin share the same 32 bits at step 3
          // The bin's histogram slot holds its prefix sum, so incrementing it
          // yields consecutive output positions; anything past topK is
          // dropped.
          int dstIdx = atomicAdd(&smemFinal.histo.data[binIdx], 1);
          if (dstIdx < topK) {
            if constexpr (mergeBlocks) {
              smemOutput[dstIdx] = indices[idx];
            } else if constexpr (multipleBlocksPerRow) {
              smemOutput[dstIdx] = idx + rowStart;
              reinterpret_cast<float*>(smemOutput + topK)[dstIdx] = logit;
            } else {
              smemOutput[dstIdx] = idx;
            }
          }
        }
      }
    }
  };

  if (stride1 == 1) {
    vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart,
                       rowLen, processBins);
  } else {
    // Strided fallback; pass the row-relative index so that the indices
    // written by processBins match the vectorized path.
    for (int idx = threadIdx.x; idx < rowLen; idx += kNumThreadsPerBlock) {
      float logit = logits[(rowStart + idx) * stride1];
      processBins(logit, idx);
    }
  }

  // Make sure the elements are in shared memory.
  __syncthreads();

  // Check if we should continue to next step
  return smemFinalBinSize[0] > kNumFinalItems;
 }

 // Follows half - 11 - 11 - 10 bit iterations
 template <int kNumThreadsPerBlock, int kNumBins, bool useRadixSort,
          bool multipleBlocksPerRow = false, bool mergeBlocks = false>
 static __device__ void topKPerRowJob(const int* indices, const float* logits,
                                     int rowStart, int rowEnd, int* outIndices,
                                     float* outLogits, int stride1, int topK) {
  // The number of slots for the final pass.
  static constexpr int kNumFinalItems = 3072;
  static constexpr int kNumFinalItems = 2048;
  // The number of elements per thread for the final sort.
  static constexpr int kNumFinalItemsPerThread =
      kNumFinalItems / kNumThreadsPerBlock;
  // The class to sort the elements during the final pass.
  using FinalSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
                                        kNumFinalItemsPerThread, int>;

  using FinalSortTempStorage =
      std::conditional_t<useRadixSort, typename FinalSort::TempStorage, int>;
  // The class to compute the inclusive prefix-sum over the histogram.
  using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;

  // Shared memory to compute the block scan.
  __shared__ typename Scan::TempStorage smemScan;

  // The structure to store the final items (for the final pass).
  struct FinalItems {
    // Shared memory to store the indices for the final pass.
@@ -87,200 +346,225 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
    float logits[kNumFinalItems];
  };

  struct Histogram {
    typename Scan::TempStorage scan;
    int data[kNumBins];
  };

  // Shared memory to compute the block sort.
  __shared__ union {
    FinalItems items;
    typename FinalSort::TempStorage finalSort;
    typename TopKSort::TempStorage topKSort;
    FinalSortTempStorage finalSort;
    Histogram histo;
  } smemFinal;

  // Shared memory to store the histogram.
  __shared__ int smemHistogram[kNumBins];
  // Shared memory to store the selected indices.
  __shared__ int smemIndices[kTopK];
  // If we are processing using multiple blocks, we need to store the logits and
  // indices.
  extern __shared__ int32_t smemOutput[];

  // Shared memory to store the threshold bin.
  __shared__ int smemThresholdBinIdx[1];
  // Shared memory counter to register the candidates for the final phase.
  __shared__ int smemFinalDstIdx[1];
  // Shared memory to determine if the threshold bin fits in the final items.
  __shared__ int smemFinalBinSize[1];
  // Shared memory to keep track of the top-k values found so far by the
  // previous iterations
  __shared__ int smemFoundTopKValues[1];

  // The length of the row.
  int rowLen = rowEnd - rowStart;

  // Shortcut if the length of the row is smaller than Top-K. Indices are not
  // sorted by their corresponding logit.
  if (rowLen <= kTopK) {
  if (rowLen <= topK) {
    for (int rowIt = threadIdx.x; rowIt < rowLen;
         rowIt += kNumThreadsPerBlock) {
      int idx = rowStart + rowIt;
      outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
      if constexpr (multipleBlocksPerRow) {
        outIndices[rowIt] = rowIt + rowStart;
        outLogits[rowIt] = logits[rowIt + rowStart];
      } else {
        outIndices[rowIt] = rowIt;
      }
    }
    for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
    for (int rowIt = rowLen + threadIdx.x; rowIt < topK;
         rowIt += kNumThreadsPerBlock) {
      outIndices[rowIdx * kTopK + rowIt] = -1;
      outIndices[rowIt] = -1;
      if constexpr (multipleBlocksPerRow) {
        outLogits[rowIt] = -FLT_MAX;
      }
    }
    return;
  }

  // Clear the histogram.
  if (threadIdx.x < kNumBins) {
    smemHistogram[threadIdx.x] = 0;
  }

  // Make sure the histogram is ready.
  __syncthreads();

  // Fetch elements one-by-one.
  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
       rowIt += kNumThreadsPerBlock) {
    uint16_t idx = extractBinIdx(logits[rowIdx * stride0 + rowIt * stride1]);
    atomicAdd(&smemHistogram[idx], 1);
  }

  // Make sure the histogram is ready.
  __syncthreads();

  // Read the values from SMEM.
  int binCount{0};
  if (threadIdx.x < kNumBins) {
    binCount = smemHistogram[threadIdx.x];
  }

  // Make sure each thread has read its value.
  __syncthreads();

  // Compute the prefix sum.
  int prefixSum{0}, totalSum{0};
  Scan(smemScan).ExclusiveSum(binCount, prefixSum, totalSum);

  // Update the histogram with the prefix sums.
  if (threadIdx.x < kNumBins) {
    smemHistogram[threadIdx.x] = prefixSum;
  }

  // Make sure the data is in shared memory.
  __syncthreads();

  // Find the last valid bin.
  if (threadIdx.x < kNumBins) {
    int nextPrefixSum =
        threadIdx.x == kNumBins - 1 ? totalSum : smemHistogram[threadIdx.x + 1];
    if (prefixSum < kTopK && nextPrefixSum >= kTopK) {
      smemThresholdBinIdx[0] = threadIdx.x;
    }
    return;
  }

  // Clear the counter to store the items for the final phase.
  // Initialize values
  if (threadIdx.x == 0) {
    smemFinalDstIdx[0] = 0;
    smemFoundTopKValues[0] = 0;
  }

  // Make sure the data is in shared memory.
  __syncthreads();
  int thresholdBinIdx = -1;
  uint32_t logitPattern = 0;

  // Step 0: Process first 11 bits of half representation
  bool continueToNextStep =
      processHistogramStep<0, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
                           multipleBlocksPerRow, mergeBlocks>(
          indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
          smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
          smemFoundTopKValues, smemFinal, stride1, rowStart, topK);

  if (continueToNextStep) {
    // Step 1: Process next 11 bits
    continueToNextStep =
        processHistogramStep<1, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
                             multipleBlocksPerRow, mergeBlocks>(
            indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
            smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
            smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
  }

  // The threshold bin.
  int thresholdBinIdx = smemThresholdBinIdx[0];

  // Fetch elements one-by-one and populate the shared memory buffers.
  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
       rowIt += kNumThreadsPerBlock) {
    float logit = logits[rowIdx * stride0 + rowIt * stride1];
    uint16_t idx = extractBinIdx(logit);
    if (idx < thresholdBinIdx) {
      int dstIdx = atomicAdd(&smemHistogram[idx], 1);
      smemIndices[dstIdx] = rowIt;
    } else if (idx == thresholdBinIdx) {
      int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
      if (dstIdx < kNumFinalItems) {
        smemFinal.items.logits[dstIdx] = logit;
        smemFinal.items.indices[dstIdx] = rowIt;
      }
    }
  if (continueToNextStep) {
    // Step 2: Process next 11 bits
    continueToNextStep =
        processHistogramStep<2, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
                             multipleBlocksPerRow, mergeBlocks>(
            indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
            smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
            smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
  }

  // Make sure the elements are in shared memory.
  __syncthreads();
  if (continueToNextStep) {
    // Step 3: Process last 10 bits
    processHistogramStep<3, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
                         multipleBlocksPerRow, mergeBlocks>(
        indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
        smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
        smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
  }

  // The logits of the elements to be sorted in the final pass.
  float finalLogits[kNumFinalItemsPerThread];
  // The indices of the elements to be sorted in the final pass.
  int finalIndices[kNumFinalItemsPerThread];
  if (!continueToNextStep) {
    // The histogram did not proceed to the final 10 bits, therefore we need to
    // sort the final items The logits of the elements to be sorted in the final
    // pass.
    if constexpr (useRadixSort) {
      // Sorting with radix sort
      float finalLogits[kNumFinalItemsPerThread];
      // The indices of the elements to be sorted in the final pass.
      int finalIndices[kNumFinalItemsPerThread];

 // Init.
 #pragma unroll
  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
    finalLogits[ii] = -FLT_MAX;
  }
      for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
        finalLogits[ii] = -FLT_MAX;
      }

 // Read the elements from SMEM.
      // Read the elements from SMEM.
 #pragma unroll
  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
    if (srcIdx < smemFinalDstIdx[0]) {
      finalLogits[ii] = smemFinal.items.logits[srcIdx];
      finalIndices[ii] = smemFinal.items.indices[srcIdx];
    }
  }
      for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
        int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
        if (srcIdx < smemFinalDstIdx[0]) {
          finalLogits[ii] = smemFinal.items.logits[srcIdx];
          finalIndices[ii] = smemFinal.items.indices[srcIdx];
        }
      }
      // Make sure the shared memory has been read.
      __syncthreads();

  // Make sure the shared memory has been read.
  __syncthreads();
      // Sort the elements.
      FinalSort(smemFinal.finalSort)
          .SortDescendingBlockedToStriped(finalLogits, finalIndices);

  // Sort the elements.
  FinalSort(smemFinal.finalSort)
      .SortDescendingBlockedToStriped(finalLogits, finalIndices);
      // Copy the data back to the shared memory storage.
      int baseIdx = smemFoundTopKValues[0];

  // Copy the data back to the shared memory storage.
  int baseIdx = thresholdBinIdx > 0 ? smemHistogram[thresholdBinIdx - 1] : 0;
 #pragma unroll
  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
    int dstIdx = baseIdx + srcIdx;
    if (dstIdx < kTopK) {
      smemIndices[dstIdx] = finalIndices[ii];
      for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
        int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
        int dstIdx = baseIdx + srcIdx;

        if (dstIdx < topK) {
          smemOutput[dstIdx] = finalIndices[ii];
          if constexpr (multipleBlocksPerRow) {
            reinterpret_cast<float*>(smemOutput + topK)[dstIdx] =
                finalLogits[ii];
          }
        }
      }
    } else {
      // Sorting with insertion sort
      auto baseIdx = smemFoundTopKValues[0];
      for (int i = threadIdx.x; i < smemFinalDstIdx[0];
           i += kNumThreadsPerBlock) {
        int outIndex = 0;
        auto logit = smemFinal.items.logits[i];
        for (int j = 0; j < smemFinalDstIdx[0]; j++) {
          auto otherLogit = smemFinal.items.logits[j];
          if (logit < otherLogit || (logit == otherLogit && i < j)) {
            outIndex++;
          }
        }
        // Store if outIndex is in bounds
        if (outIndex + baseIdx < topK) {
          smemOutput[outIndex + baseIdx] = smemFinal.items.indices[i];
          if constexpr (multipleBlocksPerRow) {
            reinterpret_cast<float*>(smemOutput + topK)[outIndex + baseIdx] =
                smemFinal.items.logits[i];
          }
        }
      }
    }
    __syncthreads();
  }

  // Make sure the data is in shared memory.
  __syncthreads();

 // Store to global memory.
 #pragma unroll
  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
    int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
    outIndices[offset] =
        smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
  // Store to global memory.
  for (int i = threadIdx.x; i < topK; i += kNumThreadsPerBlock) {
    if constexpr (multipleBlocksPerRow) {
      outIndices[i] = smemOutput[i];
      outLogits[i] = reinterpret_cast<float*>(smemOutput + topK)[i];
    } else {
      if (stride1 == 1) {
        // stride1 == 1 will use vectorized_process, which indexes already skip
        // the rowStart.
        outIndices[i] = smemOutput[i];
      } else {
        outIndices[i] = smemOutput[i] - rowStart;
      }
    }
  }
 }

 template <int kNumThreadsPerBlock = 512>
 static __global__ void topKPerRow(const float* logits, const int* rowStarts,
                                  const int* rowEnds, int* outIndices,
                                  int stride0, int stride1) {
 template <int kNumThreadsPerBlock, bool useRadixSort>
 static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill(
    const float* logits, const int* rowStarts, const int* rowEnds,
    int* outIndices, int stride0, int stride1, const int topK,
    const int offsetIndex) {
  // The number of bins in the histogram.
  static constexpr int kNumBins = 512;

  // The top-k width.
  static constexpr int kTopK = 2048;
  static constexpr int kNumBins = 2048;

  // The row computed by this block.
  int rowIdx = blockIdx.x;
  int rowIdx = blockIdx.x + offsetIndex;

  // The range of logits within the row.
  int rowStart = rowStarts[rowIdx];
  int rowEnd = rowEnds[rowIdx];

  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
  // Local pointers to this block
  outIndices += rowIdx * topK;
  logits += rowIdx * stride0;

  topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort>(
      nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK);
 }

 template <int kNumThreadsPerBlock = 512>
 static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
                                        int* outIndices, int stride0,
                                        int stride1, int next_n) {
 template <int kNumThreadsPerBlock, bool useRadixSort,
          bool multipleBlocksPerRow = false, bool mergeBlocks = false>
 static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
    const float* logits, const int* seqLens, int* outIndices, int stride0,
    int stride1, const int topK, int next_n, float* outLogits = nullptr,
    const int numBlocksToMerge = 0, const int* indices = nullptr) {
  // The number of bins in the histogram.
  static constexpr int kNumBins = 512;

  // The top-k width.
  static constexpr int kTopK = 2048;
  static constexpr int kNumBins = 2048;

  // The row computed by this block.
  int rowIdx = blockIdx.x;
@@ -290,8 +574,25 @@ static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
  int seq_len = seqLens[rowIdx / next_n];
  int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;

  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
  // Local pointers to this block
  if constexpr (!multipleBlocksPerRow && !mergeBlocks) {
    outIndices += rowIdx * topK;
  } else if constexpr (multipleBlocksPerRow) {
    const auto blockSize = rowEnd / gridDim.y;  // 16384 / 2 = 8192
    rowStart = blockSize * blockIdx.y;          // 8192 * 1 = 8192
    rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize;
    outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK;
    outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK;
  } else if constexpr (mergeBlocks) {
    rowEnd = numBlocksToMerge * topK;
    indices += rowIdx * numBlocksToMerge * topK;
    outIndices += rowIdx * topK;
  }
  logits += rowIdx * stride0;

  topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort,
                multipleBlocksPerRow, mergeBlocks>(
      indices, logits, rowStart, rowEnd, outIndices, outLogits, stride1, topK);
 }

 }  // namespace vllm
@@ -339,28 +640,84 @@ void apply_repetition_penalties_(

 void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
                          const torch::Tensor& seqLens, torch::Tensor& indices,
                          int64_t numRows, int64_t stride0, int64_t stride1) {
  // Compute the results on the device.
                          int64_t numRows, int64_t stride0, int64_t stride1,
                          int64_t topK) {
  constexpr int kSortingAlgorithmThreshold = 12288;
  constexpr int kSplitWorkThreshold = 200 * 1000;
  constexpr int kNumThreadsPerBlock = 512;
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  vllm::topKPerRowDecode<kNumThreadsPerBlock>
      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
          logits.data_ptr<float>(), seqLens.data_ptr<int>(),
          indices.data_ptr<int>(), static_cast<int>(stride0),
          static_cast<int>(stride1), static_cast<int>(next_n));
  const auto numColumns = logits.size(1);

  if (numColumns < kSortingAlgorithmThreshold) {
    // Use insertion sort
    vllm::topKPerRowDecode<kNumThreadsPerBlock, false>
        <<<numRows, kNumThreadsPerBlock, topK * sizeof(int32_t), stream>>>(
            logits.data_ptr<float>(), seqLens.data_ptr<int>(),
            indices.data_ptr<int>(), static_cast<int>(stride0),
            static_cast<int>(stride1), static_cast<int>(topK),
            static_cast<int>(next_n));
  } else if (numColumns < kSplitWorkThreshold) {
    // From this threshold, use radix sort instead
    vllm::topKPerRowDecode<kNumThreadsPerBlock, true>
        <<<numRows, kNumThreadsPerBlock, topK * sizeof(int32_t), stream>>>(
            logits.data_ptr<float>(), seqLens.data_ptr<int>(),
            indices.data_ptr<int>(), static_cast<int>(stride0),
            static_cast<int>(stride1), static_cast<int>(topK),
            static_cast<int>(next_n));
  } else {
    // Long sequences are run in two steps
    constexpr auto multipleBlocksPerRowConfig = 10;

    const auto outIndicesAux =
        torch::empty({numRows, multipleBlocksPerRowConfig, topK},
                     torch::dtype(torch::kInt32).device(logits.device()));
    const auto outLogitsAux =
        torch::empty({numRows, multipleBlocksPerRowConfig, topK},
                     torch::dtype(torch::kFloat).device(logits.device()));

    vllm::topKPerRowDecode<kNumThreadsPerBlock, true, true>
        <<<dim3(numRows, multipleBlocksPerRowConfig), kNumThreadsPerBlock,
           2 * topK * sizeof(int32_t), stream>>>(
            logits.data_ptr<float>(), seqLens.data_ptr<int>(),
            outIndicesAux.data_ptr<int>(), static_cast<int>(stride0),
            static_cast<int>(stride1), static_cast<int>(topK),
            static_cast<int>(next_n), outLogitsAux.data_ptr<float>());

    constexpr int kNumThreadsPerBlockMerge = 1024;
    vllm::topKPerRowDecode<kNumThreadsPerBlockMerge, true, false, true>
        <<<numRows, kNumThreadsPerBlockMerge, topK * sizeof(int32_t), stream>>>(
            outLogitsAux.data_ptr<float>(), seqLens.data_ptr<int>(),
            indices.data_ptr<int>(), multipleBlocksPerRowConfig * topK, 1,
            static_cast<int>(topK), static_cast<int>(next_n), nullptr,
            multipleBlocksPerRowConfig, outIndicesAux.data_ptr<int>());
  }
 }

 void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                   const torch::Tensor& rowEnds, torch::Tensor& indices,
                   int64_t numRows, int64_t stride0, int64_t stride1) {
  // Compute the results on the device.
 void top_k_per_row_prefill(const torch::Tensor& logits,
                           const torch::Tensor& rowStarts,
                           const torch::Tensor& rowEnds, torch::Tensor& indices,
                           int64_t numRows, int64_t stride0, int64_t stride1,
                           int64_t topK) {
  constexpr int kSortingAlgorithmThreshold = 12288;
  constexpr int kNumThreadsPerBlock = 512;
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  vllm::topKPerRow<kNumThreadsPerBlock>
      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
          logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
          rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
          static_cast<int>(stride0), static_cast<int>(stride1));
  int numInsertionBlocks =
      std::min(static_cast<int>(numRows), kSortingAlgorithmThreshold);
  vllm::topKPerRowPrefill<kNumThreadsPerBlock, false>
      <<<numInsertionBlocks, kNumThreadsPerBlock, topK * sizeof(int32_t),
         stream>>>(logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
                   rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
                   static_cast<int>(stride0), static_cast<int>(stride1),
                   static_cast<int>(topK), 0);

  if (numRows > kSortingAlgorithmThreshold) {
    int numRadixBlocks = numRows - kSortingAlgorithmThreshold;
    vllm::topKPerRowPrefill<kNumThreadsPerBlock, true>
        <<<numRadixBlocks, kNumThreadsPerBlock, topK * sizeof(int32_t),
           stream>>>(logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
                     rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
                     static_cast<int>(stride0), static_cast<int>(stride1),
                     static_cast<int>(topK), kSortingAlgorithmThreshold);
  }
 }
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -179,15 +179,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  // Optimized top-k per row operation
  ops.def(
      "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
      "top_k_per_row_prefill(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
      "Tensor! indices, int numRows, int stride0, "
      "int stride1) -> ()");
  ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
      "int stride1, int topK) -> ()");
  ops.impl("top_k_per_row_prefill", torch::kCUDA, &top_k_per_row_prefill);

  ops.def(
      "top_k_per_row_decode(Tensor logits, int next_n, "
      "Tensor seq_lens, Tensor! indices, int numRows, "
      "int stride0, int stride1) -> ()");
      "Tensor seq_lens, Tensor! indices, "
      "int numRows, int stride0, int stride1, int topK) -> ()");
  ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);

  // Layernorm-quant
@@ -215,6 +215,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
           &rms_norm_dynamic_per_token_quant);

  // Fused Layernorm + Block quant kernels
  ops.def(
      "rms_norm_per_block_quant(Tensor! result, Tensor input, "
      "Tensor weight, Tensor! scale, float epsilon, "
      "Tensor? scale_ub, Tensor!? residual, int group_size, "
      "bool is_scale_transposed) -> ()");
  ops.impl("rms_norm_per_block_quant", torch::kCUDA, &rms_norm_per_block_quant);

  // Rotary embedding
  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
  ops.def(
@@ -342,6 +350,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("cutlass_encode_and_reorder_int4b(Tensor B) -> Tensor");
  // conditionally compiled so impl registration is in source file

  // CUTLASS w4a8 grouped GEMM
  ops.def(
      "cutlass_w4a8_moe_mm("
      "   Tensor! out_tensors,"
      "   Tensor a_tensors,"
      "   Tensor b_tensors,"
      "   Tensor a_scales,"
      "   Tensor b_scales,"
      "   Tensor b_group_scales,"
      "   int b_group_size,"
      "   Tensor expert_offsets,"
      "   Tensor problem_sizes,"
      "   Tensor a_strides,"
      "   Tensor b_strides,"
      "   Tensor c_strides,"
      "   Tensor group_scale_strides,"
      "   str? maybe_schedule"
      ") -> ()");
  ops.def(
      "cutlass_encode_and_reorder_int4b_grouped(Tensor b_tensors) -> (Tensor, "
      "Tensor)");
  // conditionally compiled so impl registration is in source file

 #endif

  // Dequantization for GGML.
@@ -458,7 +489,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                                 Tensor! problem_sizes1, "
      "                                 Tensor! problem_sizes2, "
      "                                 int num_experts, int n, int k, "
      "                                 Tensor? blockscale_offsets) -> ()");
      "                                 Tensor? blockscale_offsets, "
      "                                 bool? force_swap_ab) -> ()");
  ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
           &get_cutlass_moe_mm_problem_sizes);

@@ -617,6 +649,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("per_token_group_fp8_quant", torch::kCUDA,
           &per_token_group_quant_fp8);

  // Compute per-token-group 8-bit quantized tensor and UE8M0-packed,
  // TMA-aligned scales for DeepGEMM.
  ops.def(
      "per_token_group_fp8_quant_packed(Tensor input, Tensor! output_q, "
      "Tensor! output_s_packed, int group_size, float eps, float fp8_min, "
      "float fp8_max) -> ()");
  ops.impl("per_token_group_fp8_quant_packed", torch::kCUDA,
           &per_token_group_quant_8bit_packed);

  // Compute per-token-group INT8 quantized tensor and scaling factor.
  ops.def(
      "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -59,6 +59,7 @@ nav:
  - CLI Reference: cli
  - Community:
    - community/*
    - Governance: governance
    - Blog: https://blog.vllm.ai
    - Forum: https://discuss.vllm.ai
    - Slack: https://slack.vllm.ai
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes.
 - [vllm.config.MultiModalConfig][]
 - [vllm.config.PoolerConfig][]
 - [vllm.config.StructuredOutputsConfig][]
 - [vllm.config.ProfilerConfig][]
 - [vllm.config.ObservabilityConfig][]
 - [vllm.config.KVTransferConfig][]
 - [vllm.config.CompilationConfig][]
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@@ -84,7 +84,7 @@ Total input tokens:                      1369
 Total generated tokens:                  2212
 Request throughput (req/s):              1.73
 Output token throughput (tok/s):         382.89
 Total Token throughput (tok/s):          619.85
 Total token throughput (tok/s):          619.85
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          71.54
 Median TTFT (ms):                        73.88
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -5,16 +5,15 @@

 ## Profile with PyTorch Profiler

 We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables:
 We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config`
 when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:

 - `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default
 - `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
 - `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
 - `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
 - `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
 - `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default

 The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
 - `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
 - `torch_profiler_with_memory` to record memory, off by default
 - `torch_profiler_with_stack` to enable recording stack information, on by default
 - `torch_profiler_with_flops` to enable recording FLOPs, off by default
 - `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default
 - `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default

 When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.

@@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline
 #### OpenAI Server

 ```bash
 VLLM_TORCH_PROFILER_DIR=./vllm_profile \
    vllm serve meta-llama/Llama-3.1-8B-Instruct
 vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
 ```

 vllm bench command:
@@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with `

 ```bash
 # server
 VLLM_TORCH_CUDA_PROFILE=1 \
 nsys profile \
    --trace-fork-before-exec=true \
    --cuda-graph-trace=node \
    --capture-range=cudaProfilerApi \
    --capture-range-end repeat \
    vllm serve meta-llama/Llama-3.1-8B-Instruct
    vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda

 # client
 vllm bench serve \
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request

 ### v1 Metrics

 In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
 In v1, an extensive set of metrics is exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example:

 - `vllm:num_requests_running` (Gauge) - Number of requests currently running.
 - `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
 - `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
 - `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
 - `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
 - `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
 - `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
 - `vllm:num_preemptions_total` (Counter) - Number of preemptions.
 - `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
 - `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
 - `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
 - `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
 - `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
 - `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
 - `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
 - `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
 - `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests.
 - `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
 - `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
 - `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
 - `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
 - `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
 - `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
 - `vllm:request_decode_time_seconds` (Histogram) - Request decode time.

--- a/docs/features/README.md
+++ b/docs/features/README.md
@@ -68,8 +68,8 @@ th:not(:first-child) {
 | CUDA graph                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     | [❌](https://github.com/vllm-project/vllm/issues/26970)        |
 | [pooling](../models/pooling_models.md)                    | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❌     | ✅        |
 | [mm](multimodal_inputs.md)                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | [🟠](https://github.com/vllm-project/vllm/issues/26965)       |
 | [prompt-embeds](prompt_embeds.md)                         | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❔     | ✅       |
 | [mm](multimodal_inputs.md)                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | [prompt-embeds](prompt_embeds.md)                         | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❔     | ✅        |
 | <abbr title="Logprobs">logP</abbr>                        | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Prompt Logprobs">prmpt logP</abbr>           | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ✅        |
 | <abbr title="Async Output Processing">async output</abbr> | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ❌     | ✅        |
--- a/docs/features/disagg_encoder.md
+++ b/docs/features/disagg_encoder.md
@@ -32,14 +32,14 @@ Design doc: <https://docs.google.com/document/d/1aed8KtC6XkXtdoV87pWT0a8OJlZ-Cpn

 ## 2  Usage Example

 The current reference pathway is **SharedStorageConnector**.  
 The current reference pathway is **ExampleConnector**.  
 The ready-to-run scripts below show the workflow:

 1 Encoder instance + 1 PD instance:
 `examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_encoder_example.sh`
 `examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh`

 1 Encoder instance + 1 Prefill instance + 1 Decode instance:
 `examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_epd_example.sh`
 `examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh`

 ---

--- a/docs/features/disagg_prefill.md
+++ b/docs/features/disagg_prefill.md
@@ -21,14 +21,14 @@ Please refer to [examples/online_serving/disaggregated_prefill.sh](../../example

 Now supports 5 types of connectors:

 - **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
 - **ExampleConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of ExampleConnector disaggregated prefilling.
 - **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
 - **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
 - **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
 - **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs, such as:

  ```bash
  --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
  --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"ExampleConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
  ```

 For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -445,7 +445,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd

 For Qwen3-VL, the `image_embeds` should contain both the base image embedding and deepstack features.

 #### Audio Embeddings
 #### Audio Embedding Inputs

 You can pass pre-computed audio embeddings similar to image embeddings:

@@ -892,5 +892,11 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
    ```

 !!! note
    Only one message can contain `{"type": "image_embeds"}`.
    Multiple messages can now contain `{"type": "image_embeds"}`, enabling you to pass multiple image embeddings in a single request (similar to regular images). The number of embeddings is limited by `--limit-mm-per-prompt`.

    **Important**: The embedding shape format differs based on the number of embeddings:

    - **Single embedding**: 3D tensor of shape `(1, feature_size, hidden_size)`
    - **Multiple embeddings**: List of 2D tensors, each of shape `(feature_size, hidden_size)`

    If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -22,7 +22,7 @@ python tools/install_nixl_from_source_ubuntu.py
 NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:

 ```bash
 # Example UCX configuration, adjust according to your enviroment
 # Example UCX configuration, adjust according to your environment
 export UCX_TLS=all  # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
 export UCX_NET_DEVICES=all  # or specify network devices like "mlx5_0:1,mlx5_1:1"
 ```
@@ -146,6 +146,8 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
  --decoder-ports 8000 8000
 ```

 For multi-host DP deployment, you only need to provide the host/port of the head instances.

 ### KV Role Options

 - **kv_producer**: For prefiller instances that generate KV caches
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@@ -14,7 +14,7 @@ Contents:
 - [INT4 W4A16](int4.md)
 - [INT8 W8A8](int8.md)
 - [FP8 W8A8](fp8.md)
 - [NVIDIA TensorRT Model Optimizer](modelopt.md)
 - [NVIDIA Model Optimizer](modelopt.md)
 - [AMD Quark](quark.md)
 - [Quantized KV Cache](quantized_kvcache.md)
 - [TorchAO](torchao.md)
--- a/docs/features/quantization/modelopt.md
+++ b/docs/features/quantization/modelopt.md
@@ -1,6 +1,6 @@
 # NVIDIA TensorRT Model Optimizer
 # NVIDIA Model Optimizer

 The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a library designed to optimize models for inference with NVIDIA GPUs. It includes tools for Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) of Large Language Models (LLMs), Vision Language Models (VLMs), and diffusion models.
 The [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) is a library designed to optimize models for inference with NVIDIA GPUs. It includes tools for Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) of Large Language Models (LLMs), Vision Language Models (VLMs), and diffusion models.

 We recommend installing the library with:

@@ -10,7 +10,7 @@ pip install nvidia-modelopt

 ## Quantizing HuggingFace Models with PTQ

 You can quantize HuggingFace models using the example scripts provided in the TensorRT Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.
 You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.

 Below is an example showing how to quantize a model using modelopt's PTQ API:

--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner

        def is_reasoning_end(self, input_ids: list[int]) -> bool:
            return self.end_token_id in input_ids

        def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool:
            return self.end_token_id in delta_ids
        ...
    ```

--- a/docs/governance/collaboration.md
+++ b/docs/governance/collaboration.md
@@ -0,0 +1,43 @@
 # Collaboration Policy

 This page outlines how vLLM collaborates with model providers, hardware vendors, and other stakeholders.

 ## Adding New Major Features

 Anyone can contribute to vLLM. For major features, submit an RFC (request for comments) first. To submit an RFC, create an [issue](https://github.com/vllm-project/vllm/issues/new/choose) and select the `RFC` template.
 RFCs are similar to design docs that discuss the motivation, problem solved, alternatives considered, and proposed change.

 Once you submit the RFC, please post it in the #contributors channel in vLLM Slack, and loop in area owners and committers for feedback.
 For high-interest features, the committers nominate a person to help with the RFC process and PR review. This makes sure someone is guiding you through the process. It is reflected as the "assignee" field in the RFC issue.
 If the assignee and lead maintainers find the feature to be contentious, the maintainer team aims to make decisions quickly after learning the details from everyone. This involves assigning a committer as the DRI (Directly Responsible Individual) to make the decision and shepherd the code contribution process.

 For features that you intend to maintain, please feel free to add yourself in [`mergify.yml`](https://github.com/vllm-project/vllm/blob/main/.github/mergify.yml) to receive notifications and auto-assignment when PRs touch the feature you are maintaining. Over time, the ownership will be evaluated and updated through the committers nomination and voting process.

 ## Adding New Models

 If you use vLLM, we recommend making the model work with vLLM by following the [model registration](../contributing/model/registration.md) process before you release it publicly.

 The vLLM team helps with new model architectures not supported by vLLM, especially models pushing architectural frontiers.
 Here's how the vLLM team works with model providers. The vLLM team includes all [committers](./committers.md) of the project. Model providers can exclude certain members but shouldn't, as this may harm release timelines due to missing expertise. Contact [project leads](./process.md) if you want to collaborate.

 Once we establish the connection between the vLLM team and model provider:

 - The vLLM team learns the model architecture and relevant changes, then plans which area owners to involve and what features to include.
 - The vLLM team creates a private communication channel (currently a Slack channel in the vLLM workspace) and a private fork within the vllm-project organization. The model provider team can invite others to the channel and repo.
 - Third parties like compute providers, hosted inference providers, hardware vendors, and other organizations often work with both the model provider and vLLM on model releases. We establish direct communication (with permission) or three-way communication as needed.

 The vLLM team works with model providers on features, integrations, and release timelines. We work to meet release timelines, but engineering challenges like feature development, model accuracy alignment, and optimizations can cause delays.

 The vLLM maintainers will not publicly share details about model architecture, release timelines, or upcoming releases. We maintain model weights on secure servers with security measures (though we can work with security reviews and testing without certification). We delete pre-release weights or artifacts upon request.

 The vLLM team collaborates on marketing and promotional efforts for model releases. Model providers can use vLLM's trademark and logo in publications and materials.

 ## Adding New Hardware

 vLLM is designed as a platform for frontier model architectures and high-performance accelerators.
 For new hardware, follow the [hardware plugin](../design/plugin_system.md) system to add support.
 Use the platform plugin system to add hardware support.
 As hardware gains popularity, we help endorse it in our documentation and marketing materials.
 The vLLM GitHub organization can host hardware plugin repositories, especially for collaborative efforts among companies.

 We rarely add new hardware to vLLM directly. Instead, we make existing hardware platforms modular to keep the vLLM core hardware-agnostic.
--- a/docs/governance/committers.md
+++ b/docs/governance/committers.md
@@ -0,0 +1,183 @@
 # Committers

 This document lists the current committers of the vLLM project and the core areas they maintain.
 Committers have write access to the vLLM repository and are responsible for reviewing and merging PRs.
 You can also refer to the [CODEOWNERS](https://github.com/vllm-project/vllm/blob/main/.github/CODEOWNERS) file for concrete file-level ownership and reviewers. Both this document and the CODEOWNERS file are living documents and they complement each other.

 ## Active Committers

 We try to summarize each committer's role in vLLM in a few words. In general, vLLM committers cover a wide range of areas and help each other in the maintenance process.
 Please refer to the later section about Area Owners for exact component ownership details.
 Sorted alphabetically by GitHub handle:

 - [@22quinn](https://github.com/22quinn): RL API
 - [@aarnphm](https://github.com/aarnphm): Structured output
 - [@alexm-redhat](https://github.com/alexm-redhat): Performance
 - [@ApostaC](https://github.com/ApostaC): Connectors, offloading
 - [@benchislett](https://github.com/benchislett): Engine core and spec decode
 - [@bigPYJ1151](https://github.com/bigPYJ1151): Intel CPU/XPU integration
 - [@chaunceyjiang](https://github.com/chaunceyjiang): Tool use and reasoning parser
 - [@DarkLight1337](https://github.com/DarkLight1337): Multimodality, API server
 - [@esmeetu](https://github.com/esmeetu): developer marketing, community
 - [@gshtras](https://github.com/gshtras): AMD integration
 - [@heheda12345](https://github.com/heheda12345): Hybrid memory allocator
 - [@hmellor](https://github.com/hmellor): Hugging Face integration, documentation
 - [@houseroad](https://github.com/houseroad): Engine core and Llama models
 - [@Isotr0py](https://github.com/Isotr0py): Multimodality, new model support
 - [@jeejeelee](https://github.com/jeejeelee): LoRA, new model support
 - [@jikunshang](https://github.com/jikunshang): Intel CPU/XPU integration
 - [@khluu](https://github.com/khluu): CI infrastructure
 - [@KuntaiDu](https://github.com/KuntaiDu): KV Connector
 - [@LucasWilkinson](https://github.com/LucasWilkinson): Kernels and performance
 - [@luccafong](https://github.com/luccafong): Llama models, speculative decoding, distributed
 - [@markmc](https://github.com/markmc): Observability
 - [@mgoin](https://github.com/mgoin): Quantization and performance
 - [@NickLucche](https://github.com/NickLucche): KV connector
 - [@njhill](https://github.com/njhill): Distributed, API server, engine core
 - [@noooop](https://github.com/noooop): Pooling models
 - [@patrickvonplaten](https://github.com/patrickvonplaten): Mistral models, new model support
 - [@pavanimajety](https://github.com/pavanimajety): NVIDIA GPU integration
 - [@ProExpertProg](https://github.com/ProExpertProg): Compilation, startup UX
 - [@robertgshaw2-redhat](https://github.com/robertgshaw2-redhat): Core, distributed, disagg
 - [@ruisearch42](https://github.com/ruisearch42): Pipeline parallelism, Ray Support
 - [@russellb](https://github.com/russellb): Structured output, engine core, security
 - [@sighingnow](https://github.com/sighingnow): Qwen models, new model support
 - [@simon-mo](https://github.com/simon-mo): Project lead, API entrypoints, community
 - [@tdoublep](https://github.com/tdoublep): State space models
 - [@tjtanaa](https://github.com/tjtanaa): AMD GPU integration
 - [@tlrmchlsmth](https://github.com/tlrmchlsmth): Kernels and performance, distributed, disagg
 - [@WoosukKwon](https://github.com/WoosukKwon): Project lead, engine core
 - [@yaochengji](https://github.com/yaochengji): TPU integration
 - [@yeqcharlotte](https://github.com/yeqcharlotte): Benchmark, Llama models
 - [@yewentao256](https://github.com/yewentao256): Kernels and performance
 - [@Yikun](https://github.com/Yikun): Pluggable hardware interface
 - [@youkaichao](https://github.com/youkaichao): Project lead, distributed, compile, community
 - [@ywang96](https://github.com/ywang96): Multimodality, benchmarks
 - [@zhuohan123](https://github.com/zhuohan123): Project lead, RL integration, numerics
 - [@zou3519](https://github.com/zou3519): Compilation

 ### Emeritus Committers

 Committers who have contributed to vLLM significantly in the past (thank you!) but are no longer active:

 - [@andoorve](https://github.com/andoorve): Pipeline parallelism
 - [@cadedaniel](https://github.com/cadedaniel): Speculative decoding
 - [@comaniac](https://github.com/comaniac): KV cache management, pipeline parallelism
 - [@LiuXiaoxuanPKU](https://github.com/LiuXiaoxuanPKU): Speculative decoding
 - [@pcmoritz](https://github.com/pcmoritz): MoE
 - [@rkooo567](https://github.com/rkooo567): Chunked prefill
 - [@sroy745](https://github.com/sroy745): Speculative decoding
 - [@Yard1](https://github.com/Yard1): kernels and performance
 - [@zhisbug](https://github.com/zhisbug): Arctic models, distributed

 ## Area Owners

 This section breaks down the active committers by vLLM components and lists the area owners.
 If you have PRs touching the area, please feel free to ping the area owner for review.

 ### Engine Core

 - Scheduler: the core vLLM engine loop scheduling requests to next batch
    - @WoosukKwon, @robertgshaw2-redhat, @njhill, @heheda12345
 - KV Cache Manager: memory management layer within scheduler maintaining KV cache logical block data
    - @heheda12345, @WoosukKwon
 - AsyncLLM: the zmq based protocol hosting engine core and making it accessible for entrypoints
    - @robertgshaw2-redhat, @njhill, @russellb
 - ModelRunner, Executor, Worker: the abstractions for engine wrapping model implementation
    - @WoosukKwon, @tlrmchlsmth, @heheda12345, @LucasWilkinson, @ProExpertProg
 - KV Connector: Connector interface and implementation for KV cache offload and transfer
    - @robertgshaw2-redhat, @njhill, @KuntaiDu, @NickLucche, @ApostaC
 - Distributed, Parallelism, Process Management: Process launchers managing each worker, and assign them to the right DP/TP/PP/EP ranks
    - @youkaichao, @njhill, @WoosukKwon, @ruisearch42
 - Collectives: the usage of nccl and other communication libraries/kernels
    - @tlrmchlsmth, @youkaichao
 - Multimodality engine and memory management: core scheduling and memory management concerning vision, audio, and video inputs.
    - @ywang96, @DarkLight1337

 ### Model Implementations

 - Model Interface: The `nn.Module` interface and implementation for various models
    - @zhuohan123, @mgoin, @simon-mo, @houseroad, @ywang96 (multimodality), @jeejeelee (lora)
 - Logits Processors / Sampler: The provided sampler class and pluggable logits processors
    - @njhill, @houseroad, @22quinn
 - Custom Layers: Utility layers in vLLM such as rotary embedding and rms norms
    - @ProExpertProg
 - Attention: Attention interface for paged attention
    - @WoosukKwon, @LucasWilkinson, @heheda12345
 - FusedMoE: FusedMoE kernel, Modular kernel framework, EPLB
    - @tlrmchlsmth
 - Quantization: Various quantization config, weight loading, and kernel.
    - @mgoin, @Isotr0py, @yewentao256
 - Custom quantized GEMM kernels (cutlass_scaled_mm, marlin, machete)
    - @tlrmchlsmth, @LucasWilkinson
 - Multi-modal Input Processing: Components that load and process image/video/audio data into feature tensors
    - @DarkLight1337, @ywang96, @Isotr0py
 - torch compile: The torch.compile integration in vLLM, custom passes & transformations
    - @ProExpertProg, @zou3519, @youkaichao
 - State space models: The state space models implementation in vLLM
    - @tdoublep, @tlrmchlsmth
 - Reasoning and tool calling parsers
    - @chaunceyjiang, @aarnphm

 ### Entrypoints

 - LLM Class: The LLM class for offline inference
    - @DarkLight1337
 - API Server: The OpenAI-compatible API server
    - @DarkLight1337, @njhill, @aarnphm, @simon-mo, @heheda12345 (Responses API)
 - Batch Runner: The OpenAI-compatible batch runner
    - @simon-mo

 ### Features

 - Spec Decode: Covers model definition, attention, sampler, and scheduler related to n-grams, EAGLE, and MTP.
    - @WoosukKwon, @benchislett, @luccafong
 - Structured Output: The structured output implementation
    - @russellb, @aarnphm
 - RL: The RL related features such as collective rpc, sleep mode, etc.
    - @youkaichao, @zhuohan123, @22quinn
 - LoRA: @jeejeelee
 - Observability: Metrics and Logging
    - @markmc, @robertgshaw2-redhat, @simon-mo

 ### Code Base

 - Config: Configuration registration and parsing
    - @hmellor
 - Documentation: @hmellor, @DarkLight1337, @simon-mo
 - Benchmarks: @ywang96, @simon-mo
 - CI, Build, Release Process: @khluu, @njhill, @simon-mo
 - Security: @russellb

 ### External Kernels Integration

 - FlashAttention: @LucasWilkinson
 - FlashInfer: @LucasWilkinson, @mgoin, @WoosukKwon
 - Blackwell Kernels: @mgoin, @yewentao256
 - DeepEP/DeepGEMM/pplx: @mgoin, @yewentao256

 ### Integrations

 - Hugging Face: @hmellor, @Isotr0py
 - Ray: @ruisearch42
 - NIXL: @robertgshaw2-redhat, @NickLucche

 ### Collaboration with Model Vendors

 - gpt-oss: @heheda12345, @simon-mo, @zhuohan123
 - Llama: @luccafong
 - Qwen: @sighingnow
 - Mistral: @patrickvonplaten

 ### Hardware

 - Plugin Interface: @youkaichao, @Yikun
 - NVIDIA GPU: @pavanimajety
 - AMD GPU: @gshtras, @tjtanaa
 - Intel CPU/GPU: @jikunshang, @bigPYJ1151
 - Google TPU: @yaochengji

 ### Ecosystem Projects

 - Ascend NPU: [@wangxiyuan](https://github.com/wangxiyuan) and [see more details](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html#maintainers)
 - Intel Gaudi HPU: [@xuechendi](https://github.com/xuechendi) and [@kzawora-intel](https://github.com/kzawora-intel)
--- a/docs/governance/process.md
+++ b/docs/governance/process.md
@@ -0,0 +1,125 @@
 # Governance Process

 vLLM's success comes from our strong open source community. We favor informal, meritocratic norms over formal policies. This document clarifies our governance philosophy and practices.

 ## Values

 vLLM aims to be the fastest and easiest-to-use LLM inference and serving engine. We stay current with advances, enable innovation, and support diverse models, modalities, and hardware.

 ### Design Values

 1. **Top performance**: System performance is our top priority. We monitor overheads, optimize kernels, and publish benchmarks. We never leave performance on the table.
 2. **Ease of use**: vLLM must be simple to install, configure, and operate. We provide clear documentation, fast startup, clean logs, helpful error messages, and monitoring guides. Many users fork our code or study it deeply, so we keep it readable and modular.
 3. **Wide coverage**: vLLM supports frontier models and high-performance accelerators. We make it easy to add new models and hardware. vLLM + PyTorch form a simple interface that avoids complexity.
 4. **Production ready**: vLLM runs 24/7 in production. It must be easy to operate and monitor for health issues.
 5. **Extensibility**: vLLM serves as fundamental LLM infrastructure. Our codebase cannot cover every use case, so we design for easy forking and customization.

 ### Collaboration Values

 1. **Tightly Knit and Fast-Moving**: Our maintainer team is aligned on vision, philosophy, and roadmap. We work closely to unblock each other and move quickly.
 2. **Individual Merit**: No one buys their way into governance. Committer status belongs to individuals, not companies. We reward contribution, maintenance, and project stewardship.

 ## Project Maintainers

 Maintainers form a hierarchy based on sustained, high-quality contributions and alignment with our design philosophy.

 ### Core Maintainers

 Core Maintainers function like a project planning and decision making committee. In other conventions, they might be called a Technical Steering Committee (TSC). In vLLM vocabulary, they are often known as "Project Leads". They meet weekly to coordinate roadmap priorities and allocate engineering resources. Current active leads: @WoosukKwon, @zhuohan123, @simon-mo, @youkaichao, @robertgshaw2-redhat, @tlrmchlsmth, @mgoin, @njhill, @ywang96, @houseroad, @yeqcharlotte, @ApostaC

 The responsibilities of the core maintainers are:

 * Authoring the quarterly roadmap and taking responsibility for each development effort.
 * Making major changes to the technical direction or scope of vLLM and vLLM projects.
 * Defining the project's release strategy.
 * Work with model providers, hardware vendors, and key users of vLLM to ensure the project is on the right track.

 ### Lead Maintainers

 While Core maintainers assume the day-to-day responsibilities of the project, Lead maintainers are responsible for the overall direction and strategy of the project. A committee of @WoosukKwon, @zhuohan123, @simon-mo, and @youkaichao currently shares this role with divided responsibilities.

 The responsibilities of the lead maintainers are:

 * Making decisions where consensus among core maintainers cannot be reached.
 * Adopting changes to the project's technical governance.
 * Organizing the voting process for new committers.

 ### Committers and Area Owners

 Committers have write access and merge rights. They typically have deep expertise in specific areas and help the community.

 The responsibilities of the committers are:

 * Reviewing PRs and providing feedback.
 * Addressing issues and questions from the community.
 * Own specific areas of the codebase and development efforts: reviewing PRs, addressing issues, answering questions, improving documentation.

 Specifically, almost all committers are area owners. They author subsystems, review PRs, refactor code, monitor tests, and ensure compatibility with other areas. All area owners are committers with deep expertise in that area, but not all committers own areas.

 For a full list of committers and their respective areas, see the [committers](./committers.md) page.

 #### Nomination Process

 Any committer can nominate candidates via our private mailing list:

 1. **Nominate**: Any committer may nominate a candidate by email to the private maintainers’ list, citing evidence mapped to the pre‑existing standards with links to PRs, reviews, RFCs, issues, benchmarks, and adoption evidence.
 2. **Vote**: The lead maintainers will collect voices of support or concern. Shared concerns can stop the process. The vote typically lasts 3 working days. If there are concerns, the committers group discusses clear criteria for the person to be nominated again. The lead maintainers will make the final decision.
 3. **Confirm**: The lead maintainers send invitation, update CODEOWNERS, assign permissions, add to communications channels (mailing list and Slack).

 Committership is highly selective and merit based. The selection criteria require:

 * **Area expertise**: leading design/implementation of core subsystems, material performance or reliability improvements adopted project‑wide, or accepted RFCs that shape technical direction.
 * **Sustained contributions**: high‑quality merged contributions and reviews across releases, responsiveness to feedback, and stewardship of code health.
 * **Community leadership**: mentoring contributors, triaging issues, improving docs, and elevating project standards.

 To further illustrate, a committer typically satisfies at least two of the following accomplishment patterns:

 * Author of an accepted RFC or design that materially shaped project direction
 * Measurable, widely adopted performance or reliability improvement in core paths
 * Long‑term ownership of a subsystem with demonstrable quality and stability gains
 * Significant cross‑project compatibility or ecosystem enablement work (models, hardware, tooling)

 While there isn't a quantitative bar, past committers have:

 * Submitted approximately 30+ PRs of substantial quality and scope
 * Provided high-quality reviews of approximately 10+ substantial external contributor PRs
 * Addressed multiple issues and questions from the community in issues/forums/Slack
 * Led concentrated efforts on RFCs and their implementation, or significant performance or reliability improvements adopted project‑wide

 ### Working Groups

 vLLM runs informal working groups such as CI, CI infrastructure, torch compile, and startup UX. These can be loosely tracked via `#sig-` (or `#feat-`) channels in vLLM Slack. Some groups have regular sync meetings.

 ### Advisory Board

 vLLM project leads consult with an informal advisory board that is composed of model providers, hardware vendors, and ecosystem partners. This manifests as a collaboration channel in Slack and frequent communications.

 ## Process

 ### Project Roadmap

 Project Leads publish quarterly roadmaps as GitHub issues. These clarify current priorities. Unlisted topics aren't excluded but may get less review attention. See [https://roadmap.vllm.ai/](https://roadmap.vllm.ai/).

 ### Decision Making

 We make technical decisions in Slack and GitHub using RFCs and design docs. Discussion may happen elsewhere, but we maintain public records of significant changes: problem statements, rationale, and alternatives considered.

 ### Merging Code

 Contributors and maintainers often collaborate closely on code changes, especially within organizations or specific areas. Maintainers should give others appropriate review opportunities based on change significance.

 PRs require at least one committer review and approval. If the code is covered by CODEOWNERS, the PR should be reviewed by the CODEOWNERS. In cases where the change is trivial or a hotfix, the PR can be merged by the lead maintainers directly.

 In cases where CI fails for reasons unrelated to the PR, the PR can be merged by the lead maintainers using the "force merge" option that overrides the CI checks.

 ### Slack

 Contributors are encouraged to join `#pr-reviews` and `#contributors` channels.

 There are `#sig-` and `#feat-` channels for discussion and coordination around specific topics.

 The project maintainer group also uses a private channel for high-bandwidth collaboration.

 ### Meetings

 We hold weekly contributor syncs with standup-style updates on progress, blockers, and plans. Refer to the notes at [standup.vllm.ai](https://standup.vllm.ai) for joining instructions.
--- a/docs/mkdocs/hooks/generate_metrics.py
+++ b/docs/mkdocs/hooks/generate_metrics.py
@@ -0,0 +1,149 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import ast
 import logging
 from pathlib import Path
 from typing import Literal

 # All messages from this hook go to the logger named "mkdocs".
 logger = logging.getLogger("mkdocs")

 # This file lives at docs/mkdocs/hooks/, so four `.parent` steps reach the
 # repository root; the documentation tree hangs off it.
 ROOT_DIR = Path(__file__).parent.parent.parent.parent
 DOCS_DIR = ROOT_DIR.joinpath("docs")
 GENERATED_METRICS_DIR = DOCS_DIR.joinpath("generated", "metrics")

 # One entry per source file to scan. "path" is relative to ROOT_DIR and
 # "output" is the markdown file name written under GENERATED_METRICS_DIR;
 # every entry produces its own table.
 METRIC_SOURCE_FILES = [
     dict(path="vllm/v1/metrics/loggers.py", output="general.md"),
     dict(path="vllm/v1/spec_decode/metrics.py", output="spec_decode.md"),
     dict(
         path="vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
         output="nixl_connector.md",
     ),
 ]


 class MetricExtractor(ast.NodeVisitor):
    """AST visitor to extract metric definitions."""

    def __init__(self):
        self.metrics: list[dict[str, str]] = []

    def visit_Call(self, node: ast.Call) -> None:
        """Visit function calls to find metric class instantiations."""
        metric_type = self._get_metric_type(node)
        if metric_type:
            name = self._extract_kwarg(node, "name")
            documentation = self._extract_kwarg(node, "documentation")

            if name:
                self.metrics.append(
                    {
                        "name": name,
                        "type": metric_type,
                        "documentation": documentation or "",
                    }
                )

        self.generic_visit(node)

    def _get_metric_type(self, node: ast.Call) -> str | None:
        """Determine if this call creates a metric and return its type."""
        metric_type_map = {
            "_gauge_cls": "gauge",
            "_counter_cls": "counter",
            "_histogram_cls": "histogram",
        }
        if isinstance(node.func, ast.Attribute):
            return metric_type_map.get(node.func.attr)
        return None

    def _extract_kwarg(self, node: ast.Call, key: str) -> str | None:
        """Extract a keyword argument value from a function call."""
        for keyword in node.keywords:
            if keyword.arg == key:
                return self._get_string_value(keyword.value)
        return None

    def _get_string_value(self, node: ast.AST) -> str | None:
        """Extract string value from an AST node."""
        if isinstance(node, ast.Constant):
            return str(node.value) if node.value is not None else None
        return None


 def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]:
     """Read ``filepath`` as Python source and return the metrics it defines.

     The file is decoded as UTF-8, parsed with ``ast`` and walked by a
     ``MetricExtractor``; the extractor's ``metrics`` list is returned.

     Raises:
         RuntimeError: if reading, parsing or visiting fails for any reason.
             The original exception is chained as the cause.
     """
     try:
         with open(filepath, encoding="utf-8") as handle:
             text = handle.read()

         visitor = MetricExtractor()
         visitor.visit(ast.parse(text, filename=str(filepath)))
         return visitor.metrics
     except Exception as exc:
         # Wrap every failure so the offending file is named in the message.
         raise RuntimeError(f"Failed to parse {filepath}: {exc}") from exc


 def generate_markdown_table(metrics: list[dict[str, str]]) -> str:
     """Render extracted metrics as a markdown table.

     Args:
         metrics: dicts with ``name``, ``type`` and ``documentation`` keys.

     Returns:
         ``"No metrics found.\\n"`` when ``metrics`` is empty; otherwise a
         three-column table (name, capitalised type, description) terminated
         by a newline, with rows ordered by type and then by name.
     """
     if not metrics:
         return "No metrics found.\n"

     # Sort by type, then by name
     metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"]))

     lines = []
     lines.append("| Metric Name | Type | Description |")
     lines.append("|-------------|------|-------------|")

     for metric in metrics_sorted:
         name = metric["name"]
         metric_type = metric["type"].capitalize()
         # A table row must stay on one line, so newlines become spaces.
         doc = metric["documentation"].replace("\n", " ").strip()
         # A literal "|" would end the cell early and shift the remaining
         # text into extra columns; escape it so it renders as text.
         doc = doc.replace("|", "\\|")
         lines.append(f"| `{name}` | {metric_type} | {doc} |")

     return "\n".join(lines) + "\n"


 def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
     """Write one markdown metrics table per entry in METRIC_SOURCE_FILES.

     For every configured source file the metrics are extracted, rendered as
     a markdown table and written (UTF-8) under GENERATED_METRICS_DIR, which
     is created first if needed. Neither ``command`` nor ``dirty`` is used.

     Raises:
         FileNotFoundError: if a configured source file does not exist.
     """
     logger.info("Generating metrics documentation")

     # The output directory must exist before the first table is written.
     GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True)

     per_file_counts = []
     for entry in METRIC_SOURCE_FILES:
         source_path, output_file = entry["path"], entry["output"]

         filepath = ROOT_DIR / source_path
         if not filepath.exists():
             raise FileNotFoundError(f"Metrics source file not found: {filepath}")

         logger.debug("Extracting metrics from: %s", source_path)
         metrics = extract_metrics_from_file(filepath)
         found = len(metrics)
         logger.debug("Found %d metrics in %s", found, source_path)

         # Render the table before opening the output file for writing.
         table_content = generate_markdown_table(metrics)
         output_path = GENERATED_METRICS_DIR / output_file
         with open(output_path, "w", encoding="utf-8") as out:
             out.write(table_content)

         per_file_counts.append(found)
         logger.info(
             "Generated metrics table: %s (%d metrics)",
             output_path.relative_to(ROOT_DIR),
             found,
         )

     logger.info(
         "Total metrics generated: %d across %d files",
         sum(per_file_counts),
         len(METRIC_SOURCE_FILES),
     )
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -33,8 +33,8 @@ shown in the table below.
 | Architecture                                    | `--convert` | Supported pooling tasks               |
 |-------------------------------------------------|-------------|---------------------------------------|
 | `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed`     | `token_embed`, `embed`                |
 | `*ForRewardModeling`, `*RewardModel`            | `embed`     | `token_embed`, `embed`                |
 | `*For*Classification`, `*ClassificationModel`   | `classify`  | `token_classify`, `classify`, `score` |
 | `*ForRewardModeling`, `*RewardModel`            | `reward`    | `token_classify`                      |

 !!! tip
    You can explicitly set `--convert <type>` to specify how to convert the model.
@@ -70,7 +70,6 @@ the pooler assigned to each task has the following attributes by default:

 | Task       | Pooling Type | Normalization | Softmax |
 |------------|--------------|---------------|---------|
 | `reward`   | `ALL`        | ❌            | ❌     |
 | `embed`    | `LAST`       | ✅︎            | ❌      |
 | `classify` | `LAST`       | ❌            | ✅︎      |

@@ -318,3 +317,10 @@ We have split the `encode` task into two more specific token-wise tasks: `token_
 ### Remove softmax from PoolingParams

 We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.

 ### as_reward_model

 Pooling models now support all pooling tasks by default; you can use them without any settings.

 - For extracting hidden states, prefer the `token_embed` task.
 - For reward models, prefer the `token_classify` task.
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -581,16 +581,9 @@ These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward)
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
 |--------------|--------|-------------------|----------------------|---------------------------|
 | `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
 | `LlamaForCausalLM`<sup>C</sup> | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
 | `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
 | `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
 | `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ |
 | `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |

 <sup>C</sup> Automatically converted into a reward model via `--convert reward`. ([details](./pooling_models.md#model-conversion))  
 \* Feature support is the same as that of the original model.

 If your model is not in the above list, we will try to automatically convert the model using
 [as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly.

 !!! important
    For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
@@ -740,23 +733,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.

 !!! warning
    Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
    However, there are differences in how they handle text + image inputs:

    V0 correctly implements the model's attention pattern:
    - Uses bidirectional attention between the image tokens corresponding to the same image
    - Uses causal attention for other tokens
    - Implemented via (naive) PyTorch SDPA with masking tensors
    - Note: May use significant memory for long prompts with image

    V1 currently uses a simplified attention pattern:
    - Uses causal attention for all tokens, including image tokens
    - Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
    - Will be updated in the future to support the correct behavior

    This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.

 !!! note
    `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
    MobileNet-v5 vision backbone.
@@ -776,9 +752,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor
    The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
    For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>

 !!! warning
    Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.

 !!! note
    For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.

--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server:

 The following metrics are exposed:

 ??? code
 ## General Metrics

    ```python
    --8<-- "vllm/engine/metrics.py:metrics-definitions"
    ```
 --8<-- "docs/generated/metrics/general.md"

 ## Speculative Decoding Metrics

 --8<-- "docs/generated/metrics/spec_decode.md"

 ## NIXL KV Connector Metrics

 --8<-- "docs/generated/metrics/nixl_connector.md"

 ## Deprecation Policy

 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
 but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
@@ -30,7 +30,7 @@ def main():
        max_num_batched_tokens=64,
        max_num_seqs=16,
        kv_transfer_config=KVTransferConfig(
            kv_connector="SharedStorageConnector",
            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
@@ -26,7 +26,7 @@ def main():
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        kv_transfer_config=KVTransferConfig(
            kv_connector="SharedStorageConnector",
            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
--- a/examples/offline_inference/kv_load_failure_recovery/README.md
+++ b/examples/offline_inference/kv_load_failure_recovery/README.md
@@ -10,7 +10,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron
 - `decode_example.py` – performs the decode stage. Accepts:
    - `--simulate-failure`: simulates KV load failure using a custom connector.
    - `--async-load`: enables asynchronous KV loading mode.
 - `rogue_shared_storage_connector.py` – defines `RogueSharedStorageConnector`, a subclass of `SharedStorageConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
 - `load_recovery_example_connector.py` – defines `LoadRecoveryExampleConnector`, a subclass of `ExampleConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
 - `run.sh` – orchestrates the test: runs the prefill stage, then three decode stages:
    1. Normal decode (baseline).
    2. Decode with simulated sync KV load failure.
@@ -20,7 +20,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron

 ## How It Works

 - The test dynamically loads `RogueSharedStorageConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
 - The test dynamically loads `LoadRecoveryExampleConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
 - The decode stages that simulate failure are expected to trigger recovery logic in vLLM, resulting in the same output as the baseline decode.
 - If recovery fails, the script prints a unified diff of the output mismatch and exits with error.

--- a/examples/offline_inference/kv_load_failure_recovery/decode_example.py
+++ b/examples/offline_inference/kv_load_failure_recovery/decode_example.py
@@ -35,13 +35,13 @@ def main():

    if args.simulate_failure:
        ktc = KVTransferConfig(
            kv_connector="RogueSharedStorageConnector",
            kv_connector="LoadRecoveryExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={
                "shared_storage_path": "local_storage",
                "async_load": args.async_load,
            },
            kv_connector_module_path="rogue_shared_storage_connector",
            kv_connector_module_path="load_recovery_example_connector",
        )
        out_file = (
            "async_decode_recovered_output.txt"
@@ -50,7 +50,7 @@ def main():
        )
    else:
        ktc = KVTransferConfig(
            kv_connector="SharedStorageConnector",
            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={
                "shared_storage_path": "local_storage",
--- a/examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py
+++ b/examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py
@@ -10,9 +10,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
    KVConnectorMetadata,
    KVConnectorRole,
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
    SharedStorageConnector,
    SharedStorageConnectorMetadata,
 from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (
    ExampleConnector,
    ExampleConnectorMetadata,
 )
 from vllm.forward_context import ForwardContext
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
@@ -26,15 +26,15 @@ logging.basicConfig(level=logging.INFO)


@dataclass
 class RogueSharedStorageConnectorMetadata(SharedStorageConnectorMetadata):
 class LoadRecoveryExampleConnectorMetadata(ExampleConnectorMetadata):
    req_to_block_ids: dict[str, set[int]] = field(default_factory=dict)

    @classmethod
    def from_base(cls, base: SharedStorageConnectorMetadata):
    def from_base(cls, base: ExampleConnectorMetadata):
        return cls(requests=base.requests)


 class RogueSharedStorageConnector(SharedStorageConnector):
 class LoadRecoveryExampleConnector(ExampleConnector):
    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
        super().__init__(vllm_config=vllm_config, role=role)
        self._async_load = vllm_config.kv_transfer_config.get_from_extra_config(
@@ -45,7 +45,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
        self._req_to_block_ids: dict[str, list[int]] = dict()

    def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
        assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata)
        assert isinstance(connector_metadata, LoadRecoveryExampleConnectorMetadata)
        index, failed_request = next(
            (
                (i, x)
@@ -84,7 +84,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
    ) -> tuple[set[str] | None, set[str] | None]:
        if self._async_load:
            meta = self._get_connector_metadata()
            assert isinstance(meta, RogueSharedStorageConnectorMetadata)
            assert isinstance(meta, LoadRecoveryExampleConnectorMetadata)
            if meta.req_to_block_ids:
                return None, set(meta.req_to_block_ids)

@@ -126,9 +126,9 @@ class RogueSharedStorageConnector(SharedStorageConnector):
    ) -> KVConnectorMetadata:
        if not self._async_load:
            base = super().build_connector_meta(scheduler_output)
            meta = RogueSharedStorageConnectorMetadata.from_base(base)
            meta = LoadRecoveryExampleConnectorMetadata.from_base(base)
        else:
            meta = RogueSharedStorageConnectorMetadata()
            meta = LoadRecoveryExampleConnectorMetadata()
            if self._requests_need_load:
                for req_id, request in self._requests_need_load.items():
                    meta.add_request(
--- a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py
+++ b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py
@@ -26,7 +26,7 @@ def main():
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        kv_transfer_config=KVTransferConfig(
            kv_connector="SharedStorageConnector",
            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
--- a/examples/offline_inference/simple_profiling.py
+++ b/examples/offline_inference/simple_profiling.py
@@ -1,14 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import time

 from vllm import LLM, SamplingParams

 # enable torch profiler, can also be set on cmd line
 os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"

 # Sample prompts.
 prompts = [
    "Hello, my name is",
@@ -22,7 +18,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

 def main():
    # Create an LLM.
    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
    llm = LLM(
        model="facebook/opt-125m",
        tensor_parallel_size=1,
        profiler_config={
            "profiler": "torch",
            "torch_profiler_dir": "./vllm_profile",
        },
    )

    llm.start_profile()

--- a/examples/online_serving/disaggregated_encoder/README.md
+++ b/examples/online_serving/disaggregated_encoder/README.md
@@ -50,12 +50,12 @@ The vllm instances and `disagg_encoder_proxy` supports local URIs with ```{"url"

 ## EC connector and KV transfer

 The `ECSharedStorageConnector` is used to store the encoder cache on local disk and facilitate transfer. To enable the encoder disaggregation feature, add the following configuration:
 The `ECExampleConnector` is used to store the encoder cache on local disk and facilitate transfer. To enable the encoder disaggregation feature, add the following configuration:

 ```bash
 # Add to encoder instance: 
 --ec-transfer-config '{
    "ec_connector": "ECSharedStorageConnector",
    "ec_connector": "ECExampleConnector",
    "ec_role": "ec_producer",
    "ec_connector_extra_config": {
        "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
@@ -64,7 +64,7 @@ The `ECSharedStorageConnector` is used to store the encoder cache on local disk

 # Add to prefill/prefill+decode instance: 
 --ec-transfer-config '{
    "ec_connector": "ECSharedStorageConnector",
    "ec_connector": "ECExampleConnector",
    "ec_role": "ec_consumer",
    "ec_connector_extra_config": {
        "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
--- a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
@@ -102,7 +102,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
    --max-num-seqs 128 \
    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECSharedStorageConnector",
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_producer",
        "ec_connector_extra_config": {
            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
@@ -126,7 +126,7 @@ vllm serve "$MODEL" \
    --max-num-seqs 128 \
    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECSharedStorageConnector",
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_consumer",
        "ec_connector_extra_config": {
            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
--- a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
@@ -96,7 +96,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
    --max-num-seqs 128 \
    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECSharedStorageConnector",
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_producer",
        "ec_connector_extra_config": {
            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
@@ -117,7 +117,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
    --max-num-seqs 128 \
    --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECSharedStorageConnector",
        "ec_connector": "ECExampleConnector",
        "ec_role": "ec_consumer",
        "ec_connector_extra_config": {
            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"