18 Commits

Author SHA1 Message Date
Kevin Musgrave c01d589813 [Benchmarks] `auto_tune.sh`: Use hostname variable for server requests (#30529) 3 days ago
Matthew Bonanni 60dbf7d8f1 Update batch invariant to use attention config (#30704) 3 days ago
Michael Goin a450c64a30 [Bugfix] Fail instead of ignoring when CompilationConfig gets invalid args (#30708) 3 days ago
Fadi Arafeh b2191abdca [docs][fix] Update Arm CPU vLLM wheel installation docs (#30594) 3 days ago
Matthew Bonanni 51e5b3e3c4 [Bugfix] Fix ViT with FlashAttention on ROCm (#30703) 3 days ago
Isotr0py ec154c36ee [Platform] Refactor Platform attention backend selection to avoid breakpoint for OOT platform (#30212) 3 days ago
Harry Mellor 970713d4a4 Remove `SkipValidation` from `ModelConfig` (#30695) 3 days ago
mondaylord 17fec3af09 [Bugfix] Fix missing first token in tool calls during reasoning-to-tool transition (#30671) 3 days ago
yjc9696 855b101d75 [Frontend] add tools for dsv32 developer role (#30040) 3 days ago
Robert Shaw d0502b4928 [MoE][Refactor 1/N] Separate Online Quantization (#30627) 3 days ago
Max Hu 3f175f18a2 [Bugfix] Fix multimodal configuration for Qwen3VL MOE model (#30670) 3 days ago
Cyrus Leung ed586e7724 [Refactor] [3/N] Move tool parser tests and run on CPU (#30693) 3 days ago
Chauncey 2a1776b7ac [Refactor] [2/N] Move tool parsers into the vLLM main directory (#30675) 3 days ago
Nicolò Lucchesi 185c22bf2f [Misc][Hybrid allocator + kv connector] Optionally enable hybrid allocator + KV cache connector (#29805) 4 days ago
duke e4806d973a [BugFix] Add embed_input_ids method to make QWenLMHeadModel a vllm model (#30674) 4 days ago
wang.yuqi 4429d934de [Model] Automatic conversion of TokenClassification model (#30666) 4 days ago
ゆり 33278073d6 typing: Add type hints to TurnMetrics class in context.py (#30552) 4 days ago
汪志鹏 1adeb3b84c [New Model] BAGEL support (AR only) (#28439) 4 days ago
100 changed files with 1542 additions and 662 deletions
1. +5 -15 .buildkite/test-amd.yaml
2. +5 -12 .buildkite/test-pipeline.yaml
3. +3 -1 .buildkite/test_areas/misc.yaml
4. +1 -11 .buildkite/test_areas/tool_use.yaml
5. +11 -2 benchmarks/auto_tune/auto_tune.sh
6. +2 -2 docs/features/tool_calling.md
7. +20 -12 docs/getting_started/installation/cpu.arm.inc.md
8. +1 -0 docs/models/supported_models.md
9. +27 -0 examples/offline_inference/vision_language.py
10. +0 -8 tests/benchmarks/test_param_sweep.py
11. +1 -1 tests/entrypoints/openai/test_serving_chat.py
12. +1 -1 tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
13. +1 -1 tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
14. +1 -1 tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
15. +1 -1 tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
16. +1 -1 tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
17. +1 -1 tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
18. +1 -1 tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
19. +1 -1 tests/entrypoints/openai/tool_parsers/utils.py
20. +3 -3 tests/models/language/generation/test_mistral.py
21. +31 -0 tests/models/language/pooling/test_token_classification.py
22. +2 -0 tests/models/registry.py
23. +0 -0 tests/tool_parsers/__init__.py
24. +2 -2 tests/tool_parsers/test_deepseekv31_tool_parser.py
25. +1 -1 tests/tool_parsers/test_ernie45_moe_tool_parser.py
26. +2 -4 tests/tool_parsers/test_glm4_moe_tool_parser.py
27. +1 -3 tests/tool_parsers/test_jamba_tool_parser.py
28. +1 -3 tests/tool_parsers/test_kimi_k2_tool_parser.py
29. +1 -3 tests/tool_parsers/test_minimax_tool_parser.py
30. +1 -1 tests/tool_parsers/test_mistral_tool_parser.py
31. +1 -1 tests/tool_parsers/test_openai_tool_parser.py
32. +4 -6 tests/tool_parsers/test_qwen3coder_tool_parser.py
33. +1 -3 tests/tool_parsers/test_seed_oss_tool_parser.py
34. +1 -3 tests/tool_parsers/test_xlam_tool_parser.py
35. +1 -1 tests/tool_use/test_tool_choice_required.py
36. +4 -1 vllm/attention/layer.py
37. +36 -23 vllm/attention/selector.py
38. +4 -4 vllm/config/compilation.py
39. +15 -5 vllm/config/model.py
40. +3 -1 vllm/config/scheduler.py
41. +60 -36 vllm/config/vllm.py
42. +1 -1 vllm/engine/arg_utils.py
43. +9 -0 vllm/entrypoints/chat_utils.py
44. +8 -8 vllm/entrypoints/context.py
45. +1 -1 vllm/entrypoints/openai/api_server.py
46. +1 -1 vllm/entrypoints/openai/cli_args.py
47. +1 -1 vllm/entrypoints/openai/parser/responses_parser.py
48. +30 -34 vllm/entrypoints/openai/serving_chat.py
49. +1 -1 vllm/entrypoints/openai/serving_engine.py
50. +23 -140 vllm/entrypoints/openai/tool_parsers/__init__.py
51. +22 -17 vllm/model_executor/layers/batch_invariant.py
52. +154 -89 vllm/model_executor/layers/quantization/fp8.py
53. +12 -0 vllm/model_executor/models/adapters.py
54. +584 -0 vllm/model_executor/models/bagel.py
55. +3 -0 vllm/model_executor/models/qwen.py
56. +32 -0 vllm/model_executor/models/qwen2.py
57. +5 -0 vllm/model_executor/models/qwen3_vl_moe.py
58. +1 -0 vllm/model_executor/models/registry.py
59. +4 -11 vllm/platforms/cpu.py
60. +15 -59 vllm/platforms/cuda.py
61. +2 -10 vllm/platforms/interface.py
62. +9 -13 vllm/platforms/rocm.py
63. +3 -10 vllm/platforms/tpu.py
64. +4 -11 vllm/platforms/xpu.py
65. +150 -0 vllm/tool_parsers/__init__.py
66. +2 -2 vllm/tool_parsers/abstract_tool_parser.py
67. +1 -3 vllm/tool_parsers/deepseekv31_tool_parser.py
68. +3 -3 vllm/tool_parsers/deepseekv32_tool_parser.py
69. +3 -3 vllm/tool_parsers/deepseekv3_tool_parser.py
70. +3 -3 vllm/tool_parsers/ernie45_tool_parser.py
71. +1 -1 vllm/tool_parsers/gigachat3_tool_parser.py
72. +3 -3 vllm/tool_parsers/glm4_moe_tool_parser.py
73. +4 -4 vllm/tool_parsers/granite_20b_fc_tool_parser.py
74. +4 -4 vllm/tool_parsers/granite_tool_parser.py
75. +3 -3 vllm/tool_parsers/hermes_tool_parser.py
76. +4 -4 vllm/tool_parsers/hunyuan_a13b_tool_parser.py
77. +4 -4 vllm/tool_parsers/internlm2_tool_parser.py
78. +2 -2 vllm/tool_parsers/jamba_tool_parser.py
79. +3 -3 vllm/tool_parsers/kimi_k2_tool_parser.py
80. +2 -2 vllm/tool_parsers/llama4_pythonic_tool_parser.py
81. +3 -3 vllm/tool_parsers/llama_tool_parser.py
82. +1 -1 vllm/tool_parsers/longcat_tool_parser.py
83. +3 -3 vllm/tool_parsers/minimax_m2_tool_parser.py
84. +4 -4 vllm/tool_parsers/minimax_tool_parser.py
85. +3 -3 vllm/tool_parsers/mistral_tool_parser.py
86. +2 -2 vllm/tool_parsers/olmo3_tool_parser.py
87. +2 -2 vllm/tool_parsers/openai_tool_parser.py
88. +2 -2 vllm/tool_parsers/phi4mini_tool_parser.py
89. +2 -2 vllm/tool_parsers/pythonic_tool_parser.py
90. +3 -3 vllm/tool_parsers/qwen3coder_tool_parser.py
91. +3 -3 vllm/tool_parsers/qwen3xml_tool_parser.py
92. +3 -3 vllm/tool_parsers/seed_oss_tool_parser.py
93. +3 -3 vllm/tool_parsers/step3_tool_parser.py
94. +0 -0 vllm/tool_parsers/utils.py
95. +1 -1 vllm/tool_parsers/xlam_tool_parser.py
96. +1 -0 vllm/transformers_utils/config.py
97. +2 -0 vllm/transformers_utils/configs/__init__.py
98. +53 -0 vllm/transformers_utils/configs/bagel.py
99. +2 -0 vllm/transformers_utils/processors/__init__.py
100. +73 -0 vllm/transformers_utils/processors/bagel.py

.buildkite/test-amd.yaml (+5 -15)

@@ -61,8 +61,8 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
timeout_in_minutes: 20
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
grade: Blocking
@@ -73,6 +73,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -82,6 +83,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config

@@ -759,19 +761,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- pytest -v -s -m 'not cpu_test' tool_use

- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
- pytest -v -s tool_use

##### models test #####



.buildkite/test-pipeline.yaml (+5 -12)

@@ -57,8 +57,8 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
timeout_in_minutes: 20
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
@@ -66,6 +66,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -75,6 +76,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config

@@ -672,16 +674,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- pytest -v -s -m 'not cpu_test' tool_use

- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
- pytest -v -s tool_use

##### models test #####



.buildkite/test_areas/misc.yaml (+3 -1)

@@ -115,7 +115,7 @@ steps:

- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on: ~
timeout_in_minutes: 20
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
@@ -123,6 +123,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -132,6 +133,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config



.buildkite/test_areas/tool_use.yaml (+1 -11)

@@ -10,14 +10,4 @@ steps:
- vllm/
- tests/tool_use
commands:
- pytest -v -s -m 'not cpu_test' tool_use

- label: OpenAI-Compatible Tool Use (CPU)
depends_on: ~
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
- pytest -v -s tool_use

benchmarks/auto_tune/auto_tune.sh (+11 -2)

@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
HOSTNAME=$(hostname)
if [[ -z "$HOSTNAME" ]]; then
echo "Error: Failed to determine hostname." >&2
exit 1
fi

LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
@@ -82,6 +87,7 @@ start_server() {
"$MODEL"
"--disable-log-requests"
"--port" "8004"
"--host" "$HOSTNAME"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -113,7 +119,7 @@ start_server() {
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break

RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
@@ -173,6 +179,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -188,7 +195,7 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
@@ -204,6 +211,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -304,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 \
--profile &> "$bm_log"
else


docs/features/tool_calling.md (+2 -2)

@@ -420,7 +420,7 @@ Flags: `--tool-call-parser pythonic --chat-template {see_above}`

## How to Write a Tool Parser Plugin

A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py](../../vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py).
A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/tool_parsers/hermes_tool_parser.py](../../vllm/tool_parsers/hermes_tool_parser.py).

Here is a summary of a plugin file:

@@ -468,7 +468,7 @@ Here is a summary of a plugin file:
# register the tool parser to ToolParserManager
ToolParserManager.register_lazy_module(
name="example",
module_path="vllm.entrypoints.openai.tool_parsers.example",
module_path="vllm.tool_parsers.example",
class_name="ExampleToolParser",
)



docs/getting_started/installation/cpu.arm.inc.md (+20 -12)

@@ -16,15 +16,15 @@ vLLM offers basic model inferencing and serving on Arm CPU platform, with suppor
# --8<-- [start:pre-built-wheels]

Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels contain pre-compiled C++ binaries.
Please replace `<version>` in the commands below with a specific version string (e.g., `0.11.2`).

```bash
uv pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
```

??? console "pip"
```bash
pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
```

The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
@@ -35,20 +35,28 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe

* `https://wheels.vllm.ai/nightly/cpu/vllm`

To install from nightly index, copy the link address of the `*.whl` under this index to run, for example:

To install from nightly index, run:
```bash
uv pip install -U https://wheels.vllm.ai/c756fb678184b867ed94e5613a529198f1aee423/vllm-0.13.0rc2.dev11%2Bgc756fb678.cpu-cp38-abi3-manylinux_2_31_aarch64.whl # current nightly build (the filename will change!)
uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu
```

??? console "pip (there's a caveat)"

Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes).

If you insist on using `pip`, you have to specify the full URL (link address) of the wheel file (which can be obtained from https://wheels.vllm.ai/nightly/cpu/vllm).

```bash
pip install https://wheels.vllm.ai/4fa7ce46f31cbd97b4651694caf9991cc395a259/vllm-0.13.0rc2.dev104%2Bg4fa7ce46f.cpu-cp38-abi3-manylinux_2_35_aarch64.whl # current nightly build (the filename will change!)
```

**Install specific revisions**

If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), specify the full commit hash in the index:
https://wheels.vllm.ai/${VLLM_COMMIT}/cpu/vllm .
Then, copy the link address of the `*.whl` under this index to run:
If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:

```bash
uv pip install -U <wheel-url>
export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu
```

# --8<-- [end:pre-built-wheels]
@@ -103,10 +111,10 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.

Stable vLLM Docker images are being pre-built for Arm from version 0.12.0. Available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo).
Please replace `<version>` in the command below with a specific version string (e.g., `0.12.0`).

```bash
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v<version>
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${VLLM_VERSION}
```

You can also access the latest code with Docker images. These are not intended for production use and are meant for CI and testing only. They will expire after several days.


docs/models/supported_models.md (+1 -0)

@@ -661,6 +661,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |


examples/offline_inference/vision_language.py (+27 -0)

@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
)


def run_bagel(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "ByteDance-Seed/BAGEL-7B-MoT"

engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={modality: 1},
)

prompts = [
(
f"<|im_start|>user\n<|image_pad|>\n{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
for question in questions
]

return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)


# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_example_map = {
"aria": run_aria,
"aya_vision": run_aya_vision,
"bagel": run_bagel,
"bee": run_bee,
"blip-2": run_blip2,
"chameleon": run_chameleon,


tests/benchmarks/test_param_sweep.py (+0 -8)

@@ -23,14 +23,6 @@ class TestParameterSweepItem:
{"compilation_config.use_inductor_graph_partition": True},
"--compilation-config.use_inductor_graph_partition=true",
),
(
{"compilation_config.use_inductor": False},
"--compilation-config.use_inductor=false",
),
(
{"compilation_config.use_inductor": True},
"--compilation-config.use_inductor=true",
),
],
)
def test_nested_boolean_params(self, input_dict, expected):


tests/entrypoints/openai/test_serving_chat.py (+1 -1)

@@ -19,9 +19,9 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers import ToolParserManager
from vllm.v1.engine.async_llm import AsyncLLM

from ...utils import RemoteOpenAIServer


tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py (+1 -1)

@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager

SIMPLE_ARGS_DICT = {
"action": "create",


tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py (+1 -1)

@@ -6,8 +6,8 @@ import json
import pytest

from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser

from ....utils import RemoteOpenAIServer



tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py (+1 -1)

@@ -12,7 +12,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tool_parsers import ToolParser, ToolParserManager


def make_tool_call(name, arguments):


tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py (+1 -1)

@@ -6,8 +6,8 @@ from unittest.mock import MagicMock, patch
import pytest

from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
from vllm.entrypoints.openai.tool_parsers.llama_tool_parser import Llama3JsonToolParser
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser


@pytest.fixture


tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py (+1 -1)

@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager

# Test cases similar to pythonic parser but with Llama4 specific format
SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"


tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py (+1 -1)

@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager

# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"


tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py (+1 -1)

@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager

# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"


tests/entrypoints/openai/tool_parsers/utils.py (+1 -1)

@@ -10,8 +10,8 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser


class StreamingToolReconstructor:


tests/models/language/generation/test_mistral.py (+3 -3)

@@ -5,12 +5,12 @@ import json

import pytest

from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
from vllm.sampling_params import SamplingParams
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers.mistral_tool_parser import (
MistralToolCall,
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
from vllm.tokenizers.mistral import MistralTokenizer

from ...utils import check_logprobs_close



tests/models/language/pooling/test_token_classification.py (+31 -0)

@@ -68,3 +68,34 @@ def test_modernbert_models(
hf_output = torch.tensor(hf_output).cpu().float()
vllm_output = torch.tensor(vllm_output).cpu().float()
assert torch.allclose(hf_output, vllm_output, atol=1e-2)


@pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
@pytest.mark.parametrize("dtype", ["float"])
@torch.inference_mode
def test_auto_conversion(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
) -> None:
with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts)

with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
) as hf_model:
tokenizer = hf_model.tokenizer
hf_outputs = []
for prompt in example_prompts:
inputs = tokenizer([prompt], return_tensors="pt")
inputs = hf_model.wrap_device(inputs)
output = hf_model.model(**inputs)
hf_outputs.append(softmax(output.logits[0]))

# check logits difference
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output).cpu().float()
vllm_output = torch.tensor(vllm_output).cpu().float()
assert torch.allclose(hf_output, vllm_output, atol=1e-2)

tests/models/registry.py (+2 -0)

@@ -573,6 +573,7 @@ _AUTOMATIC_CONVERTED_MODELS = {
"Qwen3ForSequenceClassification": _HfExamplesInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
),
"Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
}

_MULTIMODAL_EXAMPLE_MODELS = {
@@ -582,6 +583,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
"BeeForConditionalGeneration": _HfExamplesInfo(
"Open-Bee/Bee-8B-RL",
trust_remote_code=True,


tests/tool_parsers/__init__.py (+0 -0)


tests/tool_use/test_deepseekv31_tool_parser.py → tests/tool_parsers/test_deepseekv31_tool_parser.py

@@ -3,10 +3,10 @@

import pytest

from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import (
from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers.deepseekv31_tool_parser import (
DeepSeekV31ToolParser,
)
from vllm.tokenizers import get_tokenizer

MODEL = "deepseek-ai/DeepSeek-V3.1"


tests/tool_use/test_ernie45_moe_tool_parser.py → tests/tool_parsers/test_ernie45_moe_tool_parser.py

@@ -13,9 +13,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
from vllm.tool_parsers.ernie45_tool_parser import Ernie45ToolParser

# Use a common model that is likely to be available
MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking"

tests/tool_use/test_glm4_moe_tool_parser.py → tests/tool_parsers/test_glm4_moe_tool_parser.py

@@ -7,12 +7,10 @@ import json
import pytest

from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import (
from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers.glm4_moe_tool_parser import (
Glm4MoeModelToolParser,
)
from vllm.tokenizers import get_tokenizer

pytestmark = pytest.mark.cpu_test

pytest.skip("skip glm4_moe parser test", allow_module_level=True)
# Use a common model that is likely to be available

tests/tool_use/test_jamba_tool_parser.py → tests/tool_parsers/test_jamba_tool_parser.py

@@ -9,11 +9,9 @@ import pytest
from partial_json_parser.core.options import Allow

from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.jamba_tool_parser import JambaToolParser

MODEL = "ai21labs/Jamba-tiny-dev"


tests/tool_use/test_kimi_k2_tool_parser.py → tests/tool_parsers/test_kimi_k2_tool_parser.py

@@ -7,10 +7,8 @@ import json
import pytest

from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser
from vllm.tokenizers import get_tokenizer

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser

# Use a common model that is likely to be available
MODEL = "moonshotai/Kimi-K2-Instruct"

tests/tool_use/test_minimax_tool_parser.py → tests/tool_parsers/test_minimax_tool_parser.py

@@ -12,10 +12,8 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.minimax_tool_parser import MinimaxToolParser
from vllm.tokenizers import get_tokenizer

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.minimax_tool_parser import MinimaxToolParser

# Use a common model that is likely to be available
MODEL = "MiniMaxAi/MiniMax-M1-40k"

tests/tool_use/test_mistral_tool_parser.py → tests/tool_parsers/test_mistral_tool_parser.py

@@ -12,10 +12,10 @@ from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall
from partial_json_parser.core.options import Allow

from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers.mistral_tool_parser import MistralToolParser


@pytest.fixture(scope="module")

tests/tool_use/test_openai_tool_parser.py → tests/tool_parsers/test_openai_tool_parser.py

@@ -15,8 +15,8 @@ from openai_harmony import (
)

from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.openai_tool_parser import OpenAIToolParser
from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers.openai_tool_parser import OpenAIToolParser

MODEL = "gpt2"


tests/tool_use/test_qwen3coder_tool_parser.py → tests/tool_parsers/test_qwen3coder_tool_parser.py

@@ -13,14 +13,12 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import (
Qwen3CoderToolParser,
)
from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.qwen3coder_tool_parser import (
Qwen3CoderToolParser,
)
from vllm.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser

MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"


tests/tool_use/test_seed_oss_tool_parser.py → tests/tool_parsers/test_seed_oss_tool_parser.py

@@ -14,11 +14,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.seed_oss_tool_parser import SeedOssToolParser

# Use a common model that is likely to be available
MODEL = "ByteDance-Seed/Seed-OSS-36B-Instruct"

tests/tool_use/test_xlam_tool_parser.py → tests/tool_parsers/test_xlam_tool_parser.py

@@ -12,11 +12,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally

pytestmark = pytest.mark.cpu_test
from vllm.tool_parsers.xlam_tool_parser import xLAMToolParser

# Use a common model that is likely to be available
MODEL = "Salesforce/Llama-xLAM-2-8B-fc-r"

tests/tool_use/test_tool_choice_required.py (+1 -1)

@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
ChatCompletionToolsParam,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
from vllm.tool_parsers.utils import get_json_schema_from_tools

pytestmark = pytest.mark.cpu_test



vllm/attention/layer.py (+4 -1)

@@ -464,7 +464,10 @@ class MultiHeadAttention(nn.Module):
}

self.fa_version = None
if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
if (
self.attn_backend == AttentionBackendEnum.FLASH_ATTN
and current_platform.is_cuda()
):
self.fa_version = get_flash_attn_version()
assert self._flash_attn_varlen_func is not None
self._flash_attn_varlen_func = functools.partial(


vllm/attention/selector.py (+36 -23)

@@ -2,11 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from functools import cache
from typing import cast, get_args
from typing import NamedTuple, cast, get_args

import torch

from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.abstract import AttentionBackend, AttentionType
from vllm.attention.backends.registry import (
MAMBA_TYPE_TO_BACKEND_MAP,
MambaAttentionBackendEnum,
@@ -18,6 +18,31 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
logger = init_logger(__name__)


class AttentionSelectorConfig(NamedTuple):
head_size: int
dtype: torch.dtype
kv_cache_dtype: CacheDType | None
block_size: int | None
use_mla: bool = False
has_sink: bool = False
use_sparse: bool = False
use_mm_prefix: bool = False
attn_type: str = AttentionType.DECODER

def __repr__(self):
return (
f"AttentionSelectorConfig(head_size={self.head_size}, "
f"dtype={self.dtype}, "
f"kv_cache_dtype={self.kv_cache_dtype}, "
f"block_size={self.block_size}, "
f"use_mla={self.use_mla}, "
f"has_sink={self.has_sink}, "
f"use_sparse={self.use_sparse}, "
f"use_mm_prefix={self.use_mm_prefix}, "
f"attn_type={self.attn_type})"
)


def get_attn_backend(
head_size: int,
dtype: torch.dtype,
@@ -43,8 +68,7 @@ def get_attn_backend(
vllm_config = get_current_vllm_config()
backend_enum = vllm_config.attention_config.backend

return _cached_get_attn_backend(
backend=backend_enum,
attn_selector_config = AttentionSelectorConfig(
head_size=head_size,
dtype=dtype,
kv_cache_dtype=cast(CacheDType | None, kv_cache_dtype),
@@ -53,36 +77,25 @@ def get_attn_backend(
has_sink=has_sink,
use_sparse=use_sparse,
use_mm_prefix=use_mm_prefix,
attn_type=attn_type,
attn_type=attn_type or AttentionType.DECODER,
)

return _cached_get_attn_backend(
backend=backend_enum,
attn_selector_config=attn_selector_config,
)


@cache
def _cached_get_attn_backend(
backend,
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: CacheDType | None,
block_size: int | None,
use_mla: bool = False,
has_sink: bool = False,
use_sparse: bool = False,
use_mm_prefix: bool = False,
attn_type: str | None = None,
attn_selector_config: AttentionSelectorConfig,
) -> type[AttentionBackend]:
from vllm.platforms import current_platform

attention_cls = current_platform.get_attn_backend_cls(
backend,
head_size,
dtype,
kv_cache_dtype,
block_size,
use_mla,
has_sink,
use_sparse,
use_mm_prefix,
attn_type,
attn_selector_config=attn_selector_config,
)
if not attention_cls:
raise ValueError(


vllm/config/compilation.py (+4 -4)

@@ -8,7 +8,7 @@ from dataclasses import field
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar, Literal

from pydantic import Field, TypeAdapter, field_validator
from pydantic import ConfigDict, Field, TypeAdapter, field_validator
from pydantic.dataclasses import dataclass

import vllm.envs as envs
@@ -96,7 +96,7 @@ class CUDAGraphMode(enum.Enum):


@config
@dataclass
@dataclass(config=ConfigDict(extra="forbid"))
class PassConfig:
"""Configuration for custom Inductor passes.

@@ -251,7 +251,7 @@ class DynamicShapesType(str, enum.Enum):


@config
@dataclass
@dataclass(config=ConfigDict(extra="forbid"))
class DynamicShapesConfig:
"""Configuration to control/debug torch compile dynamic shapes."""

@@ -290,7 +290,7 @@ class DynamicShapesConfig:


@config
@dataclass
@dataclass(config=ConfigDict(extra="forbid"))
class CompilationConfig:
"""Configuration for compilation.



vllm/config/model.py (+15 -5)

@@ -8,7 +8,7 @@ from functools import cached_property
from typing import TYPE_CHECKING, Any, Literal, cast, get_args

import torch
from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from transformers.configuration_utils import ALLOWED_LAYER_TYPES
@@ -109,7 +109,7 @@ class ModelConfig:
"""Convert the model using adapters defined in
[vllm.model_executor.models.adapters][]. The most common use case is to
adapt a text generation model to be used for pooling tasks."""
tokenizer: SkipValidation[str] = None # type: ignore
tokenizer: str = Field(default=None)
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode | str = "auto"
@@ -164,7 +164,7 @@ class ModelConfig:
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
max_model_len: SkipValidation[int] = None # type: ignore
max_model_len: int = Field(default=None, gt=0)
"""Model context length (prompt and output). If unspecified, will be
automatically derived from the model config.

@@ -175,7 +175,7 @@ class ModelConfig:
- 25.6k -> 25,600"""
spec_target_max_model_len: int | None = None
"""Specify the maximum length for spec decoding draft models."""
quantization: SkipValidation[QuantizationMethods | None] = None
quantization: QuantizationMethods | str | None = None
"""Method used to quantize the weights. If `None`, we first check the
`quantization_config` attribute in the model config file. If that is
`None`, we assume the model weights are not quantized and use `dtype` to
@@ -597,6 +597,14 @@ class ModelConfig:
self._verify_cuda_graph()
self._verify_bnb_config()

@field_validator("tokenizer", "max_model_len", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
"""Skip validation if the value is `None` when initialisation is delayed."""
if value is None:
return value
return handler(value)

@field_validator("tokenizer_mode", mode="after")
def _lowercase_tokenizer_mode(cls, tokenizer_mode: str) -> str:
return tokenizer_mode.lower()
@@ -610,13 +618,14 @@ class ModelConfig:

@model_validator(mode="after")
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
"""Called after __post_init__"""
if not isinstance(self.tokenizer, str):
raise ValueError(
f"tokenizer must be a string, got "
f"{type(self.tokenizer).__name__}: {self.tokenizer!r}. "
"Please provide a valid tokenizer path or HuggingFace model ID."
)
if not isinstance(self.max_model_len, int) or self.max_model_len <= 0:
if not isinstance(self.max_model_len, int):
raise ValueError(
f"max_model_len must be a positive integer, "
f"got {type(self.max_model_len).__name__}: {self.max_model_len!r}. "
@@ -1796,6 +1805,7 @@ _SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
("ForTextEncoding", ("pooling", "embed")),
("EmbeddingModel", ("pooling", "embed")),
("ForSequenceClassification", ("pooling", "classify")),
("ForTokenClassification", ("pooling", "classify")),
("ForAudioClassification", ("pooling", "classify")),
("ForImageClassification", ("pooling", "classify")),
("ForVideoClassification", ("pooling", "classify")),


vllm/config/scheduler.py (+3 -1)

@@ -122,10 +122,12 @@ class SchedulerConfig:
the default scheduler. Can be a class directly or the path to a class of
form "mod.custom_class"."""

disable_hybrid_kv_cache_manager: bool = False
disable_hybrid_kv_cache_manager: bool | None = None
"""If set to True, KV cache manager will allocate the same size of KV cache
for all attention layers even if there are multiple type of attention layers
like full attention and sliding window attention.
If set to None, the default value will be determined based on the environment
and starting configuration.
"""

async_scheduling: bool = False


vllm/config/vllm.py (+60 -36)

@@ -887,17 +887,48 @@ class VllmConfig:
if not self.instance_id:
self.instance_id = random_uuid()[:5]

if not self.scheduler_config.disable_hybrid_kv_cache_manager:
# logger should only print warning message for hybrid models. As we
# can't know whether the model is hybrid or not now, so we don't log
# warning message here and will log it later.
if not current_platform.support_hybrid_kv_cache():
# Hybrid KV cache manager is not supported on non-GPU platforms.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
# Hybrid KV cache manager (HMA) runtime rules:
# - Explicit enable (--no-disable-kv-cache-manager): error if runtime
# disables it
# - No preference: auto-disable for unsupported features (e.g. kv connector)
# - Explicit disable (--disable-kv-cache-manager): always respect it
need_disable_hybrid_kv_cache_manager = False
# logger should only print warning message for hybrid models. As we
# can't know whether the model is hybrid or not now, so we don't log
# warning message here and will log it later.
if not current_platform.support_hybrid_kv_cache():
# Hybrid KV cache manager is not supported on non-GPU platforms.
need_disable_hybrid_kv_cache_manager = True
if self.kv_events_config is not None:
# Hybrid KV cache manager is not compatible with KV events.
need_disable_hybrid_kv_cache_manager = True
if (
self.model_config is not None
and self.model_config.attention_chunk_size is not None
):
if (
self.speculative_config is not None
and self.speculative_config.use_eagle()
):
# Hybrid KV cache manager is not yet supported with chunked
# local attention + eagle.
need_disable_hybrid_kv_cache_manager = True
elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
logger.warning(
"There is a latency regression when using chunked local"
" attention with the hybrid KV cache manager. Disabling"
" it, by default. To enable it, set the environment "
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
)
# Hybrid KV cache manager is not yet supported with chunked
# local attention.
need_disable_hybrid_kv_cache_manager = True

if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
# Default to disable HMA, but only if the user didn't express a preference.
if self.kv_transfer_config is not None:
# NOTE(Kuntai): turn HMA off for connector for now.
# TODO(Kuntai): have a more elegent solution to check and
# turn off HMA for connector that does not support HMA.
# NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
need_disable_hybrid_kv_cache_manager = True
logger.warning(
"Turning off hybrid kv cache manager because "
"`--kv-transfer-config` is set. This will reduce the "
@@ -905,33 +936,26 @@ class VllmConfig:
"or Mamba attention. If you are a developer of kv connector"
", please consider supporting hybrid kv cache manager for "
"your connector by making sure your connector is a subclass"
" of `SupportsHMA` defined in kv_connector/v1/base.py."
" of `SupportsHMA` defined in kv_connector/v1/base.py and"
" use --no-disable-hybrid-kv-cache-manager to start vLLM."
)
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_events_config is not None:
# Hybrid KV cache manager is not compatible with KV events.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if (
self.model_config is not None
and self.model_config.attention_chunk_size is not None
):
if (
self.speculative_config is not None
and self.speculative_config.use_eagle()
):
# Hybrid KV cache manager is not yet supported with chunked
# local attention + eagle.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
logger.warning(
"There is a latency regression when using chunked local"
" attention with the hybrid KV cache manager. Disabling"
" it, by default. To enable it, set the environment "
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
)
# Hybrid KV cache manager is not yet supported with chunked
# local attention.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
self.scheduler_config.disable_hybrid_kv_cache_manager = (
need_disable_hybrid_kv_cache_manager
)
elif (
self.scheduler_config.disable_hybrid_kv_cache_manager is False
and need_disable_hybrid_kv_cache_manager
):
raise ValueError(
"Hybrid KV cache manager was explicitly enabled but is not "
"supported in this configuration. Consider omitting the "
"--no-disable-hybrid-kv-cache-manager flag to let vLLM decide"
" automatically."
)

if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
# Default to enable HMA if not explicitly disabled by user or logic above.
self.scheduler_config.disable_hybrid_kv_cache_manager = False

if self.compilation_config.debug_dump_path:
self.compilation_config.debug_dump_path = (


vllm/engine/arg_utils.py (+1 -1)

@@ -491,7 +491,7 @@ class EngineArgs:
enable_chunked_prefill: bool | None = None
disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input

disable_hybrid_kv_cache_manager: bool = (
disable_hybrid_kv_cache_manager: bool | None = (
SchedulerConfig.disable_hybrid_kv_cache_manager
)



vllm/entrypoints/chat_utils.py (+9 -0)

@@ -24,6 +24,7 @@ from openai.types.chat import (
ChatCompletionContentPartInputAudioParam,
ChatCompletionContentPartRefusalParam,
ChatCompletionContentPartTextParam,
ChatCompletionFunctionToolParam,
ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam,
)
@@ -269,6 +270,9 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
reasoning: str | None
"""The reasoning content for interleaved thinking."""

tools: list[ChatCompletionFunctionToolParam] | None
"""The tools for developer role."""


ChatCompletionMessageParam: TypeAlias = (
OpenAIChatCompletionMessageParam
@@ -300,6 +304,9 @@ class ConversationMessage(TypedDict, total=False):
reasoning_content: str | None
"""Deprecated: The reasoning content for interleaved thinking."""

tools: list[ChatCompletionFunctionToolParam] | None
"""The tools for developer role."""


# Passed in by user
ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
@@ -1619,6 +1626,8 @@ def _parse_chat_message_content(
if "name" in message and isinstance(message["name"], str):
result_msg["name"] = message["name"]

if role == "developer":
result_msg["tools"] = message.get("tools", None)
return result




vllm/entrypoints/context.py (+8 -8)

@@ -34,13 +34,13 @@ from vllm.entrypoints.openai.protocol import (
ResponseRawMessageAndToken,
ResponsesRequest,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
from vllm.entrypoints.responses_utils import construct_tool_dicts
from vllm.entrypoints.tool import Tool
from vllm.entrypoints.tool_server import ToolServer
from vllm.outputs import RequestOutput
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.tokenizers.protocol import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import ToolParser
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import random_uuid

@@ -74,24 +74,24 @@ class TurnMetrics:

def __init__(
self,
input_tokens=0,
output_tokens=0,
cached_input_tokens=0,
tool_output_tokens=0,
):
input_tokens: int = 0,
output_tokens: int = 0,
cached_input_tokens: int = 0,
tool_output_tokens: int = 0,
) -> None:
self.input_tokens = input_tokens
self.output_tokens = output_tokens
self.cached_input_tokens = cached_input_tokens
self.tool_output_tokens = tool_output_tokens

def reset(self):
def reset(self) -> None:
"""Reset counters for a new turn."""
self.input_tokens = 0
self.output_tokens = 0
self.cached_input_tokens = 0
self.tool_output_tokens = 0

def copy(self):
def copy(self) -> "TurnMetrics":
"""Create a copy of this turn's token counts."""
return TurnMetrics(
self.input_tokens,


vllm/entrypoints/openai/api_server.py (+1 -1)

@@ -72,7 +72,6 @@ from vllm.entrypoints.openai.serving_transcription import (
OpenAIServingTranscription,
OpenAIServingTranslation,
)
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.serving import ServingClassification
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
@@ -95,6 +94,7 @@ from vllm.entrypoints.utils import (
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tasks import POOLING_TASKS
from vllm.tool_parsers import ToolParserManager
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.gc_utils import freeze_gc_heap


vllm/entrypoints/openai/cli_args.py (+1 -1)

@@ -27,8 +27,8 @@ from vllm.entrypoints.constants import (
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
)
from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.logger import init_logger
from vllm.tool_parsers import ToolParserManager
from vllm.utils.argparse_utils import FlexibleArgumentParser

logger = init_logger(__name__)


vllm/entrypoints/openai/parser/responses_parser.py (+1 -1)

@@ -12,10 +12,10 @@ from openai.types.responses.response_reasoning_item import (
)

from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
from vllm.outputs import CompletionOutput
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.tokenizers.protocol import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import ToolParser
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import random_uuid



vllm/entrypoints/openai/serving_chat.py (+30 -34)

@@ -57,8 +57,6 @@ from vllm.entrypoints.openai.serving_engine import (
clamp_prompt_logprobs,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.inputs.data import TokensPrompt
@@ -73,6 +71,8 @@ from vllm.tokenizers.mistral import (
truncate_tool_call_ids,
validate_request_params,
)
from vllm.tool_parsers import ToolParser
from vllm.tool_parsers.mistral_tool_parser import MistralToolCall
from vllm.utils.collection_utils import as_list
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters

@@ -964,21 +964,9 @@ class OpenAIServingChat(OpenAIServing):
assert reasoning_end_arr is not None
output_token_ids = as_list(output.token_ids)
if not reasoning_end_arr[i]:
delta_message = (
reasoning_parser.extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
previous_token_ids,
current_token_ids,
output_token_ids,
)
)
# When encountering think end id in prompt_token_ids
# i.e {"enable_thinking": False},
# set reasoning status to end.
# Remove the text and token ids related
# to 'reasoning'.
if (
res.prompt_token_ids
and reasoning_parser.is_reasoning_end(
@@ -987,30 +975,38 @@ class OpenAIServingChat(OpenAIServing):
):
reasoning_end_arr[i] = True
current_token_ids = output_token_ids
if delta_message and delta_message.content:
current_text = delta_message.content
delta_message.content = None
else:
current_text = ""
# When encountering think end id in delta_token_ids,
# set reasoning status to end.
# Remove the text and token ids related
# to 'reasoning'.
if reasoning_parser.is_reasoning_end(output_token_ids):
reasoning_end_arr[i] = True
current_token_ids = (
reasoning_parser.extract_content_ids(
output_token_ids
# Don't update current_text, keep it as is from delta
else:
delta_message = (
reasoning_parser.extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
previous_token_ids,
current_token_ids,
output_token_ids,
)
)
if delta_message and delta_message.content:
current_text = delta_message.content
delta_message.content = None
else:
current_text = ""

# When encountering think end id in delta_token_ids,
# set reasoning status to end.
# Remove the text and token ids related
# to 'reasoning'.
if reasoning_parser.is_reasoning_end(output_token_ids):
reasoning_end_arr[i] = True
current_token_ids = (
reasoning_parser.extract_content_ids(
output_token_ids
)
)
if delta_message and delta_message.content:
current_text = delta_message.content
delta_message.content = None
else:
current_text = ""

# handle tool calls only after reasoning is done,
else:
if reasoning_end_arr[i]:
delta_token_ids = output_token_ids
# First time to tool call,
# add the remaining text and token ids


+ 1
- 1
vllm/entrypoints/openai/serving_engine.py View File

@@ -59,7 +59,6 @@ from vllm.entrypoints.openai.protocol import (
TranslationRequest,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
@@ -104,6 +103,7 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers import ToolParser, ToolParserManager
from vllm.tracing import (
contains_trace_headers,
extract_trace_headers,


+ 23
- 140
vllm/entrypoints/openai/tool_parsers/__init__.py View File

@@ -1,150 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
ToolParserManager,
)
import warnings

__all__ = ["ToolParser", "ToolParserManager"]

def __getattr__(name: str):
if name == "ToolParser":
from vllm.tool_parsers import ToolParser

"""
Register a lazy module mapping.
warnings.warn(
"`vllm.entrypoints.openai.tool_parsers.ToolParser` has been moved to "
"`vllm.tool_parsers.ToolParser`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)

Example:
ToolParserManager.register_lazy_module(
name="kimi_k2",
module_path="vllm.entrypoints.openai.tool_parsers.kimi_k2_parser",
class_name="KimiK2ToolParser",
)
"""
return ToolParser
if name == "ToolParserManager":
from vllm.tool_parsers import ToolParserManager

warnings.warn(
"`vllm.entrypoints.openai.tool_parsers.ToolParserManager` "
"has been moved to `vllm.tool_parsers.ToolParserManager`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)

_TOOL_PARSERS_TO_REGISTER = {
"deepseek_v3": ( # name
"deepseekv3_tool_parser", # filename
"DeepSeekV3ToolParser", # class_name
),
"deepseek_v31": (
"deepseekv31_tool_parser",
"DeepSeekV31ToolParser",
),
"deepseek_v32": (
"deepseekv32_tool_parser",
"DeepSeekV32ToolParser",
),
"ernie45": (
"ernie45_tool_parser",
"Ernie45ToolParser",
),
"glm45": (
"glm4_moe_tool_parser",
"Glm4MoeModelToolParser",
),
"granite-20b-fc": (
"granite_20b_fc_tool_parser",
"Granite20bFCToolParser",
),
"granite": (
"granite_tool_parser",
"GraniteToolParser",
),
"hermes": (
"hermes_tool_parser",
"Hermes2ProToolParser",
),
"hunyuan_a13b": (
"hunyuan_a13b_tool_parser",
"HunyuanA13BToolParser",
),
"internlm": (
"internlm2_tool_parser",
"Internlm2ToolParser",
),
"jamba": (
"jamba_tool_parser",
"JambaToolParser",
),
"kimi_k2": (
"kimi_k2_tool_parser",
"KimiK2ToolParser",
),
"llama3_json": (
"llama_tool_parser",
"Llama3JsonToolParser",
),
"llama4_json": (
"llama_tool_parser",
"Llama3JsonToolParser",
),
"llama4_pythonic": (
"llama4_pythonic_tool_parser",
"Llama4PythonicToolParser",
),
"longcat": (
"longcat_tool_parser",
"LongcatFlashToolParser",
),
"minimax_m2": (
"minimax_m2_tool_parser",
"MinimaxM2ToolParser",
),
"minimax": (
"minimax_tool_parser",
"MinimaxToolParser",
),
"mistral": (
"mistral_tool_parser",
"MistralToolParser",
),
"olmo3": (
"olmo3_tool_parser",
"Olmo3PythonicToolParser",
),
"openai": (
"openai_tool_parser",
"OpenAIToolParser",
),
"phi4_mini_json": (
"phi4mini_tool_parser",
"Phi4MiniJsonToolParser",
),
"pythonic": (
"pythonic_tool_parser",
"PythonicToolParser",
),
"qwen3_coder": (
"qwen3coder_tool_parser",
"Qwen3CoderToolParser",
),
"qwen3_xml": (
"qwen3xml_tool_parser",
"Qwen3XMLToolParser",
),
"seed_oss": (
"seed_oss_tool_parser",
"SeedOssToolParser",
),
"step3": (
"step3_tool_parser",
"Step3ToolParser",
),
"xlam": (
"xlam_tool_parser",
"xLAMToolParser",
),
"gigachat3": (
"gigachat3_tool_parser",
"GigaChat3ToolParser",
),
}
return ToolParserManager


def register_lazy_tool_parsers():
for name, (file_name, class_name) in _TOOL_PARSERS_TO_REGISTER.items():
module_path = f"vllm.entrypoints.openai.tool_parsers.{file_name}"
ToolParserManager.register_lazy_module(name, module_path, class_name)


register_lazy_tool_parsers()
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
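
For downstream code, the shim above keeps the old import path working while emitting a DeprecationWarning; only the import location changes. A minimal before/after sketch (paths taken from the diff above):

# Deprecated path: still resolves through the __getattr__ shim above,
# but warns and will be removed in v0.14.
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager

# Preferred path going forward:
from vllm.tool_parsers import ToolParser, ToolParserManager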

+ 22
- 17
vllm/model_executor/layers/batch_invariant.py View File

@@ -6,7 +6,7 @@ from typing import Any

import torch

import vllm.envs as envs
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
@@ -1004,27 +1004,30 @@ def vllm_is_batch_invariant() -> bool:
return VLLM_BATCH_INVARIANT


def override_envs_for_invariance():
curr_attn_backend = envs.VLLM_ATTENTION_BACKEND
def override_envs_for_invariance(
attention_backend: AttentionBackendEnum | None,
):
supported_backends = [
"FLASH_ATTN", # best supported backend
"FLASHINFER",
"FLASH_ATTN_MLA",
"TRITON_MLA",
AttentionBackendEnum.FLASH_ATTN, # best supported backend
AttentionBackendEnum.FLASHINFER,
AttentionBackendEnum.FLASH_ATTN_MLA,
AttentionBackendEnum.TRITON_MLA,
# Not yet supported MLA backends
# "FLASHMLA",
# "FLEX_ATTENTION", # IMA issue even if we disable batch invariance
# "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967
# AttentionBackendEnum.FLASHMLA,
# AttentionBackendEnum.FLEX_ATTENTION, # IMA issue
# AttentionBackendEnum.FLASHINFER_MLA, # PR #28967
]
if curr_attn_backend not in supported_backends:
if attention_backend not in supported_backends:
supported_names = [b.name for b in supported_backends]
backend_name = attention_backend.name if attention_backend else None
error = (
"VLLM batch_invariant mode requires an attention backend in "
f"{supported_backends}, but got '{curr_attn_backend}'. "
"Please set the 'VLLM_ATTENTION_BACKEND' environment variable "
"to one of the supported backends before enabling batch_invariant."
f"{supported_names}, but got '{backend_name}'. "
"Please use --attention-backend or attention_config to set "
"one of the supported backends before enabling batch_invariant."
)
raise RuntimeError(error)
if os.environ["VLLM_ATTENTION_BACKEND"] != supported_backends[0]:
if attention_backend != supported_backends[0]:
warning = (
"You are using a decode-invariant form of batch invariance. "
"This will not be invariant between prefill and decode."
@@ -1050,10 +1053,12 @@ def override_envs_for_invariance():
os.environ["VLLM_USE_AOT_COMPILE"] = "0"


def init_batch_invariance():
def init_batch_invariance(
attention_backend: AttentionBackendEnum | None,
):
# this will hit all the csrc overrides as well
if vllm_is_batch_invariant():
override_envs_for_invariance()
override_envs_for_invariance(attention_backend)
enable_batch_invariant_mode()

# Disable TF32 for batch invariance - it causes non-deterministic rounding
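
With the environment-variable check replaced by an explicit argument, the caller passes the resolved backend enum directly. A minimal sketch of the new call shape; the call site itself is illustrative rather than the actual vLLM wiring:

from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.model_executor.layers.batch_invariant import override_envs_for_invariance

# Accepted: FLASH_ATTN heads the supported list above.
override_envs_for_invariance(AttentionBackendEnum.FLASH_ATTN)

# Would raise RuntimeError naming the supported backends, since FLASHMLA
# is still commented out in the list above.
# override_envs_for_invariance(AttentionBackendEnum.FLASHMLA)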


+ 154
- 89
vllm/model_executor/layers/quantization/fp8.py View File

@@ -332,7 +332,10 @@ class Fp8Config(QuantizationConfig):
fused_mapping=self.packed_modules_mapping,
):
return UnquantizedFusedMoEMethod(layer.moe_config)
moe_quant_method = Fp8MoEMethod(self, layer)
if self.is_checkpoint_fp8_serialized:
moe_quant_method = Fp8MoEMethod(self, layer)
else:
moe_quant_method = Fp8OnlineMoEMethod(self, layer)
moe_quant_method.marlin_input_dtype = get_marlin_input_dtype(prefix)
return moe_quant_method
elif isinstance(layer, Attention):
@@ -745,8 +748,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
layer.orig_dtype = params_dtype
layer.weight_block_size = None

if self.quant_config.is_checkpoint_fp8_serialized:
params_dtype = torch.float8_e4m3fn
assert self.quant_config.is_checkpoint_fp8_serialized
params_dtype = torch.float8_e4m3fn

if self.block_quant:
assert self.weight_block_size is not None
layer.weight_block_size = self.weight_block_size
@@ -773,41 +777,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
f"weight quantization block_k = {block_k}."
)

# if we are doing online quantization, patch the weight
# loaded to call `process_weights_after_loading` in a streaming fashion
# as soon as the last weight chunk is loaded
if not self.quant_config.is_checkpoint_fp8_serialized:
weight_loader = extra_weight_attrs["weight_loader"]
# create a new holder to prevent modifying behavior of any other
# objects which might depend on the old one
new_extra_weight_attrs = extra_weight_attrs

def patched_weight_loader(param, loaded_weight, *args, **kwargs):
# load the current weight chunk
res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc]

# add a counter to track how many elements we have updated
if not hasattr(layer, "_loaded_numel"):
layer._loaded_numel = 0
layer._loaded_numel += loaded_weight.numel()

# if we have loaded all of the elements, call
# process_weights_after_loading
target_loaded_numel = layer.w13_weight.numel() + layer.w2_weight.numel()
if layer._loaded_numel == target_loaded_numel:
self.process_weights_after_loading(layer)

# Delete the bookkeeping
del layer._loaded_numel
# Prevent the usual `process_weights_after_loading` call
# from doing anything
layer._already_called_process_weights_after_loading = True

return res

new_extra_weight_attrs["weight_loader"] = patched_weight_loader
extra_weight_attrs = new_extra_weight_attrs

# WEIGHTS
w13_weight = torch.nn.Parameter(
torch.empty(
@@ -875,21 +844,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
if self.block_quant
else {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
)
# If loading fp8 checkpoint, pass the weight loaders.
# If loading an fp16 checkpoint, do not (we will quantize in
# process_weights_after_loading()
if self.quant_config.is_checkpoint_fp8_serialized:
set_weight_attrs(w13_weight_scale, extra_weight_attrs)
set_weight_attrs(w2_weight_scale, extra_weight_attrs)
set_weight_attrs(w13_weight_scale, extra_weight_attrs)
set_weight_attrs(w2_weight_scale, extra_weight_attrs)

# INPUT_SCALES
if self.quant_config.activation_scheme == "static":
if not self.quant_config.is_checkpoint_fp8_serialized:
raise ValueError(
"Found static activation scheme for checkpoint that "
"was not serialized fp8."
)

w13_input_scale = torch.nn.Parameter(
torch.ones(num_experts, dtype=torch.float32), requires_grad=False
)
@@ -986,45 +945,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
layer.w2_weight_scale_inv = Parameter(
dg_w2_weight_scale_inv, requires_grad=False
)

# If checkpoint is fp16, quantize in place.
elif not self.quant_config.is_checkpoint_fp8_serialized:
fp8_dtype = current_platform.fp8_dtype()
w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)

# Re-initialize w13_scale because we directly quantize
# merged w13 weights and generate a single scaling factor.
replace_parameter(
layer,
"w13_weight_scale",
torch.ones(
layer.local_num_experts,
dtype=torch.float32,
device=w13_weight.device,
),
)
for expert in range(layer.local_num_experts):
w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
)
w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
)
replace_parameter(layer, "w13_weight", w13_weight)
replace_parameter(layer, "w2_weight", w2_weight)

if self.rocm_aiter_moe_enabled:
# reshaping weights is required for aiter moe kernel.
shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
layer.w13_weight, layer.w2_weight
)

replace_parameter(layer, "w13_weight", shuffled_w13)
replace_parameter(layer, "w2_weight", shuffled_w2)
# If checkpoint is fp8, we need to handle that the
# MoE kernels require single activation scale and single weight
# scale for w13 per expert.
else:
# Fp8 moe kernels require a single activation scale.
# We take the max of all the scales in case they differ.
@@ -1387,6 +1307,151 @@ class Fp8MoEMethod(FusedMoEMethodBase):
return result


class Fp8OnlineMoEMethod(Fp8MoEMethod):
"""MoE method for online FP8 quantization.
Supports loading unquantized FP16/BF16 model checkpoints and quantizing
them to FP8 at load time, with dynamic activation scaling. The weight
scaling factors are initialized after the model weights are loaded.

Args:
quant_config: The quantization config.
"""

def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
super().__init__(quant_config, layer)
assert not quant_config.is_checkpoint_fp8_serialized
assert quant_config.activation_scheme == "dynamic"
assert quant_config.weight_block_size is None
assert self.flashinfer_moe_backend is None

def create_weights(
self,
layer: Module,
num_experts: int,
hidden_size: int,
intermediate_size_per_partition: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
layer.intermediate_size_per_partition = intermediate_size_per_partition
layer.hidden_size = hidden_size
layer.num_experts = num_experts
layer.orig_dtype = params_dtype
layer.weight_block_size = None

# We are doing online quantization, so patch the weight loader
# to call `process_weights_after_loading` in a streaming fashion
# as soon as the last weight chunk is loaded.
weight_loader = extra_weight_attrs["weight_loader"]
# create a new holder to prevent modifying behavior of any other
# objects which might depend on the old one
new_extra_weight_attrs = extra_weight_attrs

def patched_weight_loader(param, loaded_weight, *args, **kwargs):
# load the current weight chunk
res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc]

# add a counter to track how many elements we have updated
if not hasattr(layer, "_loaded_numel"):
layer._loaded_numel = 0
layer._loaded_numel += loaded_weight.numel()

# if we have loaded all of the elements, call
# process_weights_after_loading
target_loaded_numel = layer.w13_weight.numel() + layer.w2_weight.numel()
if layer._loaded_numel == target_loaded_numel:
self.process_weights_after_loading(layer)

# Delete the bookkeeping
del layer._loaded_numel
# Prevent the usual `process_weights_after_loading` call
# from doing anything
layer._already_called_process_weights_after_loading = True

return res

new_extra_weight_attrs["weight_loader"] = patched_weight_loader
extra_weight_attrs = new_extra_weight_attrs

# WEIGHTS
w13_weight = torch.nn.Parameter(
torch.empty(
num_experts,
2 * intermediate_size_per_partition,
hidden_size,
dtype=params_dtype,
),
requires_grad=False,
)
layer.register_parameter("w13_weight", w13_weight)
set_weight_attrs(w13_weight, extra_weight_attrs)

w2_weight = torch.nn.Parameter(
torch.empty(
num_experts,
hidden_size,
intermediate_size_per_partition,
dtype=params_dtype,
),
requires_grad=False,
)
layer.register_parameter("w2_weight", w2_weight)
set_weight_attrs(w2_weight, extra_weight_attrs)

# WEIGHT_SCALES
# Allocate 2 scales for w1 and w3 respectively.
# They will be combined to a single scale after weight loading.
w13_weight_scale = torch.nn.Parameter(
torch.ones(num_experts, dtype=torch.float32), requires_grad=False
)
w2_weight_scale = torch.nn.Parameter(
torch.ones(num_experts, dtype=torch.float32), requires_grad=False
)
layer.register_parameter("w13_weight_scale", w13_weight_scale)
layer.register_parameter("w2_weight_scale", w2_weight_scale)

layer.w13_input_scale = None
layer.w2_input_scale = None

self.rocm_aiter_moe_enabled = False

def process_weights_after_loading(self, layer: Module) -> None:
if getattr(layer, "_already_called_process_weights_after_loading", False):
return

# Lazy import to avoid importing triton too early.
self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()

# The checkpoint is fp16/bf16; quantize the weights to fp8 in place.
fp8_dtype = current_platform.fp8_dtype()
w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)

for expert in range(layer.local_num_experts):
w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
)
w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
)
replace_parameter(layer, "w13_weight", w13_weight)
replace_parameter(layer, "w2_weight", w2_weight)

# Reshuffle weights for AITER if needed.
if self.rocm_aiter_moe_enabled:
shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
layer.w13_weight, layer.w2_weight
)
replace_parameter(layer, "w13_weight", shuffled_w13)
replace_parameter(layer, "w2_weight", shuffled_w2)

# Reshuffle weights for MARLIN if needed.
if self.use_marlin:
prepare_moe_fp8_layer_for_marlin(
layer, False, input_dtype=self.marlin_input_dtype
)


class Fp8KVCacheMethod(BaseKVCacheMethod):
"""
Supports loading kv-cache scaling factors from FP8 checkpoints.
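
The streaming trick in Fp8OnlineMoEMethod above (wrap the weight loader, count loaded elements, and finalize as soon as the last chunk arrives) can be shown in isolation. A minimal sketch of the pattern; the helper names here are illustrative and not vLLM APIs:

import torch

class FakeLayer:
    def __init__(self):
        self.w13_weight = torch.empty(4, 8)
        self.w2_weight = torch.empty(4, 8)

def make_streaming_loader(layer, base_loader, finalize):
    # Wrap base_loader so finalize(layer) runs right after the last chunk.
    def loader(param, chunk):
        base_loader(param, chunk)
        layer._loaded = getattr(layer, "_loaded", 0) + chunk.numel()
        if layer._loaded == layer.w13_weight.numel() + layer.w2_weight.numel():
            finalize(layer)
            del layer._loaded
    return loader

layer = FakeLayer()
loader = make_streaming_loader(
    layer,
    base_loader=lambda p, c: p.copy_(c),
    finalize=lambda l: print("all chunks loaded: quantize in place now"),
)
loader(layer.w13_weight, torch.randn(4, 8))
loader(layer.w2_weight, torch.randn(4, 8))  # last chunk triggers finalize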


+ 12
- 0
vllm/model_executor/models/adapters.py View File

@@ -337,6 +337,18 @@ def as_seq_cls_model(cls: _T) -> _T:
tokens = getattr(text_config, "classifier_from_token", None)
method = getattr(text_config, "method", None)

def auto_set_score_bias(weights):
for name, weight in weights:
if name == "score.bias":
device = self.score.weight.device
dtype = self.score.weight.dtype
bias = weight.to(device).to(dtype)
self.score.bias = torch.nn.Parameter(bias)
self.score.skip_bias_add = False
else:
yield name, weight

weights = auto_set_score_bias(weights)
if tokens is None and method is None:
return super().load_weights(weights)
else:
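
auto_set_score_bias above is a generator that consumes one named weight (setting the bias directly on the layer) and forwards every other weight to the normal loading path. A stand-alone sketch of that filter-and-forward pattern; the names are illustrative:

import torch

def intercept_weight(weights, target_name, on_match):
    # Yield every (name, tensor) pair except target_name, which is consumed.
    for name, tensor in weights:
        if name == target_name:
            on_match(tensor)
        else:
            yield name, tensor

raw = [("score.bias", torch.zeros(2)), ("score.weight", torch.ones(2, 4))]
kept = list(intercept_weight(raw, "score.bias",
                             lambda t: print("bias handled:", tuple(t.shape))))
# kept now contains only ("score.weight", ...); the bias was handled out of band.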


+ 584
- 0
vllm/model_executor/models/bagel.py View File

@@ -0,0 +1,584 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
"""Inference-only BAGEL model compatible with HuggingFace weights.

BAGEL is a unified multimodal model for image understanding and generation.
For vLLM, we focus on the image understanding (vision-to-text) capabilities.
"""

from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Literal, TypeAlias

import torch
import torch.nn as nn

from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
RowParallelLinear,
)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
PromptReplacement,
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.bagel import BagelProcessor
from vllm.utils.tensor_schema import TensorSchema

from .interfaces import (
MultiModalEmbeddings,
SupportsLoRA,
SupportsMultiModal,
SupportsPP,
)
from .siglip import SiglipVisionModel
from .utils import (
AutoWeightsLoader,
WeightsMapper,
init_vllm_registered_model,
maybe_prefix,
)

logger = init_logger(__name__)


class BagelImagePixelInputs(TensorSchema):
"""
Dimensions:
- bn: Batch size * number of images
- c: Number of channels (3)
- h: Height of each image
- w: Width of each image
"""

type: Literal["pixel_values"]
pixel_values: torch.Tensor # Shape: (bn, 3, h, w)


BagelImageInputs: TypeAlias = BagelImagePixelInputs


class BagelVisionMLP(nn.Module):
"""MLP connector for vision features."""

def __init__(
self,
in_features: int,
hidden_features: int,
out_features: int,
act_layer: str = "gelu_pytorch_tanh",
quant_config: QuantizationConfig | None = None,
prefix: str = "",
):
super().__init__()
self.fc1 = ColumnParallelLinear(
in_features,
hidden_features,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.fc1",
)
self.act = get_act_fn(act_layer)
self.fc2 = RowParallelLinear(
hidden_features,
out_features,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.fc2",
)

def forward(self, x: torch.Tensor) -> torch.Tensor:
x, _ = self.fc1(x)
x = self.act(x)
x, _ = self.fc2(x)
return x


class PositionEmbedding(nn.Module):
"""2D position embedding for vision tokens using sin-cos embeddings."""

def __init__(self, max_num_patch_per_side: int, hidden_size: int):
super().__init__()
self.max_num_patch_per_side = max_num_patch_per_side
self.hidden_size = hidden_size

# Create fixed 2D sin-cos position embeddings (non-trainable buffer)
pos_embed = self._get_2d_sincos_pos_embed(hidden_size, max_num_patch_per_side)
self.register_buffer(
"pos_embed",
torch.from_numpy(pos_embed).float(),
persistent=False,
)

@staticmethod
def _get_2d_sincos_pos_embed(embed_dim: int, grid_size: int):
"""Generate 2D sin-cos position embeddings."""
import numpy as np

grid_h = np.arange(grid_size, dtype=np.float32)
grid_w = np.arange(grid_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # w goes first
grid = np.stack(grid, axis=0)
grid = grid.reshape([2, 1, grid_size, grid_size])
pos_embed = PositionEmbedding._get_2d_sincos_pos_embed_from_grid(
embed_dim, grid
)
return pos_embed

@staticmethod
def _get_2d_sincos_pos_embed_from_grid(embed_dim: int, grid):
"""Generate 2D sin-cos position embeddings from grid."""
import numpy as np

assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = PositionEmbedding._get_1d_sincos_pos_embed_from_grid(
embed_dim // 2, grid[0]
)
emb_w = PositionEmbedding._get_1d_sincos_pos_embed_from_grid(
embed_dim // 2, grid[1]
)
emb = np.concatenate([emb_h, emb_w], axis=1)
return emb

@staticmethod
def _get_1d_sincos_pos_embed_from_grid(embed_dim: int, pos):
"""Generate 1D sin-cos position embeddings."""
import numpy as np

assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float64)
omega /= embed_dim / 2.0
omega = 1.0 / 10000**omega

pos = pos.reshape(-1)
out = np.einsum("m,d->md", pos, omega)

emb_sin = np.sin(out)
emb_cos = np.cos(out)
emb = np.concatenate([emb_sin, emb_cos], axis=1)
return emb

def forward(self, position_ids: torch.Tensor) -> torch.Tensor:
"""
Args:
position_ids: Flattened position IDs, shape (N,) where each ID
corresponds to a position in the flattened grid
Returns:
Position embeddings of shape (N, hidden_size)
"""
# Ensure position_ids are on the same device as pos_embed
position_ids = position_ids.to(self.pos_embed.device)
return self.pos_embed[position_ids]


class BagelProcessingInfo(BaseProcessingInfo):
"""Processing information for BAGEL model."""

def get_hf_processor(self, **kwargs: object) -> BagelProcessor:
from vllm.transformers_utils.processor import cached_get_image_processor

image_processor = cached_get_image_processor(
self.ctx.model_config.model,
revision=self.ctx.model_config.revision,
trust_remote_code=self.ctx.model_config.trust_remote_code,
)

tokenizer = self.get_tokenizer()

return BagelProcessor(
image_processor=image_processor,
tokenizer=tokenizer,
**kwargs,
)

def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"image": None}

def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
hf_config = self.get_hf_config()
# Calculate max tokens per image
# For BAGEL: (vit_max_num_patch_per_side) ** 2
max_num_patches = hf_config.vit_max_num_patch_per_side**2
return {"image": max_num_patches}

def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
hf_config = self.get_hf_config()
vit_config = hf_config.vit_config
patch_size = vit_config.patch_size

# Calculate number of patches
num_patches_h = image_height // patch_size
num_patches_w = image_width // patch_size
return num_patches_h * num_patches_w


class BagelDummyInputsBuilder(BaseDummyInputsBuilder[BagelProcessingInfo]):
"""Build dummy inputs for BAGEL model profiling."""

def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0)
# Use a simple placeholder for each image
return "<|image_pad|>" * num_images

def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
hf_config = self.info.get_hf_config()
vit_config = hf_config.vit_config

# Use the configured image size
image_size = vit_config.image_size
image_overrides = mm_options.get("image") if mm_options else None

return {
"image": self._get_dummy_images(
width=image_size,
height=image_size,
num_images=num_images,
overrides=image_overrides,
),
}


class BagelMultiModalProcessor(BaseMultiModalProcessor[BagelProcessingInfo]):
"""Multimodal processor for BAGEL model."""

def _hf_processor_applies_updates(
self,
prompt_text: str,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
) -> bool:
return False

def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptReplacement]:
"""Replace image placeholders with the correct number of tokens."""
hf_config = self.info.get_hf_config()

# Get the tokenizer to look up the image token ID
tokenizer = self.info.get_tokenizer()
image_token_id = tokenizer.get_vocab().get("<|image_pad|>")
if image_token_id is None:
raise ValueError(
"Image token '<|image_pad|>' not found in tokenizer vocabulary"
)

def get_replacement_bagel(item_idx: int):
# For BAGEL, calculate number of tokens based on max patch size
num_tokens = hf_config.vit_max_num_patch_per_side**2
# Use the image token ID from tokenizer
return [image_token_id] * num_tokens

return [
PromptReplacement(
modality="image",
target=[image_token_id],
replacement=get_replacement_bagel,
)
]

def _get_mm_fields_config(
self,
hf_inputs: Any,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return {
"pixel_values": MultiModalFieldConfig.batched("image"),
}


@MULTIMODAL_REGISTRY.register_processor(
BagelMultiModalProcessor,
info=BagelProcessingInfo,
dummy_inputs=BagelDummyInputsBuilder,
)
class BagelForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP
):
"""
BAGEL: A unified multimodal model for image understanding and generation.

For vLLM, we focus on the image understanding (vision-to-text) capabilities.
The image generation part is not supported in vLLM.
"""

# Weight mapping from HF to vLLM
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
"language_model.": "language_model.",
"vit_model.": "vit_model.",
"connector.": "connector.",
"vit_pos_embed.": "vit_pos_embed.",
}
)

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()

config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
multimodal_config = vllm_config.model_config.multimodal_config

# Ensure we have a BagelConfig (check by name to handle trust_remote_code)
# When trust_remote_code=True, the config comes from transformers_modules
if type(config).__name__ != "BagelConfig":
raise ValueError(
f"Expected BagelConfig, got {type(config).__name__}. "
"Make sure the model config is properly loaded."
)

self.config = config
self.multimodal_config = multimodal_config

# Initialize language model (Qwen2)
# Pass the llm_config from BagelConfig to initialize Qwen2 properly
self.language_model = init_vllm_registered_model(
vllm_config=vllm_config,
hf_config=config.llm_config,
prefix=maybe_prefix(prefix, "language_model"),
architectures=["Qwen2ForCausalLM"],
)

# Initialize vision model (SigLIP) if visual understanding is enabled
if config.visual_und:
# Fix vit_config: checkpoint has 26 layers (0-25) but config says 27
# Also disable head as it's not in checkpoint
vit_config = config.vit_config
if vit_config.num_hidden_layers == 27:
logger.warning(
"Overriding vit_config.num_hidden_layers from 27 to 26 "
"to match the Bagel model checkpoint."
)
vit_config.num_hidden_layers = 26
if not hasattr(vit_config, "vision_use_head"):
logger.warning(
"Setting vit_config.vision_use_head to False as it is not "
"present in the Bagel model checkpoint."
)
vit_config.vision_use_head = False

self.vit_model = SiglipVisionModel(
config=vit_config,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "vit_model"),
)

# Initialize connector (MLP)
vit_hidden_size = config.vit_config.hidden_size
llm_hidden_size = config.llm_config.hidden_size

self.connector = BagelVisionMLP(
in_features=vit_hidden_size,
hidden_features=llm_hidden_size,
out_features=llm_hidden_size,
act_layer=config.connector_act,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "connector"),
)

# Position embedding for vision tokens
self.vit_pos_embed = PositionEmbedding(
max_num_patch_per_side=config.vit_max_num_patch_per_side,
hidden_size=llm_hidden_size,
)
else:
self.vit_model = None
self.connector = None
self.vit_pos_embed = None

self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors
)

def _parse_and_validate_image_input(
self, **kwargs: object
) -> BagelImageInputs | None:
pixel_values = kwargs.pop("pixel_values", None)

if pixel_values is None:
return None

return BagelImagePixelInputs(
type="pixel_values",
pixel_values=pixel_values,
)

def _process_image_input(
self, image_input: BagelImageInputs
) -> tuple[torch.Tensor, ...]:
"""Process image inputs through vision encoder and connector."""
pixel_values = image_input["pixel_values"]

# Handle potential extra batch dimension
# Expected shape: (batch_size * num_images, 3, H, W)
# But might receive: (batch_size, num_images, 3, H, W)
if pixel_values.ndim == 5:
# Flatten batch and num_images dimensions
batch_size, num_images, channels, height, width = pixel_values.shape
pixel_values = pixel_values.reshape(
batch_size * num_images, channels, height, width
)

# Get vision features from SigLIP
# pixel_values shape: (batch_size * num_images, 3, H, W)
vision_features = self.vit_model(pixel_values)

# Pass through connector
vision_embeds = self.connector(vision_features)

# Add position embeddings
batch_size, num_patches, hidden_size = vision_embeds.shape
patch_size = self.config.vit_config.patch_size
image_size = self.config.vit_config.image_size

# Calculate grid dimensions
num_patches_per_side = image_size // patch_size

# Create flattened position IDs (0 to num_patches-1)
# For BAGEL, we use extrapolate mode by default
h_coords = torch.arange(num_patches_per_side, device=vision_embeds.device)
w_coords = torch.arange(num_patches_per_side, device=vision_embeds.device)
position_ids = (
h_coords[:, None] * self.config.vit_max_num_patch_per_side + w_coords
).flatten()
position_ids = position_ids.unsqueeze(0).expand(batch_size, -1).flatten()

# Add position embeddings
pos_embeds = self.vit_pos_embed(position_ids)
pos_embeds = pos_embeds.reshape(batch_size, num_patches, hidden_size)
# Ensure pos_embeds are on the same device as vision_embeds
pos_embeds = pos_embeds.to(vision_embeds.device)
vision_embeds = vision_embeds + pos_embeds

# Split by image
return tuple(vision_embeds)

def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
"""Get multimodal embeddings from input."""
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return []

return self._process_image_input(image_input)

def get_language_model(self) -> nn.Module:
return self.language_model

def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
**kwargs: object,
) -> torch.Tensor | IntermediateTensors:
"""Run forward pass for BAGEL.

Args:
input_ids: Flattened (concatenated) input_ids corresponding to a batch.
positions: Flattened (concatenated) position ids corresponding to a batch.
intermediate_tensors: Intermediate tensors from prior forward pass.
inputs_embeds: Optional tensor of input embeddings.
"""
if intermediate_tensors is not None:
inputs_embeds = None

hidden_states = self.language_model.model(
input_ids=input_ids,
positions=positions,
intermediate_tensors=intermediate_tensors,
inputs_embeds=inputs_embeds,
)
return hidden_states

def compute_logits(
self,
hidden_states: torch.Tensor,
) -> torch.Tensor | None:
return self.language_model.compute_logits(hidden_states)

def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
"""Load weights from checkpoint."""
skip_prefixes = []
# Skip vit_pos_embed.pos_embed as it's handled by PositionEmbedding module
skip_prefixes.append("vit_pos_embed.pos_embed")

# If visual understanding is disabled, skip vision-related weights
if self.vit_model is None:
skip_prefixes.extend(["vit_model.", "connector.", "vit_pos_embed"])

# Skip generation-related weights since we only support text2text and image2text
# Filter out all image generation components:
# - 'moe_gen': MoE generation weights
# - 'latent_pos_embed': Latent position embeddings for VAE
# - 'llm2vae', 'vae2llm': LLM-VAE projections
# - 'time_embedder': Timestep embeddings for diffusion
# - VAE encoder/decoder: Use specific prefixes to avoid matching vision encoder
generation_keywords = [
"moe_gen",
"latent_pos_embed",
"llm2vae",
"vae2llm",
"time_embedder",
]
vae_prefixes = [
"decoder.",
"encoder.",
] # VAE encoder/decoder, not vision encoder
filtered_weights = []
for name, tensor in weights:
# Skip generation-related keywords
if any(skip in name for skip in generation_keywords):
continue
if any(name.startswith(prefix) for prefix in vae_prefixes):
continue

if "patch_embedding.weight" in name and tensor.ndim == 2:
out_channels = tensor.shape[0]
in_features = tensor.shape[1]
patch_size = self.config.vit_config.patch_size
in_channels = self.config.vit_config.num_channels
if in_features == in_channels * patch_size * patch_size:
tensor = tensor.reshape(
out_channels, patch_size, patch_size, in_channels
)
tensor = tensor.permute(0, 3, 1, 2).contiguous()

filtered_weights.append((name, tensor))

loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
return loader.load_weights(filtered_weights, mapper=self.hf_to_vllm_mapper)
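
The position embedding above uses the standard 1D sin-cos construction, omega_d = 1 / 10000**(2d/D), with sine and cosine halves concatenated; the 2D version stacks one such embedding per grid axis. A small numpy check mirroring _get_1d_sincos_pos_embed_from_grid (the grid size is arbitrary, chosen only for illustration):

import numpy as np

def sincos_1d(embed_dim, pos):
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega = 1.0 / 10000 ** (omega / (embed_dim / 2.0))
    out = np.einsum("m,d->md", pos.reshape(-1), omega)
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)

pos = np.arange(16, dtype=np.float32)   # 16 flattened grid positions
emb = sincos_1d(64, pos)
print(emb.shape)  # (16, 64): half of the dims encode sin, half cos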

+ 3
- 0
vllm/model_executor/models/qwen.py View File

@@ -281,6 +281,9 @@ class QWenBaseModel(nn.Module):
self.transformer.make_empty_intermediate_tensors
)

def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.transformer.wte(input_ids)

def compute_logits(
self,
hidden_states: torch.Tensor,


+ 32
- 0
vllm/model_executor/models/qwen2.py View File

@@ -122,6 +122,8 @@ class Qwen2Attention(nn.Module):
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None,
qk_norm: bool = False,
rms_norm_eps: float = 1e-6,
) -> None:
super().__init__()
self.hidden_size = hidden_size
@@ -144,6 +146,7 @@ class Qwen2Attention(nn.Module):
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.dual_chunk_attention_config = dual_chunk_attention_config
self.qk_norm = qk_norm

self.qkv_proj = QKVParallelLinear(
hidden_size,
@@ -162,6 +165,11 @@ class Qwen2Attention(nn.Module):
prefix=f"{prefix}.o_proj",
)

# QK Normalization support (used in BAGEL and some other models)
if self.qk_norm:
self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)

self.rotary_emb = get_rope(
self.head_dim,
max_position=max_position,
@@ -197,6 +205,23 @@ class Qwen2Attention(nn.Module):
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)

# Apply QK normalization if enabled (before RoPE)
if self.qk_norm:
# Reshape to apply per-head normalization
# q shape: (total_tokens, q_size) -> (total_tokens, num_heads, head_dim)
total_tokens = q.shape[0]
q = q.view(total_tokens, self.num_heads, self.head_dim)
k = k.view(total_tokens, self.num_kv_heads, self.head_dim)

# Apply normalization
q = self.q_norm(q)
k = self.k_norm(k)

# Reshape back
q = q.view(total_tokens, self.q_size)
k = k.view(total_tokens, self.kv_size)

q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)
@@ -227,6 +252,9 @@ class Qwen2DecoderLayer(nn.Module):
else:
attn_type = AttentionType.ENCODER_ONLY

# Check if QK normalization is enabled (used in BAGEL and some other models)
qk_norm = getattr(config, "qk_norm", False)

self.self_attn = Qwen2Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
@@ -238,6 +266,8 @@ class Qwen2DecoderLayer(nn.Module):
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,
qk_norm=qk_norm,
rms_norm_eps=config.rms_norm_eps,
)
self.mlp = Qwen2MLP(
hidden_size=self.hidden_size,
@@ -480,6 +510,8 @@ class Qwen2Model(nn.Module):
continue
if is_pp_missing_parameter(name, self):
continue
if name not in params_dict:
continue
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
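
The QK-normalization added above applies RMSNorm per attention head to q and k before RoPE, by viewing the packed projection as (tokens, heads, head_dim). A minimal stand-alone sketch of that reshape-normalize-reshape step, using torch.nn.RMSNorm in place of vLLM's RMSNorm and illustrative dimensions:

import torch
from torch import nn

tokens, num_heads, head_dim = 5, 4, 32
q = torch.randn(tokens, num_heads * head_dim)   # packed q, as produced by qkv_proj
q_norm = nn.RMSNorm(head_dim, eps=1e-6)

q = q.view(tokens, num_heads, head_dim)   # expose the per-head axis
q = q_norm(q)                             # RMSNorm over the last (head_dim) axis
q = q.view(tokens, num_heads * head_dim)  # back to the packed layout expected by RoPE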


+ 5
- 0
vllm/model_executor/models/qwen3_vl_moe.py View File

@@ -418,6 +418,11 @@ class Qwen3VLMoeForConditionalGeneration(

self.config = config
self.multimodal_config = multimodal_config
self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
self.video_pruning_rate = multimodal_config.video_pruning_rate
self.is_multimodal_pruning_enabled = (
multimodal_config.is_multimodal_pruning_enabled()
)

if not multimodal_config.get_limit_per_prompt(
"image"


+ 1
- 0
vllm/model_executor/models/registry.py View File

@@ -272,6 +272,7 @@ _MULTIMODAL_MODELS = {
"aya_vision",
"AyaVisionForConditionalGeneration",
),
"BagelForConditionalGeneration": ("bagel", "BagelForConditionalGeneration"),
"BeeForConditionalGeneration": ("bee", "BeeForConditionalGeneration"),
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration": (


+ 4
- 11
vllm/platforms/cpu.py View File

@@ -23,6 +23,7 @@ from .interface import CpuArchEnum, Platform, PlatformEnum
logger = init_logger(__name__)

if TYPE_CHECKING:
from vllm.attention.selector import AttentionSelectorConfig
from vllm.config import VllmConfig
else:
VllmConfig = None
@@ -126,21 +127,13 @@ class CpuPlatform(Platform):
def get_attn_backend_cls(
cls,
selected_backend: "AttentionBackendEnum",
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: str | None,
block_size: int,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
use_mm_prefix: bool,
attn_type: str | None = None,
attn_selector_config: "AttentionSelectorConfig",
) -> str:
if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN:
logger.info("Cannot use %s backend on CPU.", selected_backend)
if use_mla:
if attn_selector_config.use_mla:
raise NotImplementedError("MLA is not supported on CPU.")
if use_sparse:
if attn_selector_config.use_sparse:
raise NotImplementedError("Sparse Attention is not supported on CPU.")
return AttentionBackendEnum.CPU_ATTN.get_path()



+ 15
- 59
vllm/platforms/cuda.py View File

@@ -14,7 +14,6 @@ from typing_extensions import ParamSpec

# import custom ops, trigger op registration
import vllm._C # noqa
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.logger import init_logger
from vllm.utils.import_utils import import_pynvml
@@ -23,6 +22,7 @@ from vllm.utils.torch_utils import cuda_device_count_stateless
from .interface import DeviceCapability, Platform, PlatformEnum

if TYPE_CHECKING:
from vllm.attention.selector import AttentionSelectorConfig
from vllm.config import VllmConfig
from vllm.config.cache import CacheDType
else:
@@ -258,16 +258,8 @@ class CudaPlatformBase(Platform):
@classmethod
def get_valid_backends(
cls,
head_size,
dtype,
kv_cache_dtype,
block_size,
use_mla,
has_sink,
use_sparse,
use_mm_prefix,
device_capability,
attn_type,
device_capability: DeviceCapability,
attn_selector_config: "AttentionSelectorConfig",
) -> tuple[
list[tuple["AttentionBackendEnum", int]],
dict["AttentionBackendEnum", list[str]],
@@ -275,21 +267,15 @@ class CudaPlatformBase(Platform):
valid_backends_priorities = []
invalid_reasons = {}

backend_priorities = _get_backend_priorities(use_mla, device_capability)
backend_priorities = _get_backend_priorities(
attn_selector_config.use_mla, device_capability
)
for priority, backend in enumerate(backend_priorities):
try:
backend_class = backend.get_class()
invalid_reasons_i = backend_class.validate_configuration(
head_size,
dtype,
kv_cache_dtype,
block_size,
use_mla,
has_sink,
use_sparse,
use_mm_prefix,
device_capability,
attn_type,
device_capability=device_capability,
**attn_selector_config._asdict(),
)
except ImportError:
invalid_reasons_i = ["ImportError"]
@@ -304,37 +290,19 @@ class CudaPlatformBase(Platform):
def get_attn_backend_cls(
cls,
selected_backend: "AttentionBackendEnum",
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None",
block_size: int | None,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
use_mm_prefix: bool,
attn_type: str | None = None,
attn_selector_config: "AttentionSelectorConfig",
) -> str:
if attn_type is None:
attn_type = AttentionType.DECODER

device_capability = cls.get_device_capability()
assert device_capability is not None

attn_selector_config = attn_selector_config._replace(block_size=None)
# First try checking just the selected backend, if there is one.
if selected_backend is not None:
try:
backend_class = selected_backend.get_class()
invalid_reasons = backend_class.validate_configuration(
head_size,
dtype,
kv_cache_dtype,
None,
use_mla,
has_sink,
use_sparse,
use_mm_prefix,
device_capability,
attn_type,
device_capability=device_capability,
**attn_selector_config._asdict(),
)
except ImportError:
invalid_reasons = ["ImportError"]
@@ -350,16 +318,8 @@ class CudaPlatformBase(Platform):
# No selected backend or the selected backend is invalid,
# so we try finding a valid backend.
valid_backends_priorities, invalid_reasons = cls.get_valid_backends(
head_size,
dtype,
kv_cache_dtype,
None,
use_mla,
has_sink,
use_sparse,
use_mm_prefix,
device_capability,
attn_type,
device_capability=device_capability,
attn_selector_config=attn_selector_config,
)
reasons_str = (
"{"
@@ -369,11 +329,7 @@ class CudaPlatformBase(Platform):
)
+ "}"
)
config_str = (
f"head_size: {head_size}, dtype: {dtype}, "
f"kv_cache_dtype: {kv_cache_dtype}, block_size: {block_size}, "
f"use_mla: {use_mla}, has_sink: {has_sink}, use_sparse: {use_sparse}"
)
config_str = attn_selector_config.__repr__()
logger.debug_once(
f"Some attention backends are not valid for {cls.device_name} with "
f"{config_str}. Reasons: {reasons_str}."


+ 2
- 10
vllm/platforms/interface.py View File

@@ -18,8 +18,8 @@ from vllm.logger import init_logger
if TYPE_CHECKING:
from torch.distributed import PrefixStore, ProcessGroup

from vllm.attention.selector import AttentionSelectorConfig
from vllm.config import VllmConfig
from vllm.config.cache import CacheDType
from vllm.inputs import ProcessorInputs, PromptType
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
@@ -226,15 +226,7 @@ class Platform:
def get_attn_backend_cls(
cls,
selected_backend: "AttentionBackendEnum",
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None",
block_size: int,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
use_mm_prefix: bool,
attn_type: str | None = None,
attn_selector_config: "AttentionSelectorConfig",
) -> str:
"""Get the attention backend class of a device."""
return ""
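
Collapsing the long positional parameter list into a single AttentionSelectorConfig means an out-of-tree platform only overrides this two-argument signature. A hedged sketch of such an override; the platform class and returned backend path are hypothetical:

from typing import TYPE_CHECKING

from vllm.platforms.interface import Platform

if TYPE_CHECKING:
    from vllm.attention.backends.registry import AttentionBackendEnum
    from vllm.attention.selector import AttentionSelectorConfig

class MyAcceleratorPlatform(Platform):  # hypothetical out-of-tree platform
    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend: "AttentionBackendEnum",
        attn_selector_config: "AttentionSelectorConfig",
    ) -> str:
        if attn_selector_config.use_mla:
            raise NotImplementedError("MLA is not supported on this device.")
        # Return the import path of the backend class, as the in-tree
        # platforms above do.
        return "my_plugin.attention.MyAttentionBackend"  # hypothetical path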


+ 9
- 13
vllm/platforms/rocm.py View File

@@ -15,6 +15,7 @@ from vllm.utils.torch_utils import cuda_device_count_stateless
from .interface import DeviceCapability, Platform, PlatformEnum

if TYPE_CHECKING:
from vllm.attention.selector import AttentionSelectorConfig
from vllm.config import VllmConfig

logger = init_logger(__name__)
@@ -190,21 +191,16 @@ class RocmPlatform(Platform):
@classmethod
def get_attn_backend_cls(
cls,
selected_backend,
head_size,
dtype,
kv_cache_dtype,
block_size,
use_mla,
has_sink,
use_sparse,
use_mm_prefix,
attn_type: str | None = None,
selected_backend: "AttentionBackendEnum",
attn_selector_config: "AttentionSelectorConfig",
) -> str:
from vllm._aiter_ops import rocm_aiter_ops

if use_sparse:
if kv_cache_dtype.startswith("fp8"):
block_size = attn_selector_config.block_size
kv_cache_dtype = attn_selector_config.kv_cache_dtype

if attn_selector_config.use_sparse:
if kv_cache_dtype and kv_cache_dtype.startswith("fp8"):
raise ValueError(
"ROCMAiterMLASparseBackend doesn't support fp8 kv_cache_dtype."
)
@@ -214,7 +210,7 @@ class RocmPlatform(Platform):
logger.info_once("Using Sparse MLA backend on V1 engine.")
return AttentionBackendEnum.ROCM_AITER_MLA_SPARSE.get_path()

if use_mla:
if attn_selector_config.use_mla:
if selected_backend is None:
selected_backend = (
AttentionBackendEnum.ROCM_AITER_MLA


+ 3
- 10
vllm/platforms/tpu.py View File

@@ -16,6 +16,7 @@ from .interface import Platform, PlatformEnum
if TYPE_CHECKING:
from typing import TypeAlias

from vllm.attention.selector import AttentionSelectorConfig
from vllm.config import VllmConfig
from vllm.config.cache import BlockSize
from vllm.pooling_params import PoolingParams
@@ -57,17 +58,9 @@ class TpuPlatform(Platform):
def get_attn_backend_cls(
cls,
selected_backend: "AttentionBackendEnum",
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: str | None,
block_size: int,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
use_mm_prefix: bool,
attn_type: str | None = None,
attn_selector_config: "AttentionSelectorConfig",
) -> str:
if use_sparse:
if attn_selector_config.use_sparse:
raise NotImplementedError("Sparse Attention is not supported on TPU.")
if selected_backend != AttentionBackendEnum.PALLAS:
logger.info("Cannot use %s backend on TPU.", selected_backend)


+ 4
- 11
vllm/platforms/xpu.py View File

@@ -14,6 +14,7 @@ from vllm.logger import init_logger
from .interface import DeviceCapability, Platform, PlatformEnum

if TYPE_CHECKING:
from vllm.attention.selector import AttentionSelectorConfig
from vllm.config import VllmConfig
else:
VllmConfig = None
@@ -42,15 +43,7 @@ class XPUPlatform(Platform):
def get_attn_backend_cls(
cls,
selected_backend: "AttentionBackendEnum",
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: str | None,
block_size: int,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
use_mm_prefix: bool,
attn_type: str | None = None,
attn_selector_config: "AttentionSelectorConfig",
) -> str:
from vllm.v1.attention.backends.utils import set_kv_cache_layout

@@ -60,7 +53,7 @@ class XPUPlatform(Platform):
"only NHD layout is supported by XPU attention kernels."
)

if use_sparse:
if attn_selector_config.use_sparse:
raise NotImplementedError("Sparse Attention is not supported on XPU.")
if selected_backend == AttentionBackendEnum.TRITON_ATTN:
logger.info_once("Using Triton backend.")
@@ -71,7 +64,7 @@ class XPUPlatform(Platform):
elif selected_backend:
raise ValueError(
f"Invalid attention backend for {cls.device_name}, "
f"with use_mla: {use_mla}"
f"with use_mla: {attn_selector_config.use_mla}"
)

logger.info("Using Flash Attention backend.")


+ 150
- 0
vllm/tool_parsers/__init__.py View File

@@ -0,0 +1,150 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
ToolParserManager,
)

__all__ = ["ToolParser", "ToolParserManager"]


"""
Register a lazy module mapping.

Example:
ToolParserManager.register_lazy_module(
name="kimi_k2",
module_path="vllm.tool_parsers.kimi_k2_parser",
class_name="KimiK2ToolParser",
)
"""


_TOOL_PARSERS_TO_REGISTER = {
"deepseek_v3": ( # name
"deepseekv3_tool_parser", # filename
"DeepSeekV3ToolParser", # class_name
),
"deepseek_v31": (
"deepseekv31_tool_parser",
"DeepSeekV31ToolParser",
),
"deepseek_v32": (
"deepseekv32_tool_parser",
"DeepSeekV32ToolParser",
),
"ernie45": (
"ernie45_tool_parser",
"Ernie45ToolParser",
),
"glm45": (
"glm4_moe_tool_parser",
"Glm4MoeModelToolParser",
),
"granite-20b-fc": (
"granite_20b_fc_tool_parser",
"Granite20bFCToolParser",
),
"granite": (
"granite_tool_parser",
"GraniteToolParser",
),
"hermes": (
"hermes_tool_parser",
"Hermes2ProToolParser",
),
"hunyuan_a13b": (
"hunyuan_a13b_tool_parser",
"HunyuanA13BToolParser",
),
"internlm": (
"internlm2_tool_parser",
"Internlm2ToolParser",
),
"jamba": (
"jamba_tool_parser",
"JambaToolParser",
),
"kimi_k2": (
"kimi_k2_tool_parser",
"KimiK2ToolParser",
),
"llama3_json": (
"llama_tool_parser",
"Llama3JsonToolParser",
),
"llama4_json": (
"llama_tool_parser",
"Llama3JsonToolParser",
),
"llama4_pythonic": (
"llama4_pythonic_tool_parser",
"Llama4PythonicToolParser",
),
"longcat": (
"longcat_tool_parser",
"LongcatFlashToolParser",
),
"minimax_m2": (
"minimax_m2_tool_parser",
"MinimaxM2ToolParser",
),
"minimax": (
"minimax_tool_parser",
"MinimaxToolParser",
),
"mistral": (
"mistral_tool_parser",
"MistralToolParser",
),
"olmo3": (
"olmo3_tool_parser",
"Olmo3PythonicToolParser",
),
"openai": (
"openai_tool_parser",
"OpenAIToolParser",
),
"phi4_mini_json": (
"phi4mini_tool_parser",
"Phi4MiniJsonToolParser",
),
"pythonic": (
"pythonic_tool_parser",
"PythonicToolParser",
),
"qwen3_coder": (
"qwen3coder_tool_parser",
"Qwen3CoderToolParser",
),
"qwen3_xml": (
"qwen3xml_tool_parser",
"Qwen3XMLToolParser",
),
"seed_oss": (
"seed_oss_tool_parser",
"SeedOssToolParser",
),
"step3": (
"step3_tool_parser",
"Step3ToolParser",
),
"xlam": (
"xlam_tool_parser",
"xLAMToolParser",
),
"gigachat3": (
"gigachat3_tool_parser",
"GigaChat3ToolParser",
),
}


def register_lazy_tool_parsers():
for name, (file_name, class_name) in _TOOL_PARSERS_TO_REGISTER.items():
module_path = f"vllm.tool_parsers.{file_name}"
ToolParserManager.register_lazy_module(name, module_path, class_name)


register_lazy_tool_parsers()
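
Out-of-tree parsers can hook into the same lazy registry without importing their module eagerly. A minimal sketch, assuming a plugin module my_plugin.parsers that defines MyToolParser (both names hypothetical):

from vllm.tool_parsers import ToolParserManager

# The plugin module is only imported when the parser is first requested,
# mirroring how the built-in parsers above are registered.
ToolParserManager.register_lazy_module(
    name="my_parser",                 # hypothetical parser name
    module_path="my_plugin.parsers",  # hypothetical module
    class_name="MyToolParser",        # hypothetical class
)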

vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py → vllm/tool_parsers/abstract_tool_parser.py View File

@@ -17,12 +17,12 @@ from vllm.entrypoints.openai.protocol import (
ResponsesRequest,
ResponseTextConfig,
)
from vllm.entrypoints.openai.tool_parsers.utils import get_json_schema_from_tools
from vllm.logger import init_logger
from vllm.sampling_params import (
StructuredOutputsParams,
)
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.utils import get_json_schema_from_tools
from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import import_from_path

@@ -203,7 +203,7 @@ class ToolParserManager:
Example:
ToolParserManager.register_lazy_module(
name="kimi_k2",
module_path="vllm.entrypoints.openai.tool_parsers.kimi_k2_parser",
module_path="vllm.tool_parsers.kimi_k2_parser",
class_name="KimiK2ToolParser",
)
"""

vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py → vllm/tool_parsers/deepseekv31_tool_parser.py View File

@@ -15,11 +15,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import ToolParser

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/deepseekv32_tool_parser.py → vllm/tool_parsers/deepseekv32_tool_parser.py View File

@@ -17,11 +17,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py → vllm/tool_parsers/deepseekv3_tool_parser.py View File

@@ -15,11 +15,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py → vllm/tool_parsers/ernie45_tool_parser.py View File

@@ -15,11 +15,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/gigachat3_tool_parser.py → vllm/tool_parsers/gigachat3_tool_parser.py View File

@@ -16,9 +16,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import ToolParser

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py → vllm/tool_parsers/glm4_moe_tool_parser.py View File

@@ -18,11 +18,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py → vllm/tool_parsers/granite_20b_fc_tool_parser.py View File

@@ -19,17 +19,17 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.entrypoints.openai.tool_parsers.utils import (
from vllm.tool_parsers.utils import (
consume_space,
find_common_prefix,
is_complete_json,
partial_json_loads,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py → vllm/tool_parsers/granite_tool_parser.py View File

@@ -17,17 +17,17 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.entrypoints.openai.tool_parsers.utils import (
from vllm.tool_parsers.utils import (
consume_space,
find_common_prefix,
is_complete_json,
partial_json_loads,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py → vllm/tool_parsers/hermes_tool_parser.py View File

@@ -18,12 +18,12 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py → vllm/tool_parsers/hunyuan_a13b_tool_parser.py View File

@@ -17,12 +17,12 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.entrypoints.openai.tool_parsers.utils import consume_space
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.tool_parsers.utils import consume_space
from vllm.utils import random_uuid

logger = init_logger(__name__)

vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py → vllm/tool_parsers/internlm2_tool_parser.py View File

@@ -17,12 +17,12 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.tool_parsers.utils import extract_intermediate_diff

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py → vllm/tool_parsers/jamba_tool_parser.py View File

@@ -18,11 +18,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers import ToolParser
from vllm.tool_parsers.utils import extract_intermediate_diff

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py → vllm/tool_parsers/kimi_k2_tool_parser.py View File

@@ -15,11 +15,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py → vllm/tool_parsers/llama4_pythonic_tool_parser.py View File

@@ -18,10 +18,10 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
-from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+from vllm.logger import init_logger
+from vllm.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
-from vllm.logger import init_logger

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py → vllm/tool_parsers/llama_tool_parser.py View File

@@ -20,15 +20,15 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
-from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+from vllm.logger import init_logger
+from vllm.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
-from vllm.entrypoints.openai.tool_parsers.utils import (
+from vllm.tool_parsers.utils import (
    find_common_prefix,
    is_complete_json,
    partial_json_loads,
)
-from vllm.logger import init_logger

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py → vllm/tool_parsers/longcat_tool_parser.py View File

@@ -3,8 +3,8 @@

import regex as re

from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser


class LongcatFlashToolParser(Hermes2ProToolParser):

vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py → vllm/tool_parsers/minimax_m2_tool_parser.py View File

@@ -17,11 +17,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py → vllm/tool_parsers/minimax_tool_parser.py View File

@@ -17,12 +17,12 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.tool_parsers.utils import extract_intermediate_diff

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py → vllm/tool_parsers/mistral_tool_parser.py View File

@@ -21,12 +21,12 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py → vllm/tool_parsers/olmo3_tool_parser.py View File

@@ -18,10 +18,10 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
-from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+from vllm.logger import init_logger
+from vllm.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
-from vllm.logger import init_logger

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py → vllm/tool_parsers/openai_tool_parser.py View File

@@ -12,10 +12,10 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
-from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+from vllm.logger import init_logger
+from vllm.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
-from vllm.logger import init_logger

if TYPE_CHECKING:
from vllm.tokenizers import TokenizerLike

vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py → vllm/tool_parsers/phi4mini_tool_parser.py View File

@@ -16,10 +16,10 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
-from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+from vllm.logger import init_logger
+from vllm.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
-from vllm.logger import init_logger

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py → vllm/tool_parsers/pythonic_tool_parser.py View File

@@ -19,10 +19,10 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
-from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+from vllm.logger import init_logger
+from vllm.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
-from vllm.logger import init_logger

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py → vllm/tool_parsers/qwen3coder_tool_parser.py View File

@@ -18,11 +18,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py → vllm/tool_parsers/qwen3xml_tool_parser.py View File

@@ -19,11 +19,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py → vllm/tool_parsers/seed_oss_tool_parser.py View File

@@ -21,11 +21,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)

logger = init_logger(__name__)


vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py → vllm/tool_parsers/step3_tool_parser.py View File

@@ -17,11 +17,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.utils import random_uuid

logger = init_logger(__name__)

vllm/entrypoints/openai/tool_parsers/utils.py → vllm/tool_parsers/utils.py View File


vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py → vllm/tool_parsers/xlam_tool_parser.py View File

@@ -17,7 +17,7 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
-from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
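
All of the tool-parser hunks above follow the same pattern: the modules move from vllm/entrypoints/openai/tool_parsers/ to the new top-level vllm/tool_parsers/ package, and only the import paths change. A minimal migration sketch for downstream code, assuming both paths export the same names (the try/except fallback below is illustrative and not part of this diff):

# Illustrative only: prefer the new top-level package introduced by this refactor,
# and fall back to the old entrypoints path on pre-refactor vLLM releases.
try:
    from vllm.tool_parsers.abstract_tool_parser import ToolParser
    from vllm.tool_parsers.utils import is_complete_json, partial_json_loads
except ImportError:  # pre-refactor vLLM
    from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
    from vllm.entrypoints.openai.tool_parsers.utils import (
        is_complete_json,
        partial_json_loads,
    )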

+ 1
- 0
vllm/transformers_utils/config.py View File

@@ -66,6 +66,7 @@ class LazyConfigDict(dict):

_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
afmoe="AfmoeConfig",
bagel="BagelConfig",
chatglm="ChatGLMConfig",
deepseek_vl_v2="DeepseekVLV2Config",
deepseek_v32="DeepseekV3Config",


+ 2
- 0
vllm/transformers_utils/configs/__init__.py View File

@@ -16,6 +16,7 @@ import importlib

_CLASS_TO_MODULE: dict[str, str] = {
"AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
"BagelConfig": "vllm.transformers_utils.configs.bagel",
"ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
"DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
"DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
@@ -54,6 +55,7 @@ _CLASS_TO_MODULE: dict[str, str] = {

__all__ = [
"AfmoeConfig",
"BagelConfig",
"ChatGLMConfig",
"DeepseekVLV2Config",
"DeepseekV3Config",


+ 53
- 0
vllm/transformers_utils/configs/bagel.py View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import PretrainedConfig, SiglipVisionConfig
from transformers.models.qwen2 import Qwen2Config


class BagelConfig(PretrainedConfig):
"""Configuration class for BAGEL model."""

model_type = "bagel"

def __init__(
self,
visual_gen: bool = True,
visual_und: bool = True,
llm_config: dict | Qwen2Config | None = None,
vit_config: dict | SiglipVisionConfig | None = None,
vae_config: dict | None = None,
latent_patch_size: int = 2,
max_latent_size: int = 32,
vit_max_num_patch_per_side: int = 70,
connector_act: str = "gelu_pytorch_tanh",
interpolate_pos: bool = False,
timestep_shift: float = 1.0,
**kwargs,
):
super().__init__(**kwargs)
self.visual_gen = visual_gen
self.visual_und = visual_und

# Convert dict configs to proper config objects
if isinstance(llm_config, dict):
self.llm_config = Qwen2Config(**llm_config)
else:
self.llm_config = llm_config or Qwen2Config()

if isinstance(vit_config, dict):
self.vit_config = SiglipVisionConfig(**vit_config)
else:
self.vit_config = vit_config or SiglipVisionConfig()

self.vae_config = vae_config or {"z_channels": 16, "downsample": 8}
self.latent_patch_size = latent_patch_size
self.max_latent_size = max_latent_size
self.vit_max_num_patch_per_side = vit_max_num_patch_per_side
self.connector_act = connector_act
self.interpolate_pos = interpolate_pos
self.timestep_shift = timestep_shift

@property
def hidden_size(self) -> int:
"""Return the hidden size of the language model."""
return self.llm_config.hidden_size
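
For reference, a minimal construction of the new config; the numeric values below are placeholders for illustration, not defaults taken from any released BAGEL checkpoint:

from vllm.transformers_utils.configs.bagel import BagelConfig

# Placeholder sizes for illustration; real checkpoints ship their own config.json.
config = BagelConfig(
    llm_config={"hidden_size": 3584, "num_hidden_layers": 28},  # coerced to Qwen2Config
    vit_config=None,  # falls back to a default SiglipVisionConfig()
    latent_patch_size=2,
    max_latent_size=32,
)
assert config.hidden_size == config.llm_config.hidden_size  # via the hidden_size property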

+ 2
- 0
vllm/transformers_utils/processors/__init__.py View File

@@ -8,6 +8,7 @@ reasons:
- There is a need to override the existing processor to support vLLM.
"""

from vllm.transformers_utils.processors.bagel import BagelProcessor
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
@@ -15,6 +16,7 @@ from vllm.transformers_utils.processors.ovis import OvisProcessor
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor

__all__ = [
"BagelProcessor",
"DeepseekVLV2Processor",
"HunYuanVLProcessor",
"HunYuanVLImageProcessor",


+ 73
- 0
vllm/transformers_utils/processors/bagel.py View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
"""BAGEL processor for image and text inputs."""

from transformers import AutoProcessor
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput


class BagelProcessor(ProcessorMixin):
"""
Constructs a BAGEL processor which wraps a
SigLIP image processor and a Qwen2 tokenizer.
"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "SiglipImageProcessor"
tokenizer_class = "AutoTokenizer"

def __call__(
self,
text: TextInput
| PreTokenizedInput
| list[TextInput]
| list[PreTokenizedInput] = None,
images: ImageInput = None,
**kwargs,
):
"""
Main method to prepare for the model one or several sequences(s) and image(s).
        (Prepares one or several text sequence(s) and image(s) for the model.)
"""
if images is not None:
# Process images with the image processor
# Ensure return_tensors is set to "pt" for PyTorch tensors
image_kwargs = {**kwargs}
if "return_tensors" not in image_kwargs:
image_kwargs["return_tensors"] = "pt"
pixel_values = self.image_processor(images, **image_kwargs)
else:
pixel_values = None

text_inputs = self.tokenizer(text, **kwargs) if text is not None else None

if pixel_values is not None and text_inputs is not None:
text_inputs["pixel_values"] = pixel_values["pixel_values"]
return text_inputs
elif pixel_values is not None:
return pixel_values
else:
return text_inputs

def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Qwen2TokenizerFast's batch_decode.
"""
return self.tokenizer.batch_decode(*args, **kwargs)

def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Qwen2TokenizerFast's decode.
"""
return self.tokenizer.decode(*args, **kwargs)

@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


AutoProcessor.register("BagelProcessor", BagelProcessor)
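
A hedged usage sketch for the new processor, assembling it from a stock SigLIP image processor and a Qwen2-style tokenizer. The tokenizer checkpoint and the blank image are placeholders; a real BAGEL checkpoint would ship its own processor files:

from PIL import Image
from transformers import AutoTokenizer, SiglipImageProcessor

from vllm.transformers_utils.processors import BagelProcessor

# Placeholder components for illustration; any Qwen2-style tokenizer works here.
image_processor = SiglipImageProcessor()
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

processor = BagelProcessor(image_processor=image_processor, tokenizer=tokenizer)
inputs = processor(
    text="Describe this image.",
    images=Image.new("RGB", (384, 384)),
    return_tensors="pt",
)
# inputs holds the tokenizer outputs plus "pixel_values" from the image processor.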

Some files were not shown because too many files changed in this diff
