11 Commits

Author | SHA1 | Message | Date
Cyrus Leung | 3a3b06ee70 | [Misc] Improve error message for `is_multimodal` (#30483) | 4 days ago
Martin Hickey | f4417f8449 | [KVConnector] Add KV events to KV Connectors (#28309) | 4 days ago
Qiu | a11f4a81e0 | [Misc][PCP&DCP] relocate PCP feature check (#30050) | 4 days ago
Kenichi Maehashi | 853611bb18 | Fix typo of endpoint name in CLI args docs (#30473) | 5 days ago
Cyrus Leung | d917747c95 | [Bugfix] Fix `task` still being passed in tests/benchmarks (#30476) | 5 days ago
wang.yuqi | a5f9fb5960 | [Deprecation] Deprecate `--convert reward`, use `--convert embed` instead (#30463) | 5 days ago
jeremyteboul | 4515eb1a0b | [Fix] Update lazy loading of video loader backend (#30444) | 5 days ago
Cyrus Leung | 13d63b65e0 | [Deprecation] Remove missed fallback for `embed_input_ids` (#30469) | 5 days ago
wz1qqx | b4e8b91278 | [Fix] Fix import error from lmcache (#30376) | 5 days ago
Rei. | 6299628d32 | [bugfix] Fix MiniMaxM2ReasoningParser streaming output not separating reasoning_content (#29882) | 5 days ago
Ming Yang | fba8906930 | [perf] Use direct copy (broadcast) instead of cat for k_nope/k_pe in MLA prefill (#29710) | 5 days ago
31 changed files with 1905 additions and 75 deletions
1. benchmarks/benchmark_ngram_proposer.py (+0, -1)
2. benchmarks/kernels/benchmark_mla_k_concat.py (+150, -0)
3. docs/models/pooling_models.md (+4, -1)
4. tests/models/language/pooling/test_mm_classifier_conversion.py (+0, -2)
5. tests/multimodal/test_video.py (+123, -1)
6. tests/reasoning/test_minimax_m2_append_reasoning_parser.py (+195, -0)
7. tests/reasoning/test_minimax_m2_reasoning_parser.py (+230, -0)
8. tests/v1/kv_connector/unit/test_lmcache_connector.py (+756, -0)
9. vllm/attention/backends/abstract.py (+6, -0)
10. vllm/config/model.py (+7, -0)
11. vllm/config/parallel.py (+0, -5)
12. vllm/config/pooler.py (+4, -2)
13. vllm/config/vllm.py (+0, -5)
14. vllm/distributed/kv_events.py (+129, -1)
15. vllm/distributed/kv_transfer/kv_connector/utils.py (+15, -0)
16. vllm/distributed/kv_transfer/kv_connector/v1/base.py (+9, -1)
17. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py (+114, -3)
18. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py (+6, -0)
19. vllm/engine/arg_utils.py (+0, -10)
20. vllm/entrypoints/openai/cli_args.py (+1, -1)
21. vllm/model_executor/models/interfaces.py (+17, -3)
22. vllm/model_executor/models/interfaces_base.py (+1, -7)
23. vllm/model_executor/models/phi3v.py (+2, -3)
24. vllm/model_executor/models/qwen3_vl.py (+2, -1)
25. vllm/multimodal/video.py (+8, -1)
26. vllm/reasoning/minimax_m2_reasoning_parser.py (+43, -0)
27. vllm/v1/attention/backends/mla/common.py (+30, -3)
28. vllm/v1/outputs.py (+4, -0)
29. vllm/v1/worker/cp_utils.py (+42, -0)
30. vllm/v1/worker/gpu_model_runner.py (+4, -14)
31. vllm/v1/worker/kv_connector_model_runner_mixin.py (+3, -10)

benchmarks/benchmark_ngram_proposer.py (+0, -1)

@@ -32,7 +32,6 @@ def benchmark_propose(args):

model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",


benchmarks/kernels/benchmark_mla_k_concat.py (+150, -0)

@@ -0,0 +1,150 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
in MLA (Multi-head Latent Attention) prefill.

This validates that the optimization from commit 8d4142bd is beneficial across
various batch sizes, not just the originally tested batch size of 32768.
"""

import time
from collections.abc import Callable

import torch

# DeepSeek-V3 MLA dimensions
NUM_HEADS = 128
QK_NOPE_HEAD_DIM = 128
PE_DIM = 64


def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
"""Original torch.cat approach with expand."""
return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)


def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
"""Optimized direct copy approach (avoids expand + cat overhead)."""
k = torch.empty(
(*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
dtype=k_nope.dtype,
device=k_nope.device,
)
k[..., : k_nope.shape[-1]] = k_nope
k[..., k_nope.shape[-1] :] = k_pe
return k


def benchmark_method(
method: Callable,
k_nope: torch.Tensor,
k_pe: torch.Tensor,
num_warmup: int = 10,
num_iters: int = 100,
) -> float:
"""Benchmark a concatenation method and return mean latency in ms."""
# Warmup
for _ in range(num_warmup):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()

# Benchmark
start = time.perf_counter()
for _ in range(num_iters):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
end = time.perf_counter()

return (end - start) / num_iters * 1000 # Convert to ms


@torch.inference_mode()
def run_benchmark(dtype: torch.dtype, dtype_name: str):
"""Run benchmark for a specific dtype."""
torch.set_default_device("cuda")

# Batch sizes to test (powers of 2 from 32 to 65536)
batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]

print("=" * 80)
print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
print("=" * 80)
print(
f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
f"k_pe=[B, 1, {PE_DIM}]"
)
print(f"dtype: {dtype_name}")
print()
print(
f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
f"{'Speedup':>8} | {'Reduction':>10}"
)
print("-" * 70)

results = []
for batch_size in batch_sizes:
# Create input tensors (generate in float32 then convert for FP8 compatibility)
k_nope = torch.randn(
batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
).to(dtype)
k_pe = torch.randn(
batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
).to(dtype)

# Benchmark both methods
cat_time = benchmark_method(cat_method, k_nope, k_pe)
direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)

speedup = cat_time / direct_time
reduction = (1 - direct_time / cat_time) * 100

results.append((batch_size, cat_time, direct_time, speedup, reduction))

print(
f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
f"{speedup:>7.2f}x | {reduction:>9.1f}%"
)

print("=" * 80)

# Summary statistics
speedups = [r[3] for r in results]
print("\nSpeedup summary:")
print(f" Min: {min(speedups):.2f}x")
print(f" Max: {max(speedups):.2f}x")
print(f" Mean: {sum(speedups) / len(speedups):.2f}x")

# Find crossover point
crossover_batch = None
for batch_size, _, _, speedup, _ in results:
if speedup >= 1.0:
crossover_batch = batch_size
break

print("\nConclusion:")
if crossover_batch:
print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
# Filter for large batches (>= 512 which is typical for prefill)
large_batch_speedups = [r[3] for r in results if r[0] >= 512]
if large_batch_speedups:
avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
print(" - MLA prefill typically uses large batches, so optimization is effective")

return results


@torch.inference_mode()
def main():
# Test bfloat16
print("\n")
run_benchmark(torch.bfloat16, "bfloat16")

# Test float8_e4m3fn
print("\n")
run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")


if __name__ == "__main__":
main()
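As a usage note: the script above is presumably run standalone on a CUDA machine (for example `python benchmarks/kernels/benchmark_mla_k_concat.py`). It prints the cat-vs-direct-copy latency table for both bfloat16 and float8_e4m3fn; the direct-copy variant writes `k_nope` and `k_pe` straight into a preallocated buffer, avoiding the expand + cat overhead noted in the code.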

docs/models/pooling_models.md (+4, -1)

@@ -316,10 +316,13 @@ We have split the `encode` task into two more specific token-wise tasks: `token_

### Remove softmax from PoolingParams

We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
We are going to remove `softmax` and `activation` from `PoolingParams` in v0.15. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
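A minimal migration sketch (assuming `PoolingParams` exposes the field names exactly as this section describes; not taken from the PR itself):

```python
from vllm import PoolingParams

# Before (deprecated, to be removed in v0.15):
#   PoolingParams(softmax=False)
# After: a single flag controls the activation for `classify` / `token_classify`.
params = PoolingParams(use_activation=False)
```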

### as_reward_model

!!! warning
We are going to remove `--convert reward` in v0.15, use `--convert embed` instead.
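A rough sketch of the replacement (the model name is illustrative only; this assumes `convert` is accepted by `LLM` the same way the tests in this PR pass it to `vllm_runner`):

```python
from vllm import LLM

# Before (deprecated, to be removed in v0.15): convert="reward"
llm = LLM(
    model="internlm/internlm2-1_8b-reward",  # illustrative reward model
    runner="pooling",
    convert="embed",
)
```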

Pooling models now support all pooling tasks by default; you can use them without any extra settings.

- Extracting hidden states prefers using `token_embed` task.


tests/models/language/pooling/test_mm_classifier_conversion.py (+0, -2)

@@ -17,7 +17,6 @@ def test_idefics_multimodal(
with vllm_runner(
model_name="HuggingFaceM4/Idefics3-8B-Llama3",
runner="pooling",
task="classify",
convert="classify",
load_format="dummy",
max_model_len=512,
@@ -86,7 +85,6 @@ def test_gemma_multimodal(
with vllm_runner(
model_name="google/gemma-3-4b-it",
runner="pooling",
task="classify",
convert="classify",
load_format="auto",
hf_overrides=update_config,


tests/multimodal/test_video.py (+123, -1)

@@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
"""
Regression test for handling videos with broken frames.
This test uses a pre-corrupted video file (assets/corrupted.mp4) that
contains broken/unreadable frames to verify the video loader handles
contains broken frames to verify the video loader handles
them gracefully without crashing and returns accurate metadata.
"""
with monkeypatch.context() as m:
@@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
f"Expected fewer than {metadata['total_num_frames']} frames, "
f"but loaded {frames.shape[0]} frames"
)


@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
"""Test loader that returns FAKE_OUTPUT_1 to verify backend selection."""

@classmethod
def load_bytes(
cls, data: bytes, num_frames: int = -1, **kwargs
) -> tuple[npt.NDArray, dict]:
return FAKE_OUTPUT_1, {"video_backend": "test_video_backend_override_1"}


@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
"""Test loader that returns FAKE_OUTPUT_2 to verify backend selection."""

@classmethod
def load_bytes(
cls, data: bytes, num_frames: int = -1, **kwargs
) -> tuple[npt.NDArray, dict]:
return FAKE_OUTPUT_2, {"video_backend": "test_video_backend_override_2"}


def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
"""
Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND
environment variable.

This allows users to dynamically select a different video backend
via --media-io-kwargs without changing the global env var, which is
useful when plugins set a default backend but a specific request
needs a different one.
"""
with monkeypatch.context() as m:
# Set the env var to one backend
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1")

imageio = ImageMediaIO()

# Without video_backend kwarg, should use env var backend
videoio_default = VideoMediaIO(imageio, num_frames=10)
frames_default, metadata_default = videoio_default.load_bytes(b"test")
np.testing.assert_array_equal(frames_default, FAKE_OUTPUT_1)
assert metadata_default["video_backend"] == "test_video_backend_override_1"

# With video_backend kwarg, should override env var
videoio_override = VideoMediaIO(
imageio, num_frames=10, video_backend="test_video_backend_override_2"
)
frames_override, metadata_override = videoio_override.load_bytes(b"test")
np.testing.assert_array_equal(frames_override, FAKE_OUTPUT_2)
assert metadata_override["video_backend"] == "test_video_backend_override_2"


def test_video_media_io_backend_kwarg_not_passed_to_loader(
monkeypatch: pytest.MonkeyPatch,
):
"""
Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed
through to the underlying video loader's load_bytes method.

This ensures the kwarg is properly popped from kwargs before forwarding.
"""

@VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
class RejectVideoBackendKwargLoader(VideoLoader):
"""Test loader that fails if video_backend is passed through."""

@classmethod
def load_bytes(
cls, data: bytes, num_frames: int = -1, **kwargs
) -> tuple[npt.NDArray, dict]:
# This should never receive video_backend in kwargs
if "video_backend" in kwargs:
raise AssertionError(
"video_backend should be consumed by VideoMediaIO, "
"not passed to loader"
)
return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs.keys())}

with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_reject_video_backend_kwarg")

imageio = ImageMediaIO()

# Even when video_backend is provided, it should NOT be passed to loader
videoio = VideoMediaIO(
imageio,
num_frames=10,
video_backend="test_reject_video_backend_kwarg",
other_kwarg="should_pass_through",
)

# This should NOT raise AssertionError
frames, metadata = videoio.load_bytes(b"test")
np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)
# Verify other kwargs are still passed through
assert "other_kwarg" in metadata["received_kwargs"]


def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
"""
Test that when video_backend kwarg is None or not provided,
VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var.
"""
with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_2")

imageio = ImageMediaIO()

# Explicit None should fall back to env var
videoio_none = VideoMediaIO(imageio, num_frames=10, video_backend=None)
frames_none, metadata_none = videoio_none.load_bytes(b"test")
np.testing.assert_array_equal(frames_none, FAKE_OUTPUT_2)
assert metadata_none["video_backend"] == "test_video_backend_override_2"

# Not providing video_backend should also fall back to env var
videoio_missing = VideoMediaIO(imageio, num_frames=10)
frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
assert metadata_missing["video_backend"] == "test_video_backend_override_2"
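A rough end-user sketch of what these tests cover (model and backend names are illustrative; it assumes `media_io_kwargs` reaches `VideoMediaIO` as wired up in this PR):

```python
from vllm import LLM

# Select a video loader backend per engine instead of relying on the
# VLLM_VIDEO_LOADER_BACKEND environment variable.
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",  # illustrative multimodal model
    media_io_kwargs={"video": {"video_backend": "opencv", "num_frames": 32}},
)
```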

tests/reasoning/test_minimax_m2_append_reasoning_parser.py (+195, -0)

@@ -0,0 +1,195 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from transformers import AutoTokenizer

from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager

parser_name = "minimax_m2_append_think"
end_token = "</think>"

# MiniMax M2 model path
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"


@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)


# =============================================================================
# MiniMaxM2AppendThinkReasoningParser behavior:
# - Prepends <think> to the beginning of the output
# - Does NOT separate reasoning and content
# - Returns everything as content (with <think> prepended)
# - reasoning is always None
#
# This parser is used when you want to keep the raw output with <think> added
# =============================================================================

# Case: simple output with end token
SIMPLE_OUTPUT = {
"output": "This is reasoning</think>This is response",
"reasoning": None,
"content": "<think>This is reasoning</think>This is response",
"is_reasoning_end": True,
}

# Case: output without end token (reasoning in progress)
NO_END_TOKEN = {
"output": "This is reasoning in progress",
"reasoning": None,
"content": "<think>This is reasoning in progress",
"is_reasoning_end": False,
}

# Case: only end token
ONLY_END_TOKEN = {
"output": "</think>This is response",
"reasoning": None,
"content": "<think></think>This is response",
"is_reasoning_end": True,
}

# Case: multiple lines
MULTIPLE_LINES = {
"output": "Line 1\nLine 2</think>Response 1\nResponse 2",
"reasoning": None,
"content": "<think>Line 1\nLine 2</think>Response 1\nResponse 2",
"is_reasoning_end": True,
}

# Case: empty output (non-streaming prepends <think>)
EMPTY = {
"output": "",
"reasoning": None,
"content": "<think>",
"is_reasoning_end": False,
}

# Case: empty output streaming (no tokens = no output)
EMPTY_STREAMING = {
"output": "",
"reasoning": None,
"content": None,
"is_reasoning_end": False,
}

# Case: special characters
SPECIAL_CHARS = {
"output": "Let me think... 1+1=2</think>Yes!",
"reasoning": None,
"content": "<think>Let me think... 1+1=2</think>Yes!",
"is_reasoning_end": True,
}

# Case: code in output
CODE_OUTPUT = {
"output": "```python\nprint('hi')\n```</think>Here's the code.",
"reasoning": None,
"content": "<think>```python\nprint('hi')\n```</think>Here's the code.",
"is_reasoning_end": True,
}

TEST_CASES = [
pytest.param(
False,
SIMPLE_OUTPUT,
id="simple_output",
),
pytest.param(
True,
SIMPLE_OUTPUT,
id="simple_output_streaming",
),
pytest.param(
False,
NO_END_TOKEN,
id="no_end_token",
),
pytest.param(
True,
NO_END_TOKEN,
id="no_end_token_streaming",
),
pytest.param(
False,
ONLY_END_TOKEN,
id="only_end_token",
),
pytest.param(
True,
ONLY_END_TOKEN,
id="only_end_token_streaming",
),
pytest.param(
False,
MULTIPLE_LINES,
id="multiple_lines",
),
pytest.param(
True,
MULTIPLE_LINES,
id="multiple_lines_streaming",
),
pytest.param(
False,
EMPTY,
id="empty",
),
pytest.param(
True,
EMPTY_STREAMING,
id="empty_streaming",
),
pytest.param(
False,
SPECIAL_CHARS,
id="special_chars",
),
pytest.param(
True,
SPECIAL_CHARS,
id="special_chars_streaming",
),
pytest.param(
False,
CODE_OUTPUT,
id="code_output",
),
pytest.param(
True,
CODE_OUTPUT,
id="code_output_streaming",
),
]


@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
streaming: bool,
param_dict: dict,
minimax_m2_tokenizer,
):
output = minimax_m2_tokenizer.tokenize(param_dict["output"])
# decode everything to tokens
output_tokens: list[str] = [
minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
]
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
minimax_m2_tokenizer
)

reasoning, content = run_reasoning_extraction(
parser, output_tokens, streaming=streaming
)

assert reasoning == param_dict["reasoning"]
assert content == param_dict["content"]

# Test is_reasoning_end
output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
is_reasoning_end = parser.is_reasoning_end(output_ids)
assert is_reasoning_end == param_dict["is_reasoning_end"]
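In plain terms, the non-streaming cases above pin down behavior equivalent to this sketch (editor's illustration, not the parser's actual implementation):

```python
def append_think(model_output: str) -> tuple[None, str]:
    """Prepend <think> and return everything as content; reasoning is always None."""
    return None, "<think>" + model_output
```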

tests/reasoning/test_minimax_m2_reasoning_parser.py (+230, -0)

@@ -0,0 +1,230 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from transformers import AutoTokenizer

from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager

parser_name = "minimax_m2"
end_token = "</think>"

# MiniMax M2 model path
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"


@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)


# =============================================================================
# MiniMax M2 specific behavior:
# - Model does NOT generate <think> start token
# - Model only generates </think> end token
# - All content before </think> is reasoning
# - All content after </think> is the actual response (content)
# =============================================================================

# Case: reasoning + end token + content (typical case)
SIMPLE_REASONING = {
"output": "This is a reasoning section</think>This is the rest",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}

# Case: reasoning + end token only (no content after)
COMPLETE_REASONING = {
"output": "This is a reasoning section</think>",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": True,
}

# Case: no end token yet (streaming in progress, all is reasoning)
NO_END_TOKEN = {
"output": "This is reasoning in progress",
"reasoning": "This is reasoning in progress",
"content": None,
"is_reasoning_end": False,
}

# Case: multiple lines of reasoning
MULTIPLE_LINES = {
"output": "First line\nSecond line</think>Response first line\nResponse second",
"reasoning": "First line\nSecond line",
"content": "Response first line\nResponse second",
"is_reasoning_end": True,
}

# Case: only end token (empty reasoning, immediate response)
SHORTEST_REASONING_NO_STREAMING = {
"output": "</think>This is the response",
"reasoning": "",
"content": "This is the response",
"is_reasoning_end": True,
}

# Case: only end token streaming (reasoning is None because it's just the token)
SHORTEST_REASONING_STREAMING = {
"output": "</think>This is the response",
"reasoning": None,
"content": "This is the response",
"is_reasoning_end": True,
}

# Case: empty output
EMPTY = {
"output": "",
"reasoning": "",
"content": None,
"is_reasoning_end": False,
}

# Case: empty streaming
EMPTY_STREAMING = {
"output": "",
"reasoning": None,
"content": None,
"is_reasoning_end": False,
}

# Case: long reasoning with special characters
SPECIAL_CHARS = {
"output": "Let me think... 1+1=2, right?</think>Yes, 1+1=2.",
"reasoning": "Let me think... 1+1=2, right?",
"content": "Yes, 1+1=2.",
"is_reasoning_end": True,
}

# Case: reasoning with code blocks
CODE_IN_REASONING = {
"output": "```python\nprint('hello')\n```</think>Here is the code.",
"reasoning": "```python\nprint('hello')\n```",
"content": "Here is the code.",
"is_reasoning_end": True,
}

TEST_CASES = [
# Core cases: no start token (MiniMax M2 actual behavior)
pytest.param(
False,
SIMPLE_REASONING,
id="simple_reasoning",
),
pytest.param(
True,
SIMPLE_REASONING,
id="simple_reasoning_streaming",
),
pytest.param(
False,
COMPLETE_REASONING,
id="complete_reasoning",
),
pytest.param(
True,
COMPLETE_REASONING,
id="complete_reasoning_streaming",
),
pytest.param(
False,
NO_END_TOKEN,
id="no_end_token",
),
pytest.param(
True,
NO_END_TOKEN,
id="no_end_token_streaming",
),
pytest.param(
False,
MULTIPLE_LINES,
id="multiple_lines",
),
pytest.param(
True,
MULTIPLE_LINES,
id="multiple_lines_streaming",
),
pytest.param(
False,
SHORTEST_REASONING_NO_STREAMING,
id="shortest_reasoning",
),
pytest.param(
True,
SHORTEST_REASONING_STREAMING,
id="shortest_reasoning_streaming",
),
pytest.param(
False,
EMPTY,
id="empty",
),
pytest.param(
True,
EMPTY_STREAMING,
id="empty_streaming",
),
pytest.param(
False,
SPECIAL_CHARS,
id="special_chars",
),
pytest.param(
True,
SPECIAL_CHARS,
id="special_chars_streaming",
),
pytest.param(
False,
CODE_IN_REASONING,
id="code_in_reasoning",
),
pytest.param(
True,
CODE_IN_REASONING,
id="code_in_reasoning_streaming",
),
]


@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
streaming: bool,
param_dict: dict,
minimax_m2_tokenizer,
):
output = minimax_m2_tokenizer.tokenize(param_dict["output"])
# decode everything to tokens
output_tokens: list[str] = [
minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
]
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
minimax_m2_tokenizer
)

reasoning, content = run_reasoning_extraction(
parser, output_tokens, streaming=streaming
)

assert reasoning == param_dict["reasoning"]
assert content == param_dict["content"]

# Test is_reasoning_end
output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
is_reasoning_end = parser.is_reasoning_end(output_ids)
assert is_reasoning_end == param_dict["is_reasoning_end"]

# Test extract_content
if param_dict["content"] is not None:
content = parser.extract_content_ids(output_ids)
assert content == minimax_m2_tokenizer.convert_tokens_to_ids(
minimax_m2_tokenizer.tokenize(param_dict["content"])
)
else:
content = parser.extract_content_ids(output)
assert content == []
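The non-streaming cases above amount to splitting on the first `</think>` (editor's sketch, not the parser's actual code):

```python
def split_on_think_end(model_output: str) -> tuple[str, str | None]:
    """Everything before </think> is reasoning; everything after it is content."""
    if "</think>" in model_output:
        reasoning, _, content = model_output.partition("</think>")
        return reasoning, content or None
    # No end token yet: the whole output is still reasoning.
    return model_output, None
```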

tests/v1/kv_connector/unit/test_lmcache_connector.py (+756, -0)

@@ -0,0 +1,756 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock

import pytest

from vllm.distributed.kv_events import BlockStored
from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector import (
LMCacheConnectorV1,
LMCacheKVEvents,
)
from vllm.v1.outputs import KVConnectorOutput


@pytest.fixture
def mock_lmcache_engine_event():
"""Create a mock event object that mimics what the lmcache engine returns."""

class MockEvent:
def __init__(
self,
block_hashes,
parent_block_hash,
token_ids,
lora_id,
block_size,
medium,
):
self.block_hashes = block_hashes
self.parent_block_hash = parent_block_hash
self.token_ids = token_ids
self.lora_id = lora_id
self.block_size = block_size
self.medium = medium

return MockEvent(
block_hashes=["hash1", "hash2"],
parent_block_hash="parent_hash",
token_ids=[1, 2, 3, 4],
lora_id=None,
block_size=16,
medium="GPU",
)


@pytest.fixture
def mock_connector():
"""Create a mock LMCacheConnectorV1 instance with mocked dependencies."""
connector = MagicMock(spec=LMCacheConnectorV1)
connector._kv_cache_events = None
connector._lmcache_engine = MagicMock()

# Make the methods use the real implementation
connector.get_kv_connector_kv_cache_events = (
LMCacheConnectorV1.get_kv_connector_kv_cache_events.__get__(
connector, LMCacheConnectorV1
)
)
connector.update_connector_output = (
LMCacheConnectorV1.update_connector_output.__get__(
connector, LMCacheConnectorV1
)
)
connector.take_events = LMCacheConnectorV1.take_events.__get__(
connector, LMCacheConnectorV1
)

return connector


class TestGetKVConnectorKVCacheEvents:
"""Test get_kv_connector_kv_cache_events method."""

def test_returns_none_when_no_events(self, mock_connector):
"""Test that None is returned when lmcache engine has no events."""
mock_connector._lmcache_engine.get_kv_events.return_value = None

result = mock_connector.get_kv_connector_kv_cache_events()

assert result is None
mock_connector._lmcache_engine.get_kv_events.assert_called_once()

def test_returns_none_when_empty_list(self, mock_connector):
"""Test that None is returned when lmcache engine returns empty list."""
mock_connector._lmcache_engine.get_kv_events.return_value = []

result = mock_connector.get_kv_connector_kv_cache_events()

assert result is None

def test_converts_single_event(self, mock_connector, mock_lmcache_engine_event):
"""Test conversion of a single event from lmcache engine format."""
mock_connector._lmcache_engine.get_kv_events.return_value = [
mock_lmcache_engine_event
]

result = mock_connector.get_kv_connector_kv_cache_events()

assert result is not None
assert isinstance(result, LMCacheKVEvents)
assert result.get_number_of_workers() == 1

events = result.get_all_events()
assert len(events) == 1
assert isinstance(events[0], BlockStored)
assert events[0].block_hashes == ["hash1", "hash2"]
assert events[0].parent_block_hash == "parent_hash"
assert events[0].token_ids == [1, 2, 3, 4]
assert events[0].lora_id is None
assert events[0].block_size == 16
assert events[0].medium == "GPU"

def test_converts_multiple_events(self, mock_connector):
"""Test conversion of multiple events from lmcache engine format."""

class MockEvent:
def __init__(self, i):
self.block_hashes = [f"hash{i}"]
self.parent_block_hash = f"parent{i}"
self.token_ids = [i]
self.lora_id = None
self.block_size = 16
self.medium = "GPU"

events = [MockEvent(i) for i in range(5)]
mock_connector._lmcache_engine.get_kv_events.return_value = events

result = mock_connector.get_kv_connector_kv_cache_events()

assert result is not None
assert isinstance(result, LMCacheKVEvents)

converted_events = result.get_all_events()
assert len(converted_events) == 5

for i, event in enumerate(converted_events):
assert isinstance(event, BlockStored)
assert event.block_hashes == [f"hash{i}"]
assert event.parent_block_hash == f"parent{i}"
assert event.token_ids == [i]

def test_preserves_event_attributes(self, mock_connector):
"""Test that all event attributes are correctly preserved."""

class MockEventWithLora:
def __init__(self):
self.block_hashes = ["hash_a", "hash_b", "hash_c"]
self.parent_block_hash = "parent_xyz"
self.token_ids = [100, 200, 300]
self.lora_id = 42
self.block_size = 32
self.medium = "DISK"

mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEventWithLora()
]

result = mock_connector.get_kv_connector_kv_cache_events()

events = result.get_all_events()
event = events[0]

assert event.block_hashes == ["hash_a", "hash_b", "hash_c"]
assert event.parent_block_hash == "parent_xyz"
assert event.token_ids == [100, 200, 300]
assert event.lora_id == 42
assert event.block_size == 32
assert event.medium == "DISK"

def test_handles_none_parent_block_hash(self, mock_connector):
"""Test handling of events with None parent_block_hash."""

class MockEventNoParent:
def __init__(self):
self.block_hashes = ["hash1"]
self.parent_block_hash = None
self.token_ids = [1, 2]
self.lora_id = None
self.block_size = 16
self.medium = "GPU"

mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEventNoParent()
]

result = mock_connector.get_kv_connector_kv_cache_events()

events = result.get_all_events()
assert events[0].parent_block_hash is None


class TestUpdateConnectorOutput:
"""Test update_connector_output method."""

def test_does_nothing_when_kv_cache_events_is_none(self, mock_connector):
"""Test that method returns early when kv_cache_events is None."""
connector_output = KVConnectorOutput(kv_cache_events=None)

mock_connector.update_connector_output(connector_output)

assert mock_connector._kv_cache_events is None

def test_does_nothing_when_kv_cache_events_is_not_lmcache_kv_events(
self, mock_connector
):
"""Test that method returns early when kv_cache_events is not
LMCacheKVEvents."""
# Create a mock object that is not LMCacheKVEvents
fake_events = MagicMock()
connector_output = KVConnectorOutput(kv_cache_events=fake_events)

mock_connector.update_connector_output(connector_output)

assert mock_connector._kv_cache_events is None

def test_sets_kv_cache_events_when_none(self, mock_connector):
"""Test that _kv_cache_events is set when it was None."""
kv_events = LMCacheKVEvents(num_workers=1)
event = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1, 2],
block_size=16,
lora_id=None,
medium="GPU",
)
kv_events.add_events([event])

connector_output = KVConnectorOutput(kv_cache_events=kv_events)

mock_connector.update_connector_output(connector_output)

assert mock_connector._kv_cache_events is kv_events

def test_adds_events_when_kv_cache_events_already_exists(self, mock_connector):
"""Test that events are added when _kv_cache_events already exists."""
# Set up existing events
existing_events = LMCacheKVEvents(num_workers=2)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
existing_events.add_events([event1])
existing_events.add_events([event1]) # Simulate 2 workers reporting

mock_connector._kv_cache_events = existing_events

# Create new events to add
new_events = LMCacheKVEvents(num_workers=1)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
new_events.add_events([event2])

connector_output = KVConnectorOutput(kv_cache_events=new_events)

mock_connector.update_connector_output(connector_output)

# Check that events were added
all_events = mock_connector._kv_cache_events.get_all_events()
assert len(all_events) == 3 # 2 from existing + 1 from new
assert event1 in all_events
assert event2 in all_events

def test_increments_workers_when_kv_cache_events_already_exists(
self, mock_connector
):
"""Test that worker count is incremented correctly."""
# Set up existing events with 2 workers
existing_events = LMCacheKVEvents(num_workers=2)
mock_connector._kv_cache_events = existing_events

# Create new events from 3 workers
new_events = LMCacheKVEvents(num_workers=3)
event = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
new_events.add_events([event])

connector_output = KVConnectorOutput(kv_cache_events=new_events)

mock_connector.update_connector_output(connector_output)

# Worker count should be 2 + 3 = 5
assert mock_connector._kv_cache_events.get_number_of_workers() == 5

def test_multiple_updates(self, mock_connector):
"""Test multiple consecutive updates."""
# First update
events1 = LMCacheKVEvents(num_workers=1)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
events1.add_events([event1])
output1 = KVConnectorOutput(kv_cache_events=events1)
mock_connector.update_connector_output(output1)

# Second update
events2 = LMCacheKVEvents(num_workers=2)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
events2.add_events([event2])
output2 = KVConnectorOutput(kv_cache_events=events2)
mock_connector.update_connector_output(output2)

# Third update
events3 = LMCacheKVEvents(num_workers=1)
event3 = BlockStored(
block_hashes=["hash3"],
parent_block_hash=None,
token_ids=[3],
block_size=16,
lora_id=None,
medium="GPU",
)
events3.add_events([event3])
output3 = KVConnectorOutput(kv_cache_events=events3)
mock_connector.update_connector_output(output3)

# Check final state
all_events = mock_connector._kv_cache_events.get_all_events()
assert len(all_events) == 3
assert mock_connector._kv_cache_events.get_number_of_workers() == 4 # 1+2+1

def test_updates_with_empty_events(self, mock_connector):
"""Test updating with empty event lists."""
# First update with actual events
events1 = LMCacheKVEvents(num_workers=1)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
events1.add_events([event1])
output1 = KVConnectorOutput(kv_cache_events=events1)
mock_connector.update_connector_output(output1)

# Second update with empty events
events2 = LMCacheKVEvents(num_workers=2)
# No events added
output2 = KVConnectorOutput(kv_cache_events=events2)
mock_connector.update_connector_output(output2)

# Should still have the original event
all_events = mock_connector._kv_cache_events.get_all_events()
assert len(all_events) == 1
assert mock_connector._kv_cache_events.get_number_of_workers() == 3


class TestTakeEvents:
"""Test take_events method."""

def test_yields_nothing_when_kv_cache_events_is_none(self, mock_connector):
"""Test that nothing is yielded when _kv_cache_events is None."""
mock_connector._kv_cache_events = None

events = list(mock_connector.take_events())

assert events == []

def test_yields_events_and_clears(self, mock_connector):
"""Test that events are yielded and then cleared."""
# Set up events
kv_events = LMCacheKVEvents(num_workers=1)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
kv_events.add_events([event1, event2])
mock_connector._kv_cache_events = kv_events

# Take events
events = list(mock_connector.take_events())

# Check that events were yielded
assert len(events) == 2
assert event1 in events
assert event2 in events

# Check that _kv_cache_events was cleared
assert mock_connector._kv_cache_events is None

def test_aggregates_before_yielding(self, mock_connector):
"""Test that events are aggregated before yielding."""
# Set up events from multiple workers
kv_events = LMCacheKVEvents(num_workers=3)
common_event = BlockStored(
block_hashes=["hash_common"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
uncommon_event = BlockStored(
block_hashes=["hash_uncommon"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)

# All 3 workers report common_event
kv_events.add_events([common_event])
kv_events.add_events([common_event])
kv_events.add_events([common_event])

# Only 1 worker reports uncommon_event
kv_events.add_events([uncommon_event])

mock_connector._kv_cache_events = kv_events

# Take events
events = list(mock_connector.take_events())

# Only the common event should be yielded
assert len(events) == 1
assert events[0] == common_event

def test_multiple_take_events_calls(self, mock_connector):
"""Test calling take_events multiple times."""
# First call with events
kv_events1 = LMCacheKVEvents(num_workers=1)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
kv_events1.add_events([event1])
mock_connector._kv_cache_events = kv_events1

events1 = list(mock_connector.take_events())
assert len(events1) == 1
assert events1[0] == event1
assert mock_connector._kv_cache_events is None

# Second call with no events
events2 = list(mock_connector.take_events())
assert events2 == []

# Third call after adding new events
kv_events2 = LMCacheKVEvents(num_workers=1)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
kv_events2.add_events([event2])
mock_connector._kv_cache_events = kv_events2

events3 = list(mock_connector.take_events())
assert len(events3) == 1
assert events3[0] == event2

def test_yields_empty_after_aggregation_removes_all(self, mock_connector):
"""Test that nothing is yielded if aggregation removes all events."""
# Set up events from 2 workers with no common events
kv_events = LMCacheKVEvents(num_workers=2)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)

# Worker 1 reports event1
kv_events.add_events([event1])
# Worker 2 reports event2
kv_events.add_events([event2])

mock_connector._kv_cache_events = kv_events

# Take events
events = list(mock_connector.take_events())

# No common events, so nothing should be yielded
assert events == []
assert mock_connector._kv_cache_events is None


class TestIntegrationScenarios:
"""Test integration scenarios."""

def test_full_workflow(self, mock_connector, mock_lmcache_engine_event):
"""Test a complete workflow from getting events to taking them."""
# Step 1: Get events from lmcache engine
mock_connector._lmcache_engine.get_kv_events.return_value = [
mock_lmcache_engine_event
]
kv_events = mock_connector.get_kv_connector_kv_cache_events()

assert kv_events is not None
assert len(kv_events.get_all_events()) == 1

# Step 2: Update connector output (simulate receiving from worker)
output1 = KVConnectorOutput(kv_cache_events=kv_events)
mock_connector.update_connector_output(output1)

assert mock_connector._kv_cache_events is not None

# Step 3: Take events
taken_events = list(mock_connector.take_events())

assert len(taken_events) == 1
assert mock_connector._kv_cache_events is None

def test_multiple_workers_workflow(self, mock_connector):
"""Test workflow with multiple workers."""

class MockEvent:
def __init__(self, hash_val):
self.block_hashes = [hash_val]
self.parent_block_hash = None
self.token_ids = [1]
self.lora_id = None
self.block_size = 16
self.medium = "GPU"

# Worker 1
mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEvent("hash_common"),
MockEvent("hash_worker1"),
]
kv_events1 = mock_connector.get_kv_connector_kv_cache_events()
output1 = KVConnectorOutput(kv_cache_events=kv_events1)
mock_connector.update_connector_output(output1)

# Worker 2
mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEvent("hash_common"),
MockEvent("hash_worker2"),
]
kv_events2 = mock_connector.get_kv_connector_kv_cache_events()
output2 = KVConnectorOutput(kv_cache_events=kv_events2)
mock_connector.update_connector_output(output2)

# Take events (should only get common events)
taken_events = list(mock_connector.take_events())

# With aggregation, only events reported by both workers should be present
# In this case, hash_common was reported by both
event_hashes = [e.block_hashes[0] for e in taken_events]
assert "hash_common" in event_hashes

def test_empty_workflow(self, mock_connector):
"""Test workflow when there are no events at any stage."""
# Get events returns None
mock_connector._lmcache_engine.get_kv_events.return_value = None
kv_events = mock_connector.get_kv_connector_kv_cache_events()

assert kv_events is None

# Update with None
output = KVConnectorOutput(kv_cache_events=None)
mock_connector.update_connector_output(output)

# Take events
taken_events = list(mock_connector.take_events())

assert taken_events == []
assert mock_connector._kv_cache_events is None

def test_repeated_cycles(self, mock_connector):
"""Test multiple cycles of the complete workflow."""

class MockEvent:
def __init__(self, cycle_num):
self.block_hashes = [f"hash_cycle_{cycle_num}"]
self.parent_block_hash = None
self.token_ids = [cycle_num]
self.lora_id = None
self.block_size = 16
self.medium = "GPU"

for cycle in range(3):
# Get events
mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEvent(cycle)
]
kv_events = mock_connector.get_kv_connector_kv_cache_events()

# Update
output = KVConnectorOutput(kv_cache_events=kv_events)
mock_connector.update_connector_output(output)

# Take
taken_events = list(mock_connector.take_events())

# Verify
assert len(taken_events) == 1
assert taken_events[0].block_hashes[0] == f"hash_cycle_{cycle}"
assert mock_connector._kv_cache_events is None

def test_lmcache_kv_events_aggregation(self):
"""
Test LMCacheKVEvents aggregation across TP ranks using
KVOutputAggregator (used by MultiprocExecutor).
"""
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.v1.outputs import ModelRunnerOutput

# Create KVOutputAggregator for 3 workers (simulating TP=3)
aggregator = KVOutputAggregator(expected_finished_count=3)

# Define common and unique events
common_event = BlockStored(
block_hashes=["hash_common"],
parent_block_hash="parent_common",
token_ids=[1, 2, 3],
block_size=16,
lora_id=None,
medium="GPU",
)

worker1_unique_event = BlockStored(
block_hashes=["hash_worker1"],
parent_block_hash="parent_w1",
token_ids=[4, 5],
block_size=16,
lora_id=None,
medium="GPU",
)

worker2_unique_event = BlockStored(
block_hashes=["hash_worker2"],
parent_block_hash="parent_w2",
token_ids=[6, 7],
block_size=16,
lora_id=None,
medium="GPU",
)

worker3_unique_event = BlockStored(
block_hashes=["hash_worker3"],
parent_block_hash="parent_w3",
token_ids=[8, 9],
block_size=16,
lora_id=None,
medium="GPU",
)

# Create events for each worker
# Worker 0: reports common event and its unique event
worker0_events = LMCacheKVEvents(num_workers=1)
worker0_events.add_events([common_event, worker1_unique_event])

# Worker 1: reports common event and its unique event
worker1_events = LMCacheKVEvents(num_workers=1)
worker1_events.add_events([common_event, worker2_unique_event])

# Worker 2: reports common event and its unique event
worker2_events = LMCacheKVEvents(num_workers=1)
worker2_events.add_events([common_event, worker3_unique_event])

# Create ModelRunnerOutput instances for each worker
worker_outputs = []
for i, worker_events in enumerate(
[worker0_events, worker1_events, worker2_events]
):
output = ModelRunnerOutput(
req_ids=[f"req_{i}"],
req_id_to_index={f"req_{i}": 0},
sampled_token_ids=[[123]], # dummy token
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[None],
kv_connector_output=KVConnectorOutput(
finished_sending=set([f"req_{i}_send"])
if i < 2
else None, # Workers 0,1 finished sending
finished_recving=set([f"req_{i}_recv"])
if i > 0
else None, # Workers 1,2 finished receiving
kv_cache_events=worker_events,
),
)
worker_outputs.append(output)

# Use the real aggregation mechanism (like MultiprocExecutor.execute_model)
aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
kv_cache_events = aggregated_output.kv_connector_output.kv_cache_events

assert isinstance(kv_cache_events, LMCacheKVEvents)

# After aggregation, events should be combined from all workers
# The aggregator doesn't automatically aggregate events, so we need to call
# aggregate() to get only common events
kv_cache_events.aggregate()
aggregated_events = kv_cache_events.get_all_events()

# Only the common event should remain after aggregation
# because it's the only event reported by all 3 workers
assert len(aggregated_events) == 1
assert aggregated_events[0] == common_event

# Verify the common event properties
assert aggregated_events[0].block_hashes == ["hash_common"]
assert aggregated_events[0].parent_block_hash == "parent_common"
assert aggregated_events[0].token_ids == [1, 2, 3]

vllm/attention/backends/abstract.py (+6, -0)

@@ -294,6 +294,12 @@ class AttentionImpl(ABC, Generic[T]):
# Some features like decode context parallelism require the softmax lse.
can_return_lse_for_decode: bool = False

# Whether the attention impl supports Prefill Context Parallelism.
supports_pcp: bool = False
# Whether the attention impl (or ops) supports MTP
# when cp_kv_cache_interleave_size > 1
supports_mtp_with_cp_non_trivial_interleave_size: bool = False

# some attention backends might not always want to return lse
# even if they can return lse (for efficiency reasons)
need_to_return_lse_for_decode: bool = False


vllm/config/model.py (+7, -0)

@@ -788,6 +788,13 @@ class ModelConfig:
runner_type: RunnerType,
convert: ConvertOption,
) -> ConvertType:
if convert == "reward":
logger.warning(
"`--convert reward` is deprecated and will be removed in v0.15. "
"Please use `--convert embed` instead."
)
return "embed"

if convert != "auto":
return convert



vllm/config/parallel.py (+0, -5)

@@ -317,11 +317,6 @@ class ParallelConfig:
"num_redundant_experts."
)

if self.prefill_context_parallel_size > 1:
raise ValueError(
"Prefill context parallelism is not fully supported. "
"Please set prefill_context_parallel_size to 1."
)
return self

@property


vllm/config/pooler.py (+4, -2)

@@ -111,13 +111,15 @@ class PoolerConfig:
def get_use_activation(o: object):
if softmax := getattr(o, "softmax", None) is not None:
logger.warning_once(
"softmax will be deprecated, please use use_activation instead."
"softmax will be deprecated and will be removed in v0.15. "
"Please use use_activation instead."
)
return softmax

if activation := getattr(o, "activation", None) is not None:
logger.warning_once(
"activation will be deprecated, please use use_activation instead."
"activation will be deprecated and will be removed in v0.15. "
"Please use use_activation instead."
)
return activation



vllm/config/vllm.py (+0, -5)

@@ -820,11 +820,6 @@ class VllmConfig:
f"({self.parallel_config.cp_kv_cache_interleave_size})."
)

assert (
self.parallel_config.cp_kv_cache_interleave_size == 1
or self.speculative_config is None
), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."

# Do this after all the updates to compilation_config.mode
self.compilation_config.set_splitting_ops_for_v1(
all2all_backend=self.parallel_config.all2all_backend,


vllm/distributed/kv_events.py (+129, -1)

@@ -5,7 +5,7 @@ import queue
import threading
import time
from abc import ABC, abstractmethod
from collections import deque
from collections import Counter, deque
from collections.abc import Callable
from dataclasses import asdict
from itertools import count
@@ -54,11 +54,26 @@ class BlockStored(KVCacheEvent):
lora_id: int | None
medium: str | None

def __hash__(self) -> int:
return hash(
(
tuple(self.block_hashes),
self.parent_block_hash,
tuple(self.token_ids),
self.block_size,
self.lora_id,
self.medium,
)
)


class BlockRemoved(KVCacheEvent):
block_hashes: list[ExternalBlockHash]
medium: str | None

def __hash__(self) -> int:
return hash((tuple(self.block_hashes), self.medium))


class AllBlocksCleared(KVCacheEvent):
pass
@@ -68,6 +83,119 @@ class KVEventBatch(EventBatch):
events: list[BlockStored | BlockRemoved | AllBlocksCleared]


class KVEventAggregator:
"""
Aggregates KV events across multiple workers.
Tracks how many times each event appears and returns only those
that were emitted by all workers.
"""

__slots__ = ("_event_counter", "_num_workers")

def __init__(self, num_workers: int) -> None:
if num_workers <= 0:
raise ValueError("num_workers must be greater than zero.")
self._event_counter: Counter[KVCacheEvent] = Counter()
self._num_workers: int = num_workers

def add_events(self, events: list[KVCacheEvent]) -> None:
"""
Add events from a worker batch.

:param events: List of KVCacheEvent objects.
"""
if not isinstance(events, list):
raise TypeError("events must be a list of KVCacheEvent.")
self._event_counter.update(events)

def get_common_events(self) -> list[KVCacheEvent]:
"""
Return events that appeared in all workers.

:return: List of events present in all workers.
"""
return [
event
for event, count in self._event_counter.items()
if count == self._num_workers
]

def get_all_events(self) -> list[KVCacheEvent]:
"""
Return all events for all workers.

:return: List of events for all workers.
"""
return list(self._event_counter.elements())

def clear_events(self) -> None:
"""
Clear all tracked events.
"""
self._event_counter.clear()

def increment_workers(self, count: int = 1) -> None:
"""
Increment the number of workers contributing events.

:param count: Number to increment the workers by.
"""
if count <= 0:
raise ValueError("count must be positive.")
self._num_workers += count

def reset_workers(self) -> None:
"""
Reset the number of workers to 1.
"""
self._num_workers = 1

def get_number_of_workers(self) -> int:
"""
Return the number of workers.

:return: int number of workers.
"""
return self._num_workers

def __repr__(self) -> str:
return (
f"<KVEventAggregator workers={self._num_workers}, "
f"events={len(self._event_counter)}>"
)


class KVConnectorKVEvents(ABC):
"""
Abstract base class for KV events.
Acts as a container for KV events from the connector.
"""

@abstractmethod
def add_events(self, events: list[KVCacheEvent]) -> None:
raise NotImplementedError

@abstractmethod
def aggregate(self) -> "KVConnectorKVEvents":
raise NotImplementedError

@abstractmethod
def increment_workers(self, count: int = 1) -> None:
raise NotImplementedError

@abstractmethod
def get_all_events(self) -> list[KVCacheEvent]:
raise NotImplementedError

@abstractmethod
def get_number_of_workers(self) -> int:
raise NotImplementedError

@abstractmethod
def clear_events(self) -> None:
raise NotImplementedError


class EventPublisher(ABC):
"""Lightweight publisher for EventBatch batches with data parallelism
support.
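A minimal usage sketch of the new `KVEventAggregator` (values are made up; it relies on the field-based `__hash__` added to `BlockStored` above):

```python
from vllm.distributed.kv_events import BlockStored, KVEventAggregator


def block(h: str) -> BlockStored:
    return BlockStored(block_hashes=[h], parent_block_hash=None, token_ids=[1],
                       block_size=16, lora_id=None, medium="GPU")


agg = KVEventAggregator(num_workers=2)
agg.add_events([block("shared"), block("only_rank0")])  # reported by worker 0
agg.add_events([block("shared")])                        # reported by worker 1
# Only events reported by every worker are considered common.
assert agg.get_common_events() == [block("shared")]
```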


vllm/distributed/kv_transfer/kv_connector/utils.py (+15, -0)

@@ -78,6 +78,7 @@ class KVOutputAggregator:
finished_sending = set[str]()
finished_recving = set[str]()
aggregated_kv_connector_stats = None
combined_kv_cache_events = None
invalid_block_ids = set[int]()
for model_runner_output in outputs:
assert model_runner_output is not None
@@ -119,6 +120,19 @@ class KVOutputAggregator:
aggregated_kv_connector_stats.aggregate(kv_connector_stats)
)

# Combine kv_cache_events from all workers.
if combined_kv_cache_events is None:
# Use the first worker's kv_cache events as the starting event list.
combined_kv_cache_events = kv_output.kv_cache_events
elif kv_cache_events := kv_output.kv_cache_events:
assert isinstance(
combined_kv_cache_events,
type(kv_cache_events),
)
worker_kv_cache_events = kv_cache_events.get_all_events()
combined_kv_cache_events.add_events(worker_kv_cache_events)
combined_kv_cache_events.increment_workers(1)

invalid_block_ids |= kv_output.invalid_block_ids

# select output of the worker specified by output_rank
@@ -129,6 +143,7 @@ class KVOutputAggregator:
finished_sending=finished_sending or None,
finished_recving=finished_recving or None,
kv_connector_stats=aggregated_kv_connector_stats or None,
kv_cache_events=combined_kv_cache_events or None,
invalid_block_ids=invalid_block_ids,
expected_finished_count=self._expected_finished_count,
)


vllm/distributed/kv_transfer/kv_connector/v1/base.py (+9, -1)

@@ -49,7 +49,7 @@ from vllm.v1.outputs import KVConnectorOutput

if TYPE_CHECKING:
from vllm.config import VllmConfig
from vllm.distributed.kv_events import KVCacheEvent
from vllm.distributed.kv_events import KVCacheEvent, KVConnectorKVEvents
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorPromMetrics,
KVConnectorStats,
@@ -379,6 +379,14 @@ class KVConnectorBase_V1(ABC):
"""
return None

def get_kv_connector_kv_cache_events(self) -> Optional["KVConnectorKVEvents"]:
"""
Get the KV connector kv cache events collected during the last interval.
This function should be called by the model runner after every model
execution and before cleanup.
"""
return None

def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None:
"""
Get the KVConnector handshake metadata for this connector.


vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py (+114, -3)

@@ -1,14 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any

import torch
from lmcache.integration.vllm.vllm_v1_adapter import (
LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
)

from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import VllmConfig
from vllm.distributed.kv_events import (
BlockStored,
KVCacheEvent,
KVConnectorKVEvents,
KVEventAggregator,
)
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1,
KVConnectorMetadata,
@@ -16,6 +20,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
)
from vllm.logger import init_logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import KVConnectorOutput

if TYPE_CHECKING:
from vllm.forward_context import ForwardContext
@@ -26,6 +31,44 @@ if TYPE_CHECKING:
logger = init_logger(__name__)


class LMCacheKVEvents(KVConnectorKVEvents):
"""
Concrete implementation of KVConnectorKVEvents using KVEventAggregator.
"""

def __init__(self, num_workers: int) -> None:
self._aggregator = KVEventAggregator(num_workers)

def add_events(self, events: list[KVCacheEvent]) -> None:
self._aggregator.add_events(events)

def aggregate(self) -> "LMCacheKVEvents":
"""
Aggregate KV events and retain only common events.
"""
common_events = self._aggregator.get_common_events()
self._aggregator.clear_events()
self._aggregator.add_events(common_events)
self._aggregator.reset_workers()
return self

def increment_workers(self, count: int = 1) -> None:
self._aggregator.increment_workers(count)

def get_all_events(self) -> list[KVCacheEvent]:
return self._aggregator.get_all_events()

def get_number_of_workers(self) -> int:
return self._aggregator.get_number_of_workers()

def clear_events(self) -> None:
self._aggregator.clear_events()
self._aggregator.reset_workers()

def __repr__(self) -> str:
return f"<LMCacheKVEvents events={self.get_all_events()}>"


class LMCacheConnectorV1(KVConnectorBase_V1):
def __init__(
self,
@@ -50,10 +93,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
cls = _adapter.LMCacheConnectorV1Impl
else:
logger.info("Initializing latest dev LMCache connector")
# lazy import
from lmcache.integration.vllm.vllm_v1_adapter import (
LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
)

cls = LMCacheConnectorLatestImpl

self._lmcache_engine = cls(vllm_config, role, self)

self._kv_cache_events: LMCacheKVEvents | None = None

# ==============================
# Worker-side methods
# ==============================
@@ -151,6 +201,31 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
# Fallback for older versions that don't support this method
return set()

def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None:
"""
Get the KV connector kv cache events collected during the last interval.
"""

events = self._lmcache_engine.get_kv_events() # type: ignore [attr-defined]
if not events:
return None

blocks: list[BlockStored] = [
BlockStored(
block_hashes=e.block_hashes,
parent_block_hash=e.parent_block_hash,
token_ids=e.token_ids,
lora_id=e.lora_id,
block_size=e.block_size,
medium=e.medium,
)
for e in events
]

lmcache_kv_events = LMCacheKVEvents(num_workers=1)
lmcache_kv_events.add_events(blocks)
return lmcache_kv_events

# ==============================
# Scheduler-side methods
# ==============================
@@ -198,6 +273,28 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
"""
return self._lmcache_engine.build_connector_meta(scheduler_output)

def update_connector_output(self, connector_output: KVConnectorOutput):
"""
Update KVConnector state from worker-side connectors output.

Args:
connector_output (KVConnectorOutput): the worker-side
connectors' output.
"""
# Get the KV events
kv_cache_events = connector_output.kv_cache_events
if not kv_cache_events or not isinstance(kv_cache_events, LMCacheKVEvents):
return

if self._kv_cache_events is None:
self._kv_cache_events = kv_cache_events
else:
self._kv_cache_events.add_events(kv_cache_events.get_all_events())
self._kv_cache_events.increment_workers(
kv_cache_events.get_number_of_workers()
)
return

def request_finished(
self,
request: "Request",
@@ -214,3 +311,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
returned by the engine.
"""
return self._lmcache_engine.request_finished(request, block_ids)

def take_events(self) -> Iterable["KVCacheEvent"]:
"""
Take the KV cache events from the connector.

Yields:
New KV cache events since the last call.
"""
if self._kv_cache_events is not None:
self._kv_cache_events.aggregate()
kv_cache_events = self._kv_cache_events.get_all_events()
yield from kv_cache_events
self._kv_cache_events.clear_events()
self._kv_cache_events = None
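
A minimal sketch of how the pieces above fit together (not part of the diff): two hypothetical worker-side LMCacheKVEvents batches are merged the way update_connector_output() does, and aggregate() then keeps only the events reported by every worker, as described by its docstring. The block hashes and token ids below are made up, and the sketch assumes LMCacheKVEvents and BlockStored are importable as shown in this file.

from vllm.distributed.kv_events import BlockStored
from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector import (
    LMCacheKVEvents,
)


def merge_worker_batches(batches: list[LMCacheKVEvents]) -> LMCacheKVEvents | None:
    # Mirrors the merging done in update_connector_output().
    merged: LMCacheKVEvents | None = None
    for batch in batches:
        if merged is None:
            merged = batch
        else:
            merged.add_events(batch.get_all_events())
            merged.increment_workers(batch.get_number_of_workers())
    return merged


# Hypothetical events: both workers stored block 1, only worker 0 stored block 2.
shared = BlockStored(block_hashes=[1], parent_block_hash=None,
                     token_ids=[10, 11], block_size=16, lora_id=None, medium=None)
extra = BlockStored(block_hashes=[2], parent_block_hash=1,
                    token_ids=[12, 13], block_size=16, lora_id=None, medium=None)

worker0 = LMCacheKVEvents(num_workers=1)
worker0.add_events([shared, extra])
worker1 = LMCacheKVEvents(num_workers=1)
worker1.add_events([shared])

merged = merge_worker_batches([worker0, worker1])
merged.aggregate()                # keep only events common to both workers
print(merged.get_all_events())    # expected: just the shared BlockStored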

+ 6
- 0
vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py View File

@@ -259,6 +259,12 @@ class MultiConnector(KVConnectorBase_V1):
agg_block_ids |= c.get_block_ids_with_load_errors()
return agg_block_ids

# TODO: Add a generic implementation of the 'get_kv_connector_kv_cache_events'
# method for the MultiConnector. It should be able to get events from multiple
# connectors, handling the case where only a subset of the requested connectors
# implements 'get_kv_connector_kv_cache_events'.
# Follow-on PR from https://github.com/vllm-project/vllm/pull/28309#pullrequestreview-3566351082

# ==============================
# Scheduler-side methods
# ==============================
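
One possible shape for that follow-up, sketched here only as an illustration: skip sub-connectors that return nothing and fold the rest into a single events object. The helper below is hypothetical; it assumes the per-connector events expose the add_events/increment_workers interface shown in lmcache_connector.py.

def aggregate_sub_connector_events(connectors):
    """Hypothetical helper: fold KV cache events from sub-connectors."""
    aggregated = None
    for connector in connectors:
        getter = getattr(connector, "get_kv_connector_kv_cache_events", None)
        events = getter() if getter is not None else None
        if events is None:
            continue  # connector doesn't implement the hook or has no events
        if aggregated is None:
            aggregated = events
        else:
            aggregated.add_events(events.get_all_events())
            aggregated.increment_workers(events.get_number_of_workers())
    return aggregated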


+ 0
- 10
vllm/engine/arg_utils.py View File

@@ -1848,16 +1848,6 @@ class EngineArgs:
default_chunked_prefill = model_config.is_chunked_prefill_supported
default_prefix_caching = model_config.is_prefix_caching_supported

if self.prefill_context_parallel_size > 1:
default_chunked_prefill = False
default_prefix_caching = False
logger.warning_once(
"--prefill-context-parallel-size > 1 is not compatible with "
"chunked prefill and prefix caching now. Chunked prefill "
"and prefix caching have been disabled by default.",
scope="local",
)

if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = default_chunked_prefill



+ 1
- 1
vllm/entrypoints/openai/cli_args.py View File

@@ -176,7 +176,7 @@ class FrontendArgs:
enable_force_include_usage: bool = False
"""If set to True, including usage on every request."""
enable_tokenizer_info_endpoint: bool = False
"""Enable the /get_tokenizer_info endpoint. May expose chat
"""Enable the `/tokenizer_info` endpoint. May expose chat
templates and other tokenizer configuration."""
enable_log_outputs: bool = False
"""If True, log model outputs (generations).


+ 17
- 3
vllm/model_executor/models/interfaces.py View File

@@ -53,6 +53,22 @@ The output embeddings must be one of the following formats:
"""


def _require_is_multimodal(is_multimodal: Tensor | None) -> Tensor:
"""
A helper function to be used in the context of
[vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids][]
to provide a better error message.
"""
if is_multimodal is None:
raise ValueError(
"`embed_input_ids` now requires `is_multimodal` arg, "
"please update your model runner according to "
"https://github.com/vllm-project/vllm/pull/16229."
)

return is_multimodal


@runtime_checkable
class SupportsMultiModal(Protocol):
"""The interface required for all multi-modal models."""
@@ -190,12 +206,10 @@ class SupportsMultiModal(Protocol):
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
return inputs_embeds

assert is_multimodal is not None

return _merge_multimodal_embeddings(
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
is_multimodal=is_multimodal,
is_multimodal=_require_is_multimodal(is_multimodal),
)
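
A small illustration of the helper's behaviour, assuming it is imported from vllm.model_executor.models.interfaces as defined above: a valid mask passes through unchanged, while None now fails with an actionable error instead of a bare assert.

import torch

from vllm.model_executor.models.interfaces import _require_is_multimodal

mask = torch.tensor([True, False, True])
assert _require_is_multimodal(mask) is mask  # valid masks are returned as-is

try:
    _require_is_multimodal(None)
except ValueError as err:
    print(err)  # explains that embed_input_ids now requires is_multimodal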




+ 1
- 7
vllm/model_executor/models/interfaces_base.py View File

@@ -49,13 +49,7 @@ class VllmModel(Protocol[T_co]):

def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
"""Apply token embeddings to `input_ids`."""
if hasattr(self, "get_input_embeddings"):
logger.warning_once(
"`get_input_embeddings` for vLLM models is deprecated and will be "
"removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
"this method to `embed_input_ids`."
)
return self.get_input_embeddings(input_ids)
...

def forward(self, input_ids: torch.Tensor, positions: torch.Tensor) -> T_co: ...



+ 2
- 3
vllm/model_executor/models/phi3v.py View File

@@ -64,6 +64,7 @@ from .interfaces import (
SupportsMultiModal,
SupportsPP,
SupportsQuant,
_require_is_multimodal,
)
from .utils import (
AutoWeightsLoader,
@@ -687,12 +688,10 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
return inputs_embeds

assert is_multimodal is not None

return _merge_multimodal_embeddings(
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
is_multimodal=is_multimodal,
is_multimodal=_require_is_multimodal(is_multimodal),
)

def forward(


+ 2
- 1
vllm/model_executor/models/qwen3_vl.py View File

@@ -93,6 +93,7 @@ from .interfaces import (
SupportsMRoPE,
SupportsMultiModal,
SupportsPP,
_require_is_multimodal,
)
from .qwen2_5_vl import (
Qwen2_5_VisionAttention,
@@ -1572,7 +1573,7 @@ class Qwen3VLForConditionalGeneration(
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
return inputs_embeds

assert is_multimodal is not None
is_multimodal = _require_is_multimodal(is_multimodal)

if self.use_deepstack:
(


+ 8
- 1
vllm/multimodal/video.py View File

@@ -283,8 +283,15 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
# They can be passed to the underlying
# media loaders (e.g. custom implementations)
# for flexible control.

# Allow per-request override of video backend via kwargs.
# This enables users to specify a different backend than the
# global VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
# --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
video_loader_backend = (
kwargs.pop("video_backend", None) or envs.VLLM_VIDEO_LOADER_BACKEND
)
self.kwargs = kwargs
video_loader_backend = envs.VLLM_VIDEO_LOADER_BACKEND
self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend)

def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
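
Precedence sketch for the override above, using a toy kwargs dict rather than the real VideoMediaIO constructor: a per-request "video_backend" key wins over the environment default, and the remaining kwargs stay available for the loader.

request_kwargs = {"video_backend": "torchcodec", "num_frames": 32}  # hypothetical
env_default = "opencv"  # stands in for envs.VLLM_VIDEO_LOADER_BACKEND

backend = request_kwargs.pop("video_backend", None) or env_default
assert backend == "torchcodec"
assert request_kwargs == {"num_frames": 32}  # forwarded to the video loader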


+ 43
- 0
vllm/reasoning/minimax_m2_reasoning_parser.py View File

@@ -19,6 +19,10 @@ logger = init_logger(__name__)
class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
"""
Reasoning parser for MiniMax M2 model.

MiniMax M2 models don't generate a <think> start token, only a </think> end
token. All content before </think> is reasoning; content after is the
actual response.
"""

@property
@@ -31,6 +35,45 @@ class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
"""The token that ends reasoning content."""
return "</think>"

def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
"""
Extract reasoning content from a delta message for streaming.

MiniMax M2 models don't generate a <think> start token, so we assume
all content is reasoning until we encounter the </think> end token.
"""
# Skip single end token
if len(delta_token_ids) == 1 and delta_token_ids[0] == self.end_token_id:
return None

# Check if end token has already appeared in previous tokens
# meaning we're past the reasoning phase
if self.end_token_id in previous_token_ids:
# We're past the reasoning phase, this is content
return DeltaMessage(content=delta_text)

# Check if end token is in delta tokens
if self.end_token_id in delta_token_ids:
# End token in delta, split reasoning and content
end_index = delta_text.find(self.end_token)
reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.end_token) :]
return DeltaMessage(
reasoning=reasoning if reasoning else None,
content=content if content else None,
)

# No end token yet, all content is reasoning
return DeltaMessage(reasoning=delta_text)
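
# Editor's illustration (not part of the diff): a standalone toy that mirrors
# the streaming branches above with made-up token ids instead of the real
# MiniMax tokenizer. Everything before </think> is reasoning, everything after
# is content, and a delta that contains </think> is split in two.
END = "</think>"
END_ID = 999  # hypothetical token id for </think>

def split_delta(previous_ids, delta_ids, delta_text):
    if delta_ids == [END_ID]:
        return None                                   # lone end token: emit nothing
    if END_ID in previous_ids:
        return {"content": delta_text}                # already past the reasoning phase
    if END_ID in delta_ids:
        idx = delta_text.find(END)
        return {"reasoning": delta_text[:idx] or None,
                "content": delta_text[idx + len(END):] or None}
    return {"reasoning": delta_text}                  # still inside reasoning

print(split_delta([], [1, 2], "Let me think"))            # {'reasoning': 'Let me think'}
print(split_delta([], [3, END_ID, 4], "done</think>Hi"))  # split at the end token
print(split_delta([3, END_ID], [5], " there"))            # {'content': ' there'}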


class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
"""


+ 30
- 3
vllm/v1/attention/backends/mla/common.py View File

@@ -1654,6 +1654,33 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
# Convert from (L, N, P) to (N, P, L)
self.W_UK_T = W_UK.permute(1, 2, 0)

def _concat_k_nope_k_pe(
self, k_nope: torch.Tensor, k_pe: torch.Tensor
) -> torch.Tensor:
"""
Efficiently concatenate k_nope and k_pe tensors along the last dimension.

This function avoids the performance penalty of torch.cat with expanded
non-contiguous tensors by pre-allocating the output and using direct copies.

Args:
k_nope: Tensor of shape [..., nope_dim]
k_pe: Tensor to broadcast and concatenate, typically shape [..., 1, pe_dim]
or [..., pe_dim]

Returns:
Tensor of shape [..., nope_dim + pe_dim]
"""
k = torch.empty(
(*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
dtype=k_nope.dtype,
device=k_nope.device,
)
# Direct copies with efficient broadcasting
k[..., : k_nope.shape[-1]] = k_nope
k[..., k_nope.shape[-1] :] = k_pe
return k

def _compute_prefill_context(
self,
q: torch.Tensor,
@@ -1690,7 +1717,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
)
k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)

k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
k = self._concat_k_nope_k_pe(k_nope, k_pe)

attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
prefill=prefill_metadata,
@@ -1794,7 +1821,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
-1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim
)
k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
k = self._concat_k_nope_k_pe(k_nope, k_pe)

attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
prefill=prefill_metadata,
@@ -1843,7 +1870,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
)
k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)

k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
k = self._concat_k_nope_k_pe(k_nope, k_pe)

output_prefill = self._run_prefill_new_tokens(
prefill=attn_metadata.prefill,
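
A small standalone sketch of the optimisation used above (the shapes are illustrative, not the real MLA head dimensions): pre-allocating the output and broadcasting k_pe into a slice gives the same result as expand + torch.cat without materialising the expanded, non-contiguous tensor.

import torch

tokens, heads, nope_dim, pe_dim = 1024, 16, 128, 64
k_nope = torch.randn(tokens, heads, nope_dim)
k_pe = torch.randn(tokens, 1, pe_dim)  # shared across heads

# Baseline: expand then concatenate.
k_cat = torch.cat((k_nope, k_pe.expand(tokens, heads, pe_dim)), dim=-1)

# Direct-copy variant, as in _concat_k_nope_k_pe.
k = torch.empty(tokens, heads, nope_dim + pe_dim,
                dtype=k_nope.dtype, device=k_nope.device)
k[..., :nope_dim] = k_nope
k[..., nope_dim:] = k_pe  # broadcasts over the heads dimension

assert torch.equal(k, k_cat)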


+ 4
- 0
vllm/v1/outputs.py View File

@@ -12,9 +12,11 @@ from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.v1.core.sched.output import SchedulerOutput

if TYPE_CHECKING:
from vllm.distributed.kv_events import KVConnectorKVEvents
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
else:
KVConnectorStats = object
KVConnectorKVEvents = object


class LogprobsLists(NamedTuple):
@@ -108,6 +110,7 @@ class KVConnectorOutput:
finished_sending: set[str] | None = None
finished_recving: set[str] | None = None
kv_connector_stats: KVConnectorStats | None = None
kv_cache_events: KVConnectorKVEvents | None = None
# IDs of externally computed KV blocks that failed to load.
# Requests referencing these blocks should be rescheduled to recompute them
invalid_block_ids: set[int] = field(default_factory=set)
@@ -123,6 +126,7 @@ class KVConnectorOutput:
not self.finished_sending
and not self.finished_recving
and not self.kv_connector_stats
and not self.kv_cache_events
and not self.invalid_block_ids
)



+ 42
- 0
vllm/v1/worker/cp_utils.py View File

@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, cast

from vllm.config import VllmConfig, get_layers_from_vllm_config

if TYPE_CHECKING:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
else:
AttentionLayerBase = object


def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
pcp_size = vllm_config.parallel_config.prefill_context_parallel_size
dcp_size = vllm_config.parallel_config.decode_context_parallel_size
interleave_size = vllm_config.parallel_config.cp_kv_cache_interleave_size
if pcp_size * dcp_size > 1:
layer_type = cast(type[Any], AttentionLayerBase)
layers = get_layers_from_vllm_config(vllm_config, layer_type)
for layer in layers.values():
layer_impl = getattr(layer, "impl", None)
if layer_impl is None:
continue
if vllm_config.speculative_config is not None and interleave_size > 1:
assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, (
"MTP with cp_kv_cache_interleave_size > 1 is not "
f"supported in {layer_impl.__class__.__name__}."
)
if dcp_size > 1:
assert layer_impl.need_to_return_lse_for_decode, (
"DCP requires attention impls to return"
" the softmax lse for decode, but the impl "
f"{layer_impl.__class__.__name__} "
"does not return the softmax lse for decode."
)

if pcp_size > 1:
assert layer_impl.supports_pcp, (
"PCP requires attention impls' support, "
f"but the impl {layer_impl.__class__.__name__} "
"does not support PCP."
)
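
For context, the check above reads the capability flags as plain attributes on each layer's impl; the class below is only an illustration of what an attention implementation would need to declare to pass it (real backends may expose these flags differently).

class IllustrativeAttentionImpl:
    # Flag names consumed by check_attention_cp_compatibility above.
    supports_pcp = True
    need_to_return_lse_for_decode = True
    supports_mtp_with_cp_non_trivial_interleave_size = False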

+ 4
- 14
vllm/v1/worker/gpu_model_runner.py View File

@@ -148,6 +148,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
from vllm.v1.structured_output.utils import apply_grammar_bitmask
from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
from vllm.v1.worker.cp_utils import check_attention_cp_compatibility
from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -4736,6 +4737,9 @@ class GPUModelRunner(
attention_backend_list, kv_cache_config.kv_cache_groups
)

# Check if attention backend supports PCP&DCP and related features.
check_attention_cp_compatibility(self.vllm_config)

for i, attn_backend_map in enumerate(attention_backend_maps):
self.attn_groups.append(create_attn_groups(attn_backend_map, i))

@@ -5394,20 +5398,6 @@ class GPUModelRunner(
kv_transfer_group.register_kv_caches(kv_caches)
kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)

if self.dcp_world_size > 1:
layer_type = cast(type[Any], AttentionLayerBase)
layers = get_layers_from_vllm_config(self.vllm_config, layer_type)
for layer in layers.values():
layer_impl = getattr(layer, "impl", None)
if layer_impl is None:
continue
assert layer_impl.need_to_return_lse_for_decode, (
"DCP requires attention impls to return"
" the softmax lse for decode, but the impl "
f"{layer_impl.__class__.__name__} "
"does not return the softmax lse for decode."
)

def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
"""
Add encoder-only layers to the KV cache config.


+ 3
- 10
vllm/v1/worker/kv_connector_model_runner_mixin.py View File

@@ -22,7 +22,6 @@ from vllm.distributed.kv_transfer import (
has_kv_transfer_group,
)
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.logger import init_logger
from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig
@@ -138,16 +137,10 @@ class KVConnectorModelRunnerMixin:
)
output.invalid_block_ids = kv_connector.get_block_ids_with_load_errors()

output.kv_connector_stats = (
KVConnectorModelRunnerMixin.get_kv_connector_stats()
)
kv_connector.clear_connector_metadata()
output.kv_connector_stats = kv_connector.get_kv_connector_stats()
output.kv_cache_events = kv_connector.get_kv_connector_kv_cache_events()

@staticmethod
def get_kv_connector_stats() -> KVConnectorStats | None:
if has_kv_transfer_group():
return get_kv_transfer_group().get_kv_connector_stats()
return None
kv_connector.clear_connector_metadata()

@staticmethod
def use_uniform_kv_cache(

