2 Commits

Author        SHA1         Message     Date
yewentao256   60906cddc0   fix tests   4 days ago
yewentao256   6c9552b4a6   fix layer   4 days ago
11 changed files with 18 additions and 22 deletions
  1. tests/kernels/moe/modular_kernel_tools/common.py (+3, -1)
  2. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py (+1, -2)
  3. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py (+1, -1)
  4. vllm/model_executor/layers/quantization/bitsandbytes.py (+1, -1)
  5. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py (+5, -5)
  6. vllm/model_executor/layers/quantization/experts_int8.py (+1, -1)
  7. vllm/model_executor/layers/quantization/fp8.py (+1, -1)
  8. vllm/model_executor/layers/quantization/modelopt.py (+1, -1)
  9. vllm/model_executor/layers/quantization/moe_wna16.py (+1, -1)
  10. vllm/model_executor/layers/quantization/quark/quark_moe.py (+2, -2)
  11. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py (+1, -6)

tests/kernels/moe/modular_kernel_tools/common.py (+3, -1)

@@ -594,7 +594,9 @@ def make_modular_kernel(
     )
 
     modular_kernel = mk.FusedMoEModularKernel(
-        prepare_finalize=prepare_finalize, fused_experts=fused_experts
+        prepare_finalize=prepare_finalize,
+        fused_experts=fused_experts,
+        moe_parallel_config=moe_parallel_config,
     )
 
     return modular_kernel

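For orientation, here is a minimal plain-Python sketch of the pattern the updated test now exercises: the parallel config is passed explicitly as a keyword argument and checked at construction time rather than being resolved lazily inside the kernel. The names below (ModularKernelSketch, ParallelConfigSketch) are hypothetical stand-ins, not vLLM's actual FusedMoEModularKernel.

```python
from dataclasses import dataclass


@dataclass
class ParallelConfigSketch:
    # Hypothetical stand-in for the parallel config a MoE layer carries.
    ep_size: int = 1
    tp_size: int = 1


class ModularKernelSketch:
    """Toy analogue of a modular MoE kernel that requires its config up front."""

    def __init__(self, prepare_finalize, fused_experts, moe_parallel_config):
        if moe_parallel_config is None:
            # Fail at construction time instead of deep inside a forward pass.
            raise ValueError("moe_parallel_config must be provided")
        self.prepare_finalize = prepare_finalize
        self.fused_experts = fused_experts
        self.moe_parallel_config = moe_parallel_config


kernel = ModularKernelSketch(
    prepare_finalize=object(),   # placeholder for a prepare/finalize implementation
    fused_experts=object(),      # placeholder for a fused-experts implementation
    moe_parallel_config=ParallelConfigSketch(ep_size=2),
)
print(kernel.moe_parallel_config)
```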

vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py (+1, -2)

@@ -43,7 +43,6 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
         prepare_finalize: FusedMoEPrepareAndFinalize,
         shared_experts: torch.nn.Module | None,
     ) -> "FusedMoEModularMethod":
-        moe_parallel_config = getattr(moe_layer, "moe_parallel_config", None)
         return FusedMoEModularMethod(
             old_quant_method,
             FusedMoEModularKernel(
@@ -51,7 +50,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
                 old_quant_method.select_gemm_impl(prepare_finalize, moe_layer),
                 shared_experts,
                 getattr(moe_layer, "shared_experts_stream", None),
-                moe_parallel_config=moe_parallel_config,
+                moe_parallel_config=moe_layer.moe_parallel_config,
             ),
         )


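The remaining files all make the same one-line change: getattr(layer, "moe_parallel_config", None) becomes layer.moe_parallel_config. A small plain-Python illustration of the behavioral difference (toy classes, not vLLM objects): the getattr fallback silently degrades a missing config to None, while direct attribute access fails fast at the call site.

```python
class _LayerWithConfig:
    def __init__(self):
        self.moe_parallel_config = {"ep_size": 2, "tp_size": 1}


class _LayerMissingConfig:
    # Deliberately lacks moe_parallel_config to show the two failure modes.
    pass


# Old pattern: a missing attribute quietly becomes None; the error only
# surfaces later, wherever the config is finally dereferenced.
assert getattr(_LayerMissingConfig(), "moe_parallel_config", None) is None

# New pattern: the mistake is reported immediately with a clear AttributeError.
try:
    _LayerMissingConfig().moe_parallel_config
except AttributeError as exc:
    print(f"fails fast: {exc}")

# When the layer is set up correctly, both spellings return the same object.
layer = _LayerWithConfig()
assert getattr(layer, "moe_parallel_config", None) is layer.moe_parallel_config
```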

vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py (+1, -1)

@@ -334,7 +334,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
         if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:


vllm/model_executor/layers/quantization/bitsandbytes.py (+1, -1)

@@ -521,7 +521,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
     def _create_weights_4bit(


vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py (+5, -5)

@@ -1244,7 +1244,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 if self.disable_expert_map
                 else layer.expert_map,  # ???
                 quant_config=self.moe_quant_config,
-                moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+                moe_parallel_config=layer.moe_parallel_config,
             )
         else:
             from vllm.model_executor.layers.fused_moe.cutlass_moe import (
@@ -1267,7 +1267,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 ab_strides2=self.ab_strides2,
                 c_strides1=self.c_strides1,
                 c_strides2=self.ab_strides1_c_strides2,
-                moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+                moe_parallel_config=layer.moe_parallel_config,
             )
 
         else:
@@ -1287,7 +1287,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
     @property
@@ -1426,7 +1426,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
 
@@ -2015,7 +2015,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
     @property


vllm/model_executor/layers/quantization/experts_int8.py (+1, -1)

@@ -159,7 +159,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
     @staticmethod


vllm/model_executor/layers/quantization/fp8.py (+1, -1)

@@ -1376,7 +1376,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             allow_cutlass_block_scaled_grouped_gemm=(
                 self.allow_cutlass_block_scaled_grouped_gemm
             ),
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
         if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:


vllm/model_executor/layers/quantization/modelopt.py (+1, -1)

@@ -768,7 +768,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )




vllm/model_executor/layers/quantization/moe_wna16.py (+1, -1)

@@ -381,7 +381,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
     @staticmethod


vllm/model_executor/layers/quantization/quark/quark_moe.py (+2, -2)

@@ -393,7 +393,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
 
@@ -618,7 +618,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
             expert_map=layer.expert_map,
             quant_config=self.moe_quant_config,
-            moe_parallel_config=getattr(layer, "moe_parallel_config", None),
+            moe_parallel_config=layer.moe_parallel_config,
         )
 
         return out

vllm/model_executor/layers/quantization/utils/flashinfer_utils.py (+1, -6)

@@ -247,11 +247,6 @@ def flashinfer_cutlass_moe_fp8(
     assert quant_config is not None
 
     # Construct modular kernel with block-scale support when requested.
-    moe_parallel_config = getattr(
-        moe if moe is not None else layer,
-        "moe_parallel_config",
-        None,
-    )
     fused_experts = mk.FusedMoEModularKernel(
         build_flashinfer_fp8_cutlass_moe_prepare_finalize(
             moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
@@ -262,7 +257,7 @@ def flashinfer_cutlass_moe_fp8(
             out_dtype=hidden_states.dtype,
             use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
         ),
-        moe_parallel_config=moe_parallel_config,
+        moe_parallel_config=layer.moe_parallel_config,
     )
 
     return fused_experts(

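The flashinfer_utils.py hunk additionally drops the logic that chose which object to read the config from (moe when not None, otherwise layer). A simplified illustration of why a single explicit source is less error-prone, using plain objects rather than vLLM's:

```python
class _Obj:
    pass


layer = _Obj()
layer.moe_parallel_config = "layer-config"

moe = _Obj()  # present but, say, built without a parallel config attached

# Old pattern: because `moe` is not None it wins the fallback, so the lookup
# silently yields None even though `layer` carries a valid config.
old = getattr(moe if moe is not None else layer, "moe_parallel_config", None)
assert old is None

# New pattern: one unambiguous source of truth.
assert layer.moe_parallel_config == "layer-config"
```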
