@@ -26,6 +26,7 @@ from transformers.testing_utils import (
    require_torch_large_accelerator,
    require_triton,
    slow,
    torch_device,
)
from transformers.utils import (
    is_torch_available,
@@ -240,39 +241,6 @@ class Mxfp4QuantizerTest(unittest.TestCase):
        result_dtype = quantizer.update_dtype(torch.float32)
        self.assertEqual(result_dtype, torch.float32)

    def test_get_param_name_dequantize(self):
        """Test parameter name updating when dequantizing"""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config(dequantize=True)
        quantizer = Mxfp4HfQuantizer(config)

        # Should remove _blocks suffix
        param_name = "model.layers.0.mlp.experts.gate_up_proj_blocks"
        updated_name = quantizer.get_param_name(param_name)
        self.assertEqual(updated_name, "model.layers.0.mlp.experts.gate_up_proj")

        # Should remove _scales suffix
        param_name = "model.layers.0.mlp.experts.down_proj_scales"
        updated_name = quantizer.get_param_name(param_name)
        self.assertEqual(updated_name, "model.layers.0.mlp.experts.down_proj")

        # Should not change other names
        param_name = "model.embed_tokens.weight"
        updated_name = quantizer.get_param_name(param_name)
        self.assertEqual(updated_name, "model.embed_tokens.weight")

    def test_get_param_name_no_dequantize(self):
        """Test parameter name updating when not dequantizing"""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config(dequantize=False)
        quantizer = Mxfp4HfQuantizer(config)

        param_name = "model.layers.0.mlp.experts.gate_up_proj_blocks"
        updated_name = quantizer.get_param_name(param_name)
        self.assertEqual(updated_name, param_name)

    def test_is_trainable(self):
        """Test trainability"""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
@@ -501,3 +469,55 @@ class Mxfp4ModelTest(unittest.TestCase):
            device_map="auto",
        )
        self.check_inference_correctness_quantized(loaded_model, tokenizer)

    def test_compute_module_sizes(self):
r""" |
|
|
|
Test if we compute the right module sizes needed to generate the device map. |
|
|
|
Also test if we get the right values for `total_byte_count` in `caching_allocator_warmup`. |
|
|
|
Note that `compute_module_sizes` is being used in `get_total_byte_count` |
|
|
|
""" |
|
|
|
        from transformers import AutoConfig, AutoModelForCausalLM
        from transformers.integrations import Mxfp4GptOssExperts
        from transformers.integrations.accelerate import compute_module_sizes
        from transformers.modeling_utils import expand_device_map, get_total_byte_count
        from transformers.quantizers import AutoHfQuantizer

        # we need to preprocess the model like this because the device_map calculation happens before we load the weights into the model.
        # For normal weights it's fine, but for quantized weights the tensor dtypes might change during loading.
        with torch.device("meta"):
            config = AutoConfig.from_pretrained(self.model_name)
            model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16)
        model_size, _ = compute_module_sizes(model, only_modules=False)

        expected_keys = [name for name, _ in model.named_parameters()] + [
            name for name, _ in model.named_buffers()
        ]
        expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
        total_byte_count = list(get_total_byte_count(model, expanded_device_map).values())[0]

        # testing pre_quantized=False should be enough, the shapes are the same whether the model is pre-quantized or not
        hf_quantizer = AutoHfQuantizer.from_config(Mxfp4Config(), pre_quantized=False)
        hf_quantizer.preprocess_model(model=model, config=model.config)
        quantized_model_size, _ = compute_module_sizes(model, hf_quantizer, only_modules=False)

        expected_keys = [name for name, _ in model.named_parameters()] + [
            name for name, _ in model.named_buffers()
        ]
        expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
        quantized_total_byte_count = list(get_total_byte_count(model, expanded_device_map, hf_quantizer).values())[
            0
        ]
        for name, module in model.named_modules():
            if isinstance(module, Mxfp4GptOssExperts):
                # from 16 bits to 4 bits
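                # (i.e. bf16 stores 16 bits per weight while MXFP4 packs 4 bits per weight, so the
                # quantized expert projections should come out at roughly a quarter of the original size)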
                assert int(model_size[f"{name}.gate_up_proj"] // 4) == int(
                    quantized_model_size[f"{name}.gate_up_proj"]
                )
                assert int(model_size[f"{name}.down_proj"] // 4) == int(quantized_model_size[f"{name}.down_proj"])

        # check that we get the same value, as we use `compute_module_sizes` in `get_total_byte_count`
        assert total_byte_count == model_size[""]
        assert quantized_total_byte_count == quantized_model_size[""]

        # we should see at least a 3x memory reduction in total for this model
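        # (less than a full 4x, presumably because only the expert weights are quantized while embeddings,
        # attention and norm weights stay in bf16, and the mxfp4 scales add some overhead)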
        assert model_size[""] > quantized_model_size[""] * 3