!7851 【master】修改qk_clip_threshold默认值为100

Merge pull request !7851 from JavaZero/reset_default_qk_clip_threshold
!7833 【master】添加Muon优化器的ut
--- a/mindformers/core/optim/muon.py
+++ b/mindformers/core/optim/muon.py
@@ -273,7 +273,7 @@ class Muon(Optimizer):
        adamw_betas=(0.95, 0.95),
        adamw_eps=1e-8,
        micro_batch_num=1,
        qk_clip_threshold=4,
        qk_clip_threshold=100,
        model=None,
        **kwargs,
    ):
--- a/tests/st/test_optim/optimizer_util.py
+++ b/tests/st/test_optim/optimizer_util.py
@@ -22,6 +22,7 @@ from mindspore import nn, Tensor
 from mindspore.ops import operations as P

 from mindformers.core.optim import build_optim
 from mindformers.core.optim.muon import Muon

 np.random.seed(1024)

@@ -58,7 +59,7 @@ class NetWithLoss(nn.Cell):
    """

    def __init__(self, network, loss_fn):
        super(NetWithLoss, self).__init__()
        super().__init__()
        self.network = network
        self.loss = loss_fn

@@ -74,7 +75,7 @@ class FakeNet(nn.Cell):
    """

    def __init__(self):
        super(FakeNet, self).__init__()
        super().__init__()
        self.fc1 = nn.Dense(in_channels=8, out_channels=4, weight_init=Tensor(fc1_weight), bias_init=Tensor(fc1_bias))
        self.fc2 = nn.Dense(in_channels=4, out_channels=1, weight_init=Tensor(fc2_weight), bias_init=Tensor(fc2_bias))
        self.relu = nn.ReLU()
@@ -155,3 +156,100 @@ default_fc1_weight_adamw_v = (
 default_fc2_weight_adamw_v = (
    np.array([[35.217834, 42.283375, 26.52298, 21.510029]], dtype=np.float32)
 )


 class MockTransformerConfig:
    """Minimal transformer-config stand-in used by the Muon optimizer tests.

    Exposes three attributes set in ``__init__``: ``multi_latent_attention``
    is True and both parallel sizes are 1.
    """
    def __init__(self):
        self.multi_latent_attention = True
        # Both parallel sizes share the same value.
        self.tensor_model_parallel_size = self.data_parallel_size = 1


 class MockModel:
    """
    Lightweight double for the model object handed to the Muon optimizer.

    It owns a ``MockTransformerConfig`` and answers every query with a fixed,
    trivial value so the optimizer can be exercised without a real model.
    NOTE(review): the MockModel in run_muon.py additionally defines
    ``apply_qk_clip_scaling`` - confirm whether Muon needs it here as well.
    """
    def __init__(self):
        self.config = MockTransformerConfig()

    def get_gpt_transformer_config(self):
        """Hand back the config object created in ``__init__``."""
        return self.config

    def make_model_muon_fns(self):
        """Build the (split, merge) function pair; both are pass-throughs."""
        def muon_split_fn(param_name, tensor):  # pylint: disable=unused-argument
            """Wrap the tensor, unmodified, in a one-element list."""
            pieces = [tensor]
            return pieces

        def muon_merge_fn(param_name, tensor_list):  # pylint: disable=unused-argument
            """Pick element 0 of ``tensor_list``; any other entries are ignored."""
            head = tensor_list[0]
            return head

        fn_pair = (muon_split_fn, muon_merge_fn)
        return fn_pair

    def get_param_layer_indices(self, params):
        """Map the name of every parameter to layer index 0."""
        return dict.fromkeys((item.name for item in params), 0)

    def get_muon_filter(self):
        """Return the predicate that decides which parameters use Muon."""
        def muon_filter(param):
            # Only parameters with a 2-entry shape qualify ...
            if len(param.shape) != 2:
                return False
            # ... and only when their name does not contain 'bias'.
            return 'bias' not in param.name
        return muon_filter

    def get_tp_dims(self, params):
        """Return a tuple holding -1 once per entry of ``params``."""
        dims = [-1 for _ in params]
        return tuple(dims)

    def get_op_groups_info(self, params, op):  # pylint: disable=unused-argument
        """Return ``(ops, op_groups)``: a tuple of 1s and a tuple of empty strings."""
        ones = tuple(1 for _ in params)
        blank_groups = tuple("" for _ in params)
        return ones, blank_groups


 def build_muon_network(net, mock_model, learning_rate=0.02):
    """
    Train ``net`` for 20 steps with a Muon optimizer and collect the losses.

    Args:
        net: Network to train; it is wrapped with a mean-reduced L1 loss
        mock_model: Object passed to Muon as its ``model`` argument
        learning_rate: Learning rate forwarded to the optimizer

    Returns:
        tuple: (losses, optimizer), where losses is a numpy array with one
        entry per training step
    """
    wrapped_net = NetWithLoss(net, nn.L1Loss(reduction='mean'))
    wrapped_net.set_train()

    # Every hyper-parameter except the learning rate is pinned in this helper.
    muon_kwargs = {
        "params": wrapped_net.trainable_params(),
        "learning_rate": learning_rate,
        "weight_decay": 0.1,
        "matched_adamw_rms": 0.2,
        "momentum": 0.95,
        "nesterov": True,
        "adamw_betas": (0.95, 0.95),
        "adamw_eps": 1e-8,
        "model": mock_model,
    }
    optimizer = Muon(**muon_kwargs)

    train_step = mindspore.nn.TrainOneStepCell(wrapped_net, optimizer)

    data, label = make_fake_data()
    # One optimizer step per sample; each loss is converted to numpy right away.
    losses = [train_step(data[step], label[step]).asnumpy() for step in range(20)]

    return np.array(losses), optimizer
--- a/tests/st/test_optim/test_muon/__init__.py
+++ b/tests/st/test_optim/test_muon/__init__.py
@@ -0,0 +1,15 @@
 # Copyright 2025 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """test muon optimizer."""
--- a/tests/st/test_optim/test_muon/data_utils.py
+++ b/tests/st/test_optim/test_muon/data_utils.py
@@ -0,0 +1,63 @@
 # Copyright 2025 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """
 Baseline data for Muon optimizer tests.
 """
 import numpy as np

 # Default tolerance for loss comparison
 DEFAULT_RTOL = 1e-4
 DEFAULT_ATOL = 1e-4

 # Baseline losses for single card test cases (20 entries each, one per step)
 # learning_rate=0.02, weight_decay=0.1, momentum=0.95, nesterov=True
 BASELINE_LOSSES_NESTEROV_TRUE = np.array([
    0.3881023, 7.8122883, 15.039654, 22.062939, 28.884716,
    35.514862, 41.940598, 48.178577, 54.222153, 60.07846,
    65.739815, 71.20518, 76.508705, 81.63688, 86.58084,
    91.356064, 95.94581, 100.37069, 104.620384, 108.72005
 ], dtype=np.float32)

 # learning_rate=0.02, weight_decay=0.1, momentum=0.95, nesterov=False
 BASELINE_LOSSES_NESTEROV_FALSE = np.array([
    0.3881023, 7.8122883, 15.032751, 22.052126, 28.875042,
    35.503002, 41.92948, 48.16231, 54.218227, 60.07244,
    65.745224, 71.22119, 76.5374, 81.64788, 86.525246,
    91.292816, 95.89634, 100.308716, 104.57111, 108.64668
 ], dtype=np.float32)

 # learning_rate=0.01, weight_decay=0.05, momentum=0.9, nesterov=True
 BASELINE_LOSSES_DIFF_LR = np.array([
    0.3881023, 7.8966713, 15.322964, 22.66404, 29.917278,
    37.085056, 44.168663, 51.175865, 58.094597, 64.92998,
    71.680595, 78.34835, 84.92714, 91.44285, 97.866035,
    104.204056, 110.46475, 116.63603, 122.729706, 128.74644
 ], dtype=np.float32)


 def compare_losses(actual_losses, expected_losses, rtol=DEFAULT_RTOL, atol=DEFAULT_ATOL):
    """
    Compare actual losses with expected baseline losses.

    The two inputs must have exactly the same shape. Without that check
    ``np.allclose`` broadcasts its operands, so a result of a different
    length could either be compared against the wrong entries or raise
    ``ValueError`` instead of producing the documented boolean.

    Args:
        actual_losses (np.ndarray): Actual losses from the test run
        expected_losses (np.ndarray): Expected baseline losses
        rtol (float): Relative tolerance for comparison
        atol (float): Absolute tolerance for comparison

    Returns:
        bool: True if the shapes are equal and the losses match within
        tolerance, False otherwise
    """
    actual = np.asarray(actual_losses)
    expected = np.asarray(expected_losses)
    # Reject mismatched shapes up front rather than letting numpy broadcast.
    if actual.shape != expected.shape:
        return False
    return bool(np.allclose(actual, expected, rtol=rtol, atol=atol))
--- a/tests/st/test_optim/test_muon/run_muon.py
+++ b/tests/st/test_optim/test_muon/run_muon.py
@@ -0,0 +1,236 @@
 # Copyright 2025 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """Run Muon optimizer accuracy test with configurable parameters via args"""
 import argparse
 import numpy as np
 import mindspore as ms
 from mindspore import nn, Tensor

 from mindformers.core.context.build_context import build_context
 from mindformers.core.optim.muon import Muon

 # Seed numpy's global random generator at import time.
 np.random.seed(1024)

 # Test weight initialization - same as optimizer_util.py
 # FC1_WEIGHT is written as 4 rows of 8 values; FakeNet passes it as the
 # weight_init of its first Dense layer (in_channels=8, out_channels=4).
 FC1_WEIGHT = np.array([[0.72346634, 0.95608497, 0.4084163, 0.18627149,
                        0.6942514, 0.39767185, 0.24918061, 0.4548748],
                       [0.7203382, 0.19086994, 0.76286614, 0.87920564,
                        0.3169892, 0.9462494, 0.62827677, 0.27504718],
                       [0.3544535, 0.2524781, 0.5370583, 0.8313121,
                        0.6670143, 0.0488653, 0.62225235, 0.7546456],
                       [0.17985944, 0.05106374, 0.31064633, 0.4863033,
                        0.848814, 0.5523157, 0.20295663, 0.7213356]]).astype("float32")

 # Four bias values used as bias_init of the first Dense layer.
 FC1_BIAS = np.array([0.79708564, 0.13728078, 0.66322654, 0.88128525]).astype("float32")

 # One row of 4 values: weight_init of the second Dense layer (in_channels=4, out_channels=1).
 FC2_WEIGHT = np.array([[0.8473515, 0.50923985, 0.42287776, 0.29769543]]).astype("float32")

 # Single bias value used as bias_init of the second Dense layer.
 FC2_BIAS = np.array([0.09996348]).astype("float32")


 class MockTransformerConfig:
    """Minimal transformer-config stand-in used when testing the Muon optimizer.

    Exposes three attributes set in ``__init__``: ``multi_latent_attention``
    is True and both parallel sizes are 1.
    """
    def __init__(self):
        self.multi_latent_attention = True
        # Both parallel sizes share the same value.
        self.tensor_model_parallel_size = self.data_parallel_size = 1


 class MockModel:
    """
    Lightweight double for the model object handed to the Muon optimizer.

    It owns a ``MockTransformerConfig`` and answers every query with a fixed,
    trivial value so the optimizer can be exercised without a real model.
    """
    def __init__(self):
        self.config = MockTransformerConfig()

    def get_gpt_transformer_config(self):
        """Hand back the config object created in ``__init__``."""
        return self.config

    def make_model_muon_fns(self):
        """Build the (split, merge) function pair; both are pass-throughs."""
        def muon_split_fn(param_name, tensor):  # pylint: disable=unused-argument
            """Wrap the tensor, unmodified, in a one-element list."""
            pieces = [tensor]
            return pieces

        def muon_merge_fn(param_name, tensor_list):  # pylint: disable=unused-argument
            """Pick element 0 of ``tensor_list``; any other entries are ignored."""
            head = tensor_list[0]
            return head

        fn_pair = (muon_split_fn, muon_merge_fn)
        return fn_pair

    # pylint: disable=unused-argument
    def apply_qk_clip_scaling(self, params, param_names, param_layer, logit_threshold,
                               muon_split_fn, muon_merge_fn):
        """Stub: return one ``(0, params[0])`` pair; every other argument is ignored."""
        first_param = params[0]
        return [(0, first_param)]

    def get_param_layer_indices(self, params):
        """Map the name of every parameter to layer index 0."""
        return dict.fromkeys((item.name for item in params), 0)

    def get_muon_filter(self):
        """Return the predicate that decides which parameters use Muon."""
        def muon_filter(param):
            # Only parameters with a 2-entry shape qualify ...
            if len(param.shape) != 2:
                return False
            # ... and only when their name does not contain 'bias'.
            return 'bias' not in param.name
        return muon_filter

    def get_tp_dims(self, params):
        """Return a tuple holding -1 once per entry of ``params``."""
        dims = [-1 for _ in params]
        return tuple(dims)

    def get_op_groups_info(self, params, op):  # pylint: disable=unused-argument
        """Return ``(ops, op_groups)``: a tuple of 1s and a tuple of empty strings."""
        ones = tuple(1 for _ in params)
        blank_groups = tuple("" for _ in params)
        return ones, blank_groups


 class FakeNet(nn.Cell):
    """Small test network: Dense(8 -> 4), ReLU, Dense(4 -> 1), with fixed initial values."""

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before: fc1, fc2, relu.
        fc1_init = {"weight_init": Tensor(FC1_WEIGHT), "bias_init": Tensor(FC1_BIAS)}
        self.fc1 = nn.Dense(in_channels=8, out_channels=4, **fc1_init)
        fc2_init = {"weight_init": Tensor(FC2_WEIGHT), "bias_init": Tensor(FC2_BIAS)}
        self.fc2 = nn.Dense(in_channels=4, out_channels=1, **fc2_init)
        self.relu = nn.ReLU()

    def construct(self, x):
        """Apply fc1, ReLU and fc2 in sequence and return the result."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)


 class NetWithLoss(nn.Cell):
    """Cell that runs a network and feeds its output into a loss function."""

    def __init__(self, network, loss_fn):
        super().__init__()
        self.network, self.loss = network, loss_fn

    def construct(self, x, label):
        """Return ``loss(network(x), label)``."""
        prediction = self.network(x)
        return self.loss(prediction, label)


 def make_fake_data(num_samples=20):
    """
    Make fake data for testing.

    Sample ``i`` is a (2, 8) float32 tensor filled with ``i``; its label is a
    (2, 1) float32 tensor filled with ``i + 1``.

    Args:
        num_samples (int): Number of (data, label) pairs to generate.
            Defaults to 20.

    Returns:
        tuple: (data, label), two lists holding ``num_samples`` tensors each.
    """
    data, label = [], []
    for i in range(num_samples):
        data.append(ms.Tensor(np.full((2, 8), i, dtype=np.float32)))
        label.append(ms.Tensor(np.full((2, 1), i + 1, dtype=np.float32)))
    return data, label


 class MuonRunner:
    """Class to manage Muon optimizer test and training."""

    def __init__(self, args_from_parser):
        """
        Store the parsed arguments and copy out the values used by the runner.

        Args:
            args_from_parser: Namespace providing learning_rate, weight_decay,
                momentum, nesterov, num_steps and output_path.
        """
        self.args = args_from_parser
        self.learning_rate = self.args.learning_rate
        self.weight_decay = self.args.weight_decay
        self.momentum = self.args.momentum
        self.nesterov = self.args.nesterov
        self.num_steps = self.args.num_steps

    def build_network(self):
        """
        Build network with Muon optimizer.

        Returns:
            tuple: (network with L1 loss, Muon optimizer, mock model).
        """
        net = FakeNet()
        mock_model = MockModel()

        loss_fn = nn.L1Loss(reduction='mean')
        networkwithloss = NetWithLoss(net, loss_fn)
        networkwithloss.set_train()

        params = networkwithloss.trainable_params()

        # Create Muon optimizer; values not taken from the CLI are fixed here
        optimizer = Muon(
            params=params,
            learning_rate=self.learning_rate,
            weight_decay=self.weight_decay,
            matched_adamw_rms=0.2,
            momentum=self.momentum,
            nesterov=self.nesterov,
            adamw_betas=(0.95, 0.95),
            adamw_eps=1e-8,
            model=mock_model,
        )

        return networkwithloss, optimizer, mock_model

    def run(self):
        """
        Run the training with Muon optimizer.

        Trains for ``num_steps`` steps, then writes the losses, the lengths of
        the optimizer's ``muon_m``/``moments1``/``moments2`` collections and the
        ``muon_m`` value of every parameter accepted by the Muon filter to
        ``args.output_path`` via ``np.savez``.
        """
        networkwithloss, optimizer, mock_model = self.build_network()
        trainonestepcell = nn.TrainOneStepCell(networkwithloss, optimizer)

        # Generate exactly num_steps samples: indexing a fixed list of 20
        # samples raised IndexError whenever --num_steps exceeded 20.
        data, label = make_fake_data(self.num_steps)
        losses = [trainonestepcell(sample, target).asnumpy()
                  for sample, target in zip(data, label)]

        # Save results
        output_dict = {
            "losses": np.array(losses),
            "num_muon_m": len(optimizer.muon_m),
            "num_moments1": len(optimizer.moments1),
            "num_moments2": len(optimizer.moments2),
        }

        # Save muon momentum values for weight parameters
        muon_filter = mock_model.get_muon_filter()
        # pylint: disable=protected-access
        for idx, param in enumerate(optimizer._parameters):
            if muon_filter(param):
                muon_m_value = optimizer.muon_m[idx].asnumpy()
                output_dict[f"muon_m_{idx}"] = muon_m_value

        np.savez(self.args.output_path, **output_dict)
        print(f"Results saved to {self.args.output_path}")


 def main():
    """Parse the command-line options, configure MindSpore and run the Muon training."""
    cli = argparse.ArgumentParser(description="Run Muon optimizer test")
    # (flag, converter, default) for every supported option, in declaration order
    option_table = (
        ("--learning_rate", float, 0.02),
        ("--weight_decay", float, 0.1),
        ("--momentum", float, 0.95),
        # Only the text "true" (in any letter case) parses as True
        ("--nesterov", lambda x: x.lower() == "true", True),
        ("--num_steps", int, 20),
        ("--output_path", str, "output_muon.npz"),
    )
    for flag, converter, fallback in option_table:
        cli.add_argument(flag, type=converter, default=fallback)

    args = cli.parse_args()

    # Set context
    build_context({"use_legacy": False, "use_parallel": True})
    ms.set_deterministic(True)
    ms.set_context(mode=ms.GRAPH_MODE)
    ms.set_seed(42)

    # Run training
    MuonRunner(args).run()


 if __name__ == "__main__":
    main()
--- a/tests/st/test_optim/test_muon/test_muon.py
+++ b/tests/st/test_optim/test_muon/test_muon.py
@@ -0,0 +1,202 @@
 # Copyright 2025 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """
 Test module for testing the Muon optimizer interface used for MindFormers.
 How to run this:
 pytest tests/st/test_optim/test_muon/test_muon.py
 """
 from pathlib import Path
 import subprocess
 import pytest
 import numpy as np

 from tests.st.test_optim.test_muon.data_utils import (
    BASELINE_LOSSES_NESTEROV_TRUE,
    BASELINE_LOSSES_NESTEROV_FALSE,
    BASELINE_LOSSES_DIFF_LR,
    compare_losses,
    DEFAULT_RTOL,
    DEFAULT_ATOL,
 )

 from mindformers.tools.logger import logger

 # Test parameters definition
 # Each entry holds the optimizer settings for one run plus the baseline
 # loss array the run is compared against.
 SINGLE_CARD_TEST_CASES = [
    # Default config with nesterov=True
    {
        "learning_rate": 0.02,
        "weight_decay": 0.1,
        "momentum": 0.95,
        "nesterov": True,
        "num_steps": 20,
        "baseline_losses": BASELINE_LOSSES_NESTEROV_TRUE,
    },
    # Config without Nesterov momentum (all other values as in the first case)
    {
        "learning_rate": 0.02,
        "weight_decay": 0.1,
        "momentum": 0.95,
        "nesterov": False,
        "num_steps": 20,
        "baseline_losses": BASELINE_LOSSES_NESTEROV_FALSE,
    },
    # Config with different learning rate, weight decay and momentum
    {
        "learning_rate": 0.01,
        "weight_decay": 0.05,
        "momentum": 0.9,
        "nesterov": True,
        "num_steps": 20,
        "baseline_losses": BASELINE_LOSSES_DIFF_LR,
    },
 ]


 def build_msrun_command_list(
        worker_num,
        local_worker_num,
        log_dir,
        run_script_path,
        learning_rate,
        weight_decay,
        momentum,
        nesterov,
        num_steps,
        output_path,
        port=29500
    ):
    """
    Assemble the ``msrun`` argument list for one Muon run and log it.

    The list starts with the launcher options, followed by the script path
    and the script's own options. ``nesterov`` is passed as the lower-cased
    text of its value.

    Returns:
        list: Command arguments as strings.
    """
    launcher_part = [
        "msrun",
        f"--worker_num={worker_num}",
        f"--local_worker_num={local_worker_num}",
        f"--master_port={port}",
        f"--log_dir={log_dir}",
        "--join=True",
    ]
    script_part = [
        str(run_script_path),
        f"--learning_rate={learning_rate}",
        f"--weight_decay={weight_decay}",
        f"--momentum={momentum}",
        f"--nesterov={str(nesterov).lower()}",
        f"--num_steps={num_steps}",
        f"--output_path={output_path}",
    ]
    cmd_list = launcher_part + script_part
    shell_line = ' '.join(cmd_list)
    logger.info(f"Equivalent shell command for Muon test: {shell_line}")
    return cmd_list


 class TestMuon:
    """Test class for Muon optimizer with different configurations."""
    OUTPUT_FILENAME = "output_muon.npz"
    LOG_DIR_NAME = "msrun_log"

    def setup_method(self):
        """Setup method to prepare test environment."""
        self.sh_path = Path(__file__).parent.resolve()
        self.run_script_path = self.sh_path / "run_muon.py"

    def check_results(self, output_dict, baseline_losses=None):
        """
        Check the output results from the Muon optimizer run.

        Args:
            output_dict: Mapping containing the output results; its "losses"
                entry is validated
            baseline_losses: Expected baseline losses for comparison, or None
                to skip the baseline comparison
        """
        # Check losses
        losses = output_dict.get("losses")
        assert losses is not None, "Losses not found in output"
        assert len(losses) > 0, "Losses array is empty"
        assert not np.any(np.isnan(losses)), "Losses contain NaN values"
        assert not np.any(np.isinf(losses)), "Losses contain Inf values"

        # Compare with baseline if provided
        if baseline_losses is not None:
            # Fail with a readable message instead of a broadcasting error
            # when the run produced a different number of losses.
            assert np.shape(losses) == np.shape(baseline_losses), (
                f"Losses shape {np.shape(losses)} does not match "
                f"baseline shape {np.shape(baseline_losses)}"
            )
            assert compare_losses(losses, baseline_losses, rtol=DEFAULT_RTOL, atol=DEFAULT_ATOL), (
                f"Losses do not match baseline.\n"
                f"Actual: {losses}\n"
                f"Expected: {baseline_losses}\n"
                f"Max diff: {np.max(np.abs(losses - baseline_losses))}"
            )

    def run_test(
            self,
            worker_num,
            local_worker_num,
            optimizer_args,
            tmp_path,
            port=29500,
            baseline_losses=None
        ):
        """
        Helper function to run test and check results.

        Launches run_muon.py through msrun, asserts that it exits with code 0
        and wrote its output file, then validates the stored losses.

        Returns:
            dict: Arrays stored in the output file, keyed by name.
        """
        output_file_path = tmp_path / self.OUTPUT_FILENAME
        log_dir_path = tmp_path / self.LOG_DIR_NAME
        log_dir_path.mkdir(parents=True, exist_ok=True)

        cmd_list = build_msrun_command_list(
            worker_num=worker_num,
            local_worker_num=local_worker_num,
            log_dir=log_dir_path,
            run_script_path=self.run_script_path,
            learning_rate=optimizer_args["learning_rate"],
            weight_decay=optimizer_args["weight_decay"],
            momentum=optimizer_args["momentum"],
            nesterov=optimizer_args["nesterov"],
            num_steps=optimizer_args["num_steps"],
            output_path=output_file_path,
            port=port
        )

        result = subprocess.run(
            cmd_list, shell=False, capture_output=True, text=True, check=False
        )

        assert result.returncode == 0, (
            f"Test script failed with non-zero exit code: "
            f"{result.returncode}.\nStdout:\n{result.stdout}\nStderr:\n{result.stderr}"
        )
        assert output_file_path.exists(), (
            f"Output file {output_file_path} was not created."
        )

        # Read every array eagerly and close the archive: a plain dict always
        # supports .get() and no file handle stays open after the test.
        with np.load(output_file_path) as npz_file:
            output_dict = {key: npz_file[key] for key in npz_file.files}
        self.check_results(output_dict, baseline_losses=baseline_losses)

        return output_dict


@pytest.mark.level0
@pytest.mark.platform_arm_ascend910b_training
@pytest.mark.env_onecard
 class TestMuonSingleCard(TestMuon):
    """Test class for Muon optimizer with single card configurations."""

    @pytest.mark.parametrize("optimizer_args", SINGLE_CARD_TEST_CASES)
    def test_muon_single_card(self, optimizer_args, tmp_path):
        """
        Feature: Muon optimizer training
        Description: Test computation of Muon optimizer with various configurations.
        Expectation: Training completes successfully with valid losses matching baseline
        """
        baseline_losses = optimizer_args.get("baseline_losses")
        self.run_test(
            worker_num=1,
            local_worker_num=1,
            optimizer_args=optimizer_args,
            tmp_path=tmp_path,
            baseline_losses=baseline_losses
        )
--- a/tests/st/test_ut/base_schema.json
+++ b/tests/st/test_ut/base_schema.json
@@ -1062,7 +1062,7 @@
        "signature": "(use_fused)"
    },
    "mindformers.core.optim.Muon": {
        "signature": "(params, learning_rate=0.02, weight_decay=0.1, matched_adamw_rms=0.2, momentum=0.95, nesterov=True, ns_steps=5, adamw_betas=(0.95, 0.95), adamw_eps=1e-08, micro_batch_num=1, qk_clip_threshold=4, model=None, **kwargs)"
        "signature": "(params, learning_rate=0.02, weight_decay=0.1, matched_adamw_rms=0.2, momentum=0.95, nesterov=True, ns_steps=5, adamw_betas=(0.95, 0.95), adamw_eps=1e-08, micro_batch_num=1, qk_clip_threshold=100, model=None, **kwargs)"
    },
    "mindformers.core.optim.Muon._verify_model": {
        "signature": "(self, model)"
Author	SHA1	Message	Date
i-robot	517802fc69	!7851 【master】修改qk_clip_threshold默认值为100 Merge pull request !7851 from JavaZero/reset_default_qk_clip_threshold	6 days ago
i-robot	567b89272d	!7833 【master】添加Muon优化器的ut Merge pull request !7833 from JavaZero/test_muon	6 days ago
JavaZero	1c65f2afff	update: increase default qk_clip_threshold from 4 to 100	6 days ago
JavaZero	7bba670f5c	test: add unit tests for Muon optimizer initialization and computation test: add baseline data and comparison for Muon optimizer losses	1 week ago