4 Commits

Author    SHA1        Message                                                                   Date
i-robot   517802fc69  !7851 [master] Change the default value of qk_clip_threshold to 100       6 days ago
i-robot   567b89272d  !7833 [master] Add unit tests for the Muon optimizer                      6 days ago
JavaZero  1c65f2afff  update: increase default qk_clip_threshold from 4 to 100                  6 days ago
JavaZero  7bba670f5c  test: add unit tests for Muon optimizer initialization and computation    1 week ago
7 changed files with 618 additions and 4 deletions
1. mindformers/core/optim/muon.py               +1   -1
2. tests/st/test_optim/optimizer_util.py        +100 -2
3. tests/st/test_optim/test_muon/__init__.py    +15  -0
4. tests/st/test_optim/test_muon/data_utils.py  +63  -0
5. tests/st/test_optim/test_muon/run_muon.py    +236 -0
6. tests/st/test_optim/test_muon/test_muon.py   +202 -0
7. tests/st/test_ut/base_schema.json            +1   -1

mindformers/core/optim/muon.py (+1 -1) View File

@@ -273,7 +273,7 @@ class Muon(Optimizer):
            adamw_betas=(0.95, 0.95),
            adamw_eps=1e-8,
            micro_batch_num=1,
-           qk_clip_threshold=4,
+           qk_clip_threshold=100,
            model=None,
            **kwargs,
    ):
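The only functional change in this file is the new default: qk_clip_threshold now defaults to 100 rather than 4. A minimal usage sketch, where `params` and `model` are placeholders for a real trainable-parameter list and a model exposing the Muon interfaces (the MockModel in the tests below illustrates the expected shape of that interface):

    from mindformers.core.optim.muon import Muon

    # Picks up the new default, qk_clip_threshold=100.
    optimizer = Muon(params=params, model=model)
    # Pass the value explicitly to restore the previous behavior.
    legacy_optimizer = Muon(params=params, model=model, qk_clip_threshold=4)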


tests/st/test_optim/optimizer_util.py (+100 -2) View File

@@ -22,6 +22,7 @@ from mindspore import nn, Tensor
from mindspore.ops import operations as P

from mindformers.core.optim import build_optim
+from mindformers.core.optim.muon import Muon

np.random.seed(1024)

@@ -58,7 +59,7 @@ class NetWithLoss(nn.Cell):
    """

    def __init__(self, network, loss_fn):
-       super(NetWithLoss, self).__init__()
+       super().__init__()
        self.network = network
        self.loss = loss_fn

@@ -74,7 +75,7 @@ class FakeNet(nn.Cell):
    """

    def __init__(self):
-       super(FakeNet, self).__init__()
+       super().__init__()
        self.fc1 = nn.Dense(in_channels=8, out_channels=4, weight_init=Tensor(fc1_weight), bias_init=Tensor(fc1_bias))
        self.fc2 = nn.Dense(in_channels=4, out_channels=1, weight_init=Tensor(fc2_weight), bias_init=Tensor(fc2_bias))
        self.relu = nn.ReLU()
@@ -155,3 +156,100 @@ default_fc1_weight_adamw_v = (
default_fc2_weight_adamw_v = (
    np.array([[35.217834, 42.283375, 26.52298, 21.510029]], dtype=np.float32)
)


class MockTransformerConfig:
    """Mock transformer config for testing Muon optimizer."""
    def __init__(self):
        self.multi_latent_attention = True
        self.tensor_model_parallel_size = 1
        self.data_parallel_size = 1


class MockModel:
    """
    Mock model class that provides required interfaces for Muon optimizer.
    This simulates the model interface that Muon optimizer expects.
    """
    def __init__(self):
        self.config = MockTransformerConfig()

    def get_gpt_transformer_config(self):
        """Return transformer config."""
        return self.config

    def make_model_muon_fns(self):
        """Return muon split and merge functions."""
        def muon_split_fn(param_name, tensor):  # pylint: disable=unused-argument
            """Split function - returns tensor as list."""
            return [tensor]

        def muon_merge_fn(param_name, tensor_list):  # pylint: disable=unused-argument
            """Merge function - returns first tensor."""
            return tensor_list[0]

        return muon_split_fn, muon_merge_fn

    def get_param_layer_indices(self, params):
        """Return layer indices for parameters."""
        return {p.name: 0 for p in params}

    def get_muon_filter(self):
        """Return filter function to determine which params use Muon."""
        def muon_filter(param):
            # Apply Muon to weight parameters with 2D shape (not bias)
            return len(param.shape) == 2 and 'bias' not in param.name
        return muon_filter

    def get_tp_dims(self, params):
        """Return tensor parallel dimensions."""
        return tuple(-1 for _ in params)

    def get_op_groups_info(self, params, op):  # pylint: disable=unused-argument
        """Return optimizer parallel group info."""
        ops = tuple(1 for _ in params)
        op_groups = tuple("" for _ in params)
        return ops, op_groups


def build_muon_network(net, mock_model, learning_rate=0.02):
    """
    Build network with Muon optimizer for testing.

    Args:
        net: The network to train
        mock_model: Mock model providing Muon interface
        learning_rate: Learning rate for optimizer

    Returns:
        tuple: (losses, optimizer)
    """
    loss_fn = nn.L1Loss(reduction='mean')
    networkwithloss = NetWithLoss(net, loss_fn)
    networkwithloss.set_train()

    params = networkwithloss.trainable_params()

    # Create Muon optimizer
    optimizer = Muon(
        params=params,
        learning_rate=learning_rate,
        weight_decay=0.1,
        matched_adamw_rms=0.2,
        momentum=0.95,
        nesterov=True,
        adamw_betas=(0.95, 0.95),
        adamw_eps=1e-8,
        model=mock_model,
    )

    trainonestepcell = mindspore.nn.TrainOneStepCell(networkwithloss, optimizer)

    losses = []
    data, label = make_fake_data()
    for i in range(20):
        loss = trainonestepcell(data[i], label[i])
        losses.append(loss.asnumpy())

    return np.array(losses), optimizer
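For reference, a minimal sketch of how these helpers compose in a test, assuming the FakeNet and make_fake_data defined in this module; build_muon_network trains the two-layer FakeNet for 20 steps and returns the per-step losses together with the optimizer:

    net = FakeNet()
    mock_model = MockModel()
    losses, optimizer = build_muon_network(net, mock_model, learning_rate=0.02)
    assert losses.shape == (20,)       # one scalar loss per training step
    assert not np.isnan(losses).any()  # training stayed numerically stable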

tests/st/test_optim/test_muon/__init__.py (+15 -0) View File

@@ -0,0 +1,15 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""test muon optimizer."""

tests/st/test_optim/test_muon/data_utils.py (+63 -0) View File

@@ -0,0 +1,63 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Baseline data for Muon optimizer tests.
"""
import numpy as np

# Default tolerance for loss comparison
DEFAULT_RTOL = 1e-4
DEFAULT_ATOL = 1e-4

# Baseline losses for single card test cases
# learning_rate=0.02, weight_decay=0.1, momentum=0.95, nesterov=True
BASELINE_LOSSES_NESTEROV_TRUE = np.array([
    0.3881023, 7.8122883, 15.039654, 22.062939, 28.884716,
    35.514862, 41.940598, 48.178577, 54.222153, 60.07846,
    65.739815, 71.20518, 76.508705, 81.63688, 86.58084,
    91.356064, 95.94581, 100.37069, 104.620384, 108.72005
], dtype=np.float32)

# learning_rate=0.02, weight_decay=0.1, momentum=0.95, nesterov=False
BASELINE_LOSSES_NESTEROV_FALSE = np.array([
    0.3881023, 7.8122883, 15.032751, 22.052126, 28.875042,
    35.503002, 41.92948, 48.16231, 54.218227, 60.07244,
    65.745224, 71.22119, 76.5374, 81.64788, 86.525246,
    91.292816, 95.89634, 100.308716, 104.57111, 108.64668
], dtype=np.float32)

# learning_rate=0.01, weight_decay=0.05, momentum=0.9, nesterov=True
BASELINE_LOSSES_DIFF_LR = np.array([
    0.3881023, 7.8966713, 15.322964, 22.66404, 29.917278,
    37.085056, 44.168663, 51.175865, 58.094597, 64.92998,
    71.680595, 78.34835, 84.92714, 91.44285, 97.866035,
    104.204056, 110.46475, 116.63603, 122.729706, 128.74644
], dtype=np.float32)


def compare_losses(actual_losses, expected_losses, rtol=DEFAULT_RTOL, atol=DEFAULT_ATOL):
    """
    Compare actual losses with expected baseline losses.

    Args:
        actual_losses (np.ndarray): Actual losses from the test run
        expected_losses (np.ndarray): Expected baseline losses
        rtol (float): Relative tolerance for comparison
        atol (float): Absolute tolerance for comparison

    Returns:
        bool: True if losses match within tolerance, False otherwise
    """
    return np.allclose(actual_losses, expected_losses, rtol=rtol, atol=atol)
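A short usage sketch, assuming `actual` holds the 20 per-step losses from a run with the first baseline's hyperparameters (learning_rate=0.02, nesterov=True):

    # Hypothetical check against the nesterov=True baseline.
    ok = compare_losses(actual, BASELINE_LOSSES_NESTEROV_TRUE)
    assert ok, "losses drifted beyond the default rtol/atol of 1e-4"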

tests/st/test_optim/test_muon/run_muon.py (+236 -0) View File

@@ -0,0 +1,236 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Run Muon optimizer accuracy test with configurable parameters via args"""
import argparse
import numpy as np
import mindspore as ms
from mindspore import nn, Tensor

from mindformers.core.context.build_context import build_context
from mindformers.core.optim.muon import Muon

np.random.seed(1024)

# Test weight initialization - same as optimizer_util.py
FC1_WEIGHT = np.array([[0.72346634, 0.95608497, 0.4084163, 0.18627149,
                        0.6942514, 0.39767185, 0.24918061, 0.4548748],
                       [0.7203382, 0.19086994, 0.76286614, 0.87920564,
                        0.3169892, 0.9462494, 0.62827677, 0.27504718],
                       [0.3544535, 0.2524781, 0.5370583, 0.8313121,
                        0.6670143, 0.0488653, 0.62225235, 0.7546456],
                       [0.17985944, 0.05106374, 0.31064633, 0.4863033,
                        0.848814, 0.5523157, 0.20295663, 0.7213356]]).astype("float32")

FC1_BIAS = np.array([0.79708564, 0.13728078, 0.66322654, 0.88128525]).astype("float32")

FC2_WEIGHT = np.array([[0.8473515, 0.50923985, 0.42287776, 0.29769543]]).astype("float32")

FC2_BIAS = np.array([0.09996348]).astype("float32")


class MockTransformerConfig:
    """Mock transformer config for testing Muon optimizer."""
    def __init__(self):
        self.multi_latent_attention = True
        self.tensor_model_parallel_size = 1
        self.data_parallel_size = 1


class MockModel:
    """
    Mock model class that provides required interfaces for Muon optimizer.
    This simulates the model interface that Muon optimizer expects.
    """
    def __init__(self):
        self.config = MockTransformerConfig()

    def get_gpt_transformer_config(self):
        """Return transformer config."""
        return self.config

    def make_model_muon_fns(self):
        """Return muon split and merge functions."""
        def muon_split_fn(param_name, tensor):  # pylint: disable=unused-argument
            """Split function - returns tensor as list."""
            return [tensor]

        def muon_merge_fn(param_name, tensor_list):  # pylint: disable=unused-argument
            """Merge function - returns first tensor."""
            return tensor_list[0]

        return muon_split_fn, muon_merge_fn

    # pylint: disable=unused-argument
    def apply_qk_clip_scaling(self, params, param_names, param_layer, logit_threshold,
                              muon_split_fn, muon_merge_fn):
        """Apply query-key clipping scaling."""
        return [(0, params[0])]

    def get_param_layer_indices(self, params):
        """Return layer indices for parameters."""
        return {p.name: 0 for p in params}

    def get_muon_filter(self):
        """Return filter function to determine which params use Muon."""
        def muon_filter(param):
            # Apply Muon to weight parameters with 2D shape (not bias)
            return len(param.shape) == 2 and 'bias' not in param.name
        return muon_filter

    def get_tp_dims(self, params):
        """Return tensor parallel dimensions."""
        return tuple(-1 for _ in params)

    def get_op_groups_info(self, params, op):  # pylint: disable=unused-argument
        """Return optimizer parallel group info."""
        ops = tuple(1 for _ in params)
        op_groups = tuple("" for _ in params)
        return ops, op_groups


class FakeNet(nn.Cell):
    """Build fake net for testing."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Dense(in_channels=8, out_channels=4,
                            weight_init=Tensor(FC1_WEIGHT),
                            bias_init=Tensor(FC1_BIAS))
        self.fc2 = nn.Dense(in_channels=4, out_channels=1,
                            weight_init=Tensor(FC2_WEIGHT),
                            bias_init=Tensor(FC2_BIAS))
        self.relu = nn.ReLU()

    def construct(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x


class NetWithLoss(nn.Cell):
    """Build net with loss."""

    def __init__(self, network, loss_fn):
        super().__init__()
        self.network = network
        self.loss = loss_fn

    def construct(self, x, label):
        out = self.network(x)
        loss = self.loss(out, label)
        return loss


def make_fake_data():
    """Make fake data for testing."""
    data, label = [], []
    for i in range(20):
        data.append(ms.Tensor(np.array(np.ones((2, 8)) * i, dtype=np.float32)))
        label.append(ms.Tensor(np.array(np.ones((2, 1)) * (i + 1), dtype=np.float32)))
    return data, label


class MuonRunner:
    """Class to manage Muon optimizer test and training."""

    def __init__(self, args_from_parser):
        self.args = args_from_parser
        self.learning_rate = self.args.learning_rate
        self.weight_decay = self.args.weight_decay
        self.momentum = self.args.momentum
        self.nesterov = self.args.nesterov
        self.num_steps = self.args.num_steps

    def build_network(self):
        """Build network with Muon optimizer."""
        net = FakeNet()
        mock_model = MockModel()

        loss_fn = nn.L1Loss(reduction='mean')
        networkwithloss = NetWithLoss(net, loss_fn)
        networkwithloss.set_train()

        params = networkwithloss.trainable_params()

        # Create Muon optimizer
        optimizer = Muon(
            params=params,
            learning_rate=self.learning_rate,
            weight_decay=self.weight_decay,
            matched_adamw_rms=0.2,
            momentum=self.momentum,
            nesterov=self.nesterov,
            adamw_betas=(0.95, 0.95),
            adamw_eps=1e-8,
            model=mock_model,
        )

        return networkwithloss, optimizer, mock_model

    def run(self):
        """Run the training with Muon optimizer."""
        networkwithloss, optimizer, mock_model = self.build_network()
        trainonestepcell = nn.TrainOneStepCell(networkwithloss, optimizer)

        losses = []
        data, label = make_fake_data()
        for i in range(self.num_steps):
            loss = trainonestepcell(data[i], label[i])
            losses.append(loss.asnumpy())

        # Save results
        output_dict = {
            "losses": np.array(losses),
            "num_muon_m": len(optimizer.muon_m),
            "num_moments1": len(optimizer.moments1),
            "num_moments2": len(optimizer.moments2),
        }

        # Save muon momentum values for weight parameters
        muon_filter = mock_model.get_muon_filter()
        # pylint: disable=protected-access
        for idx, param in enumerate(optimizer._parameters):
            if muon_filter(param):
                muon_m_value = optimizer.muon_m[idx].asnumpy()
                output_dict[f"muon_m_{idx}"] = muon_m_value

        np.savez(self.args.output_path, **output_dict)
        print(f"Results saved to {self.args.output_path}")


def main():
    parser = argparse.ArgumentParser(description="Run Muon optimizer test")
    parser.add_argument("--learning_rate", type=float, default=0.02)
    parser.add_argument("--weight_decay", type=float, default=0.1)
    parser.add_argument("--momentum", type=float, default=0.95)
    parser.add_argument("--nesterov", type=lambda x: x.lower() == "true", default=True)
    parser.add_argument("--num_steps", type=int, default=20)
    parser.add_argument("--output_path", type=str, default="output_muon.npz")

    args = parser.parse_args()

    # Set context
    build_context({"use_legacy": False, "use_parallel": True})
    ms.set_deterministic(True)
    ms.set_context(mode=ms.GRAPH_MODE)
    ms.set_seed(42)

    # Run training
    runner = MuonRunner(args)
    runner.run()


if __name__ == "__main__":
    main()
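The script is normally launched through msrun (see test_muon.py below), but for local debugging one could drive MuonRunner directly. A hedged sketch, assuming the context has already been built as in main(); the Namespace fields mirror the parser's arguments:

    from argparse import Namespace

    # Hypothetical direct invocation that bypasses the command line.
    args = Namespace(learning_rate=0.02, weight_decay=0.1, momentum=0.95,
                     nesterov=True, num_steps=20, output_path="output_muon.npz")
    MuonRunner(args).run()  # writes losses and optimizer state to output_muon.npz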

tests/st/test_optim/test_muon/test_muon.py (+202 -0) View File

@@ -0,0 +1,202 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Test module for testing the Muon optimizer interface used for MindFormers.
How to run this:
pytest tests/st/test_optim/test_muon/test_muon.py
"""
from pathlib import Path
import subprocess
import pytest
import numpy as np

from tests.st.test_optim.test_muon.data_utils import (
    BASELINE_LOSSES_NESTEROV_TRUE,
    BASELINE_LOSSES_NESTEROV_FALSE,
    BASELINE_LOSSES_DIFF_LR,
    compare_losses,
    DEFAULT_RTOL,
    DEFAULT_ATOL,
)

from mindformers.tools.logger import logger

# Test parameters definition
SINGLE_CARD_TEST_CASES = [
    # Default config with nesterov=True
    {
        "learning_rate": 0.02,
        "weight_decay": 0.1,
        "momentum": 0.95,
        "nesterov": True,
        "num_steps": 20,
        "baseline_losses": BASELINE_LOSSES_NESTEROV_TRUE,
    },
    # Config without Nesterov momentum
    {
        "learning_rate": 0.02,
        "weight_decay": 0.1,
        "momentum": 0.95,
        "nesterov": False,
        "num_steps": 20,
        "baseline_losses": BASELINE_LOSSES_NESTEROV_FALSE,
    },
    # Config with different learning rate
    {
        "learning_rate": 0.01,
        "weight_decay": 0.05,
        "momentum": 0.9,
        "nesterov": True,
        "num_steps": 20,
        "baseline_losses": BASELINE_LOSSES_DIFF_LR,
    },
]


def build_msrun_command_list(
        worker_num,
        local_worker_num,
        log_dir,
        run_script_path,
        learning_rate,
        weight_decay,
        momentum,
        nesterov,
        num_steps,
        output_path,
        port=29500
):
    """Build the msrun command with the specified parameters."""
    cmd_list = [
        "msrun",
        f"--worker_num={worker_num}",
        f"--local_worker_num={local_worker_num}",
        f"--master_port={port}",
        f"--log_dir={log_dir}",
        "--join=True",
        str(run_script_path),
        f"--learning_rate={learning_rate}",
        f"--weight_decay={weight_decay}",
        f"--momentum={momentum}",
        f"--nesterov={str(nesterov).lower()}",
        f"--num_steps={num_steps}",
        f"--output_path={output_path}",
    ]
    logger.info(f"Equivalent shell command for Muon test: {' '.join(cmd_list)}")
    return cmd_list


class TestMuon:
    """Test class for Muon optimizer with different configurations."""
    OUTPUT_FILENAME = "output_muon.npz"
    LOG_DIR_NAME = "msrun_log"

    def setup_method(self):
        """Setup method to prepare test environment."""
        self.sh_path = Path(__file__).parent.resolve()
        self.run_script_path = self.sh_path / "run_muon.py"

    def check_results(self, output_dict, baseline_losses=None):
        """
        Check the output results from the Muon optimizer run.

        Args:
            output_dict: Dictionary containing the output results
            baseline_losses: Expected baseline losses for comparison
        """
        # Check losses
        losses = output_dict.get("losses")
        assert losses is not None, "Losses not found in output"
        assert len(losses) > 0, "Losses array is empty"
        assert not np.any(np.isnan(losses)), "Losses contain NaN values"
        assert not np.any(np.isinf(losses)), "Losses contain Inf values"

        # Compare with baseline if provided
        if baseline_losses is not None:
            assert compare_losses(losses, baseline_losses, rtol=DEFAULT_RTOL, atol=DEFAULT_ATOL), (
                f"Losses do not match baseline.\n"
                f"Actual: {losses}\n"
                f"Expected: {baseline_losses}\n"
                f"Max diff: {np.max(np.abs(losses - baseline_losses))}"
            )

    def run_test(
            self,
            worker_num,
            local_worker_num,
            optimizer_args,
            tmp_path,
            port=29500,
            baseline_losses=None
    ):
        """Helper function to run test and check results."""
        output_file_path = tmp_path / self.OUTPUT_FILENAME
        log_dir_path = tmp_path / self.LOG_DIR_NAME
        log_dir_path.mkdir(parents=True, exist_ok=True)

        cmd_list = build_msrun_command_list(
            worker_num=worker_num,
            local_worker_num=local_worker_num,
            log_dir=log_dir_path,
            run_script_path=self.run_script_path,
            learning_rate=optimizer_args["learning_rate"],
            weight_decay=optimizer_args["weight_decay"],
            momentum=optimizer_args["momentum"],
            nesterov=optimizer_args["nesterov"],
            num_steps=optimizer_args["num_steps"],
            output_path=output_file_path,
            port=port
        )

        result = subprocess.run(
            cmd_list, shell=False, capture_output=True, text=True, check=False
        )

        assert result.returncode == 0, (
            f"Test script failed with non-zero exit code: "
            f"{result.returncode}.\nStdout:\n{result.stdout}\nStderr:\n{result.stderr}"
        )
        assert output_file_path.exists(), (
            f"Output file {output_file_path} was not created."
        )

        output_dict = np.load(output_file_path)
        self.check_results(output_dict, baseline_losses=baseline_losses)

        return output_dict


@pytest.mark.level0
@pytest.mark.platform_arm_ascend910b_training
@pytest.mark.env_onecard
class TestMuonSingleCard(TestMuon):
    """Test class for Muon optimizer with single card configurations."""

    @pytest.mark.parametrize("optimizer_args", SINGLE_CARD_TEST_CASES)
    def test_muon_single_card(self, optimizer_args, tmp_path):
        """
        Feature: Muon optimizer training
        Description: Test computation of Muon optimizer with various configurations.
        Expectation: Training completes successfully with valid losses matching baseline
        """
        baseline_losses = optimizer_args.get("baseline_losses")
        self.run_test(
            worker_num=1,
            local_worker_num=1,
            optimizer_args=optimizer_args,
            tmp_path=tmp_path,
            baseline_losses=baseline_losses
        )

tests/st/test_ut/base_schema.json (+1 -1) View File

@@ -1062,7 +1062,7 @@
        "signature": "(use_fused)"
    },
    "mindformers.core.optim.Muon": {
-       "signature": "(params, learning_rate=0.02, weight_decay=0.1, matched_adamw_rms=0.2, momentum=0.95, nesterov=True, ns_steps=5, adamw_betas=(0.95, 0.95), adamw_eps=1e-08, micro_batch_num=1, qk_clip_threshold=4, model=None, **kwargs)"
+       "signature": "(params, learning_rate=0.02, weight_decay=0.1, matched_adamw_rms=0.2, momentum=0.95, nesterov=True, ns_steps=5, adamw_betas=(0.95, 0.95), adamw_eps=1e-08, micro_batch_num=1, qk_clip_threshold=100, model=None, **kwargs)"
    },
    "mindformers.core.optim.Muon._verify_model": {
        "signature": "(self, model)"

