2 Commits

9 changed files with 1420 additions and 0 deletions
  1. +125 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_shared_kv_cross_attention/data_gen_utils.py
  2. +91 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_shared_kv_cross_attention/test_shared_kv_cross_attention.py
  3. +477 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_sliding_window_attention/data_gen_utils.py
  4. +97 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_sliding_window_attention/test_sliding_window_attention.py
  5. +107 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_flash_attention/test_flash_attention_sliding_window.py
  6. +101 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/data_gen_utils_yoco.py
  7. +130 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/test_transformers_block_shared_kv_cross_attention.py
  8. +130 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/test_transformers_block_sliding_window_attention.py
  9. +162 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/test_transformers_block_yoco.py

+125 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_shared_kv_cross_attention/data_gen_utils.py

@@ -0,0 +1,125 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Data for SharedKVCrossAttention UT."""

import numpy as np
import mindspore as ms
from mindformers.parallel_core.transformer_config import TransformerConfig
from mindformers.parallel_core.training_graph.base_models.common.embeddings.rotary_pos_embedding import RotaryEmbedding


def get_init_params(config: TransformerConfig, seq_length=2, batch_size=2, hidden_size=8, q_hidden_size=8):
"""Generate initial parameters for SharedKVCrossAttention"""
np.random.seed(1)
shape = (seq_length, batch_size, config.num_attention_heads, config.kv_channels)
rotary_pos_emb = RotaryEmbedding(
kv_channels=4,
use_eod_reset=config.use_eod_reset
)
params = {
"hidden_states": ms.Tensor(0.01 * np.random.randn(seq_length, batch_size, config.hidden_size), ms.bfloat16),
"attention_mask": ms.Tensor(np.triu(np.ones((2, 2, 2, 2), dtype=np.int8), k=1), dtype=ms.uint8),
"rotary_pos_emb": rotary_pos_emb(seq_length),
"sharded_key": ms.Tensor(0.01 * np.random.randn(*shape), ms.bfloat16),
"sharded_value": ms.Tensor(0.01 * np.random.randn(*shape), ms.bfloat16)
}
weight_shape = (hidden_size, q_hidden_size)
weight_dict = {
"linear_q.weight": ms.Tensor(0.01 * np.random.randn(*weight_shape), ms.bfloat16),
"linear_proj.weight": ms.Tensor(0.01 * np.random.randn(*weight_shape), ms.bfloat16)
}
return params, weight_dict

def get_init_block_params(config: TransformerConfig, seq_length=2, batch_size=2, hidden_size=8, q_hidden_size=8):
"""Generate initial parameters for TransformerBlock with SharedKVCrossAttention"""
np.random.seed(1)
rotary_pos_emb = RotaryEmbedding(
kv_channels=4,
use_eod_reset=config.use_eod_reset
)
params = {
"hidden_states": ms.Tensor(0.01 * np.random.randn(seq_length, batch_size, config.hidden_size), ms.bfloat16),
"attention_mask": ms.Tensor(np.triu(np.ones((2, 2, 2, 2), dtype=np.int8), k=1), dtype=ms.uint8),
"rotary_pos_emb": rotary_pos_emb(seq_length)
}
linear_x0_weight = 0.01 * np.random.randn(32, 8)
linear_x1_weight = 0.01 * np.random.randn(32, 8)
weight_shape = (hidden_size, q_hidden_size)
weight_dict = {
"layers.0.cross_attention.linear_q.weight": ms.Tensor(0.01 * np.random.randn(*weight_shape), ms.bfloat16),
"layers.0.cross_attention.linear_proj.weight": ms.Tensor(0.01 * np.random.randn(*weight_shape), ms.bfloat16),
"adapter.k_proj.weight": ms.Tensor(0.01 * np.random.randn(*weight_shape), ms.bfloat16),
"adapter.v_proj.weight": ms.Tensor(0.01 * np.random.randn(*weight_shape), ms.bfloat16),
"layers.0.mlp.linear_fc1.weight": ms.Tensor(np.concatenate(
(linear_x0_weight, linear_x1_weight), axis=0), ms.bfloat16
),
"layers.0.mlp.linear_fc2.weight": ms.Tensor(0.01 * np.random.randn(8, 32), ms.bfloat16),
"adapter.kv_layer_norm.weight": ms.Tensor(0.01 * np.random.randn(8), ms.bfloat16),
"layers.0.input_layernorm.weight": ms.Tensor(0.01 * np.random.randn(8), ms.bfloat16),
"final_layernorm.weight": ms.Tensor(0.01 * np.random.randn(8), ms.bfloat16)
}
return params, weight_dict



GOLDEN_DATA = {
"output_attn": np.array(
[[[-5.602836608886719e-05, 0.0007781982421875,
-0.0003643035888671875, -1.5497207641601562e-05,
-0.0001850128173828125, 0.000446319580078125,
0.0001506805419921875, 0.000263214111328125],
[-1.8477439880371094e-05, -0.0004482269287109375,
0.0002574920654296875, -4.696846008300781e-05,
0.000591278076171875, -0.00055694580078125,
0.000370025634765625, 2.658367156982422e-05]],
[[-4.1484832763671875e-05, 0.0004138946533203125,
-0.00019741058349609375, -2.002716064453125e-05,
-0.00015163421630859375, 0.0002689361572265625,
0.00010347366333007812, 0.00011301040649414062],
[-3.8623809814453125e-05, -0.00038909912109375,
0.00018787384033203125, -6.938353180885315e-08,
0.000392913818359375, -0.0002079010009765625,
0.000232696533203125, 6.4849853515625e-05]]],
dtype=np.float32),
"output_block": np.array(
[[[0.016235, - 0.006104, - 0.005249, - 0.010742,
0.008667, - 0.023071, 0.017456, - 0.007599],
[0.003204, - 0.002502, 0.014709, - 0.020630,
- 0.003220, - 0.003845, 0.011353, - 0.010986]],
[[-0.001724, - 0.008728, 0.000395, 0.005829,
- 0.010925, 0.011475, 0.009033, 0.005035],
[0.009033, - 0.006836, - 0.001244, - 0.009338,
- 0.002640, 0.005280, - 0.006897, - 0.003967]]],
dtype=np.float32)
}

GPU_DATA = {
"output_attn": np.array(
[[[-5.5512e-05, 7.7754e-04, -3.6442e-04, -1.5505e-05, -1.8534e-04, 4.4649e-04, 1.4995e-04, 2.6433e-04],
[-4.1193e-05, 4.1335e-04, -1.9753e-04, -2.0061e-05, -1.5163e-04, 2.6995e-04, 1.0301e-04, 1.1354e-04]],
[[-1.8618e-05, -4.4912e-04, 2.5768e-04, -4.6789e-05, 5.9355e-04, -5.5917e-04, 3.7121e-04, 2.6458e-05],
[-3.8943e-05, -3.8902e-04, 1.8821e-04, -1.7298e-07, 3.9256e-04, -2.0821e-04, 2.3300e-04, 6.4698e-05]]],
dtype=np.float32),
"output_block": np.array(
[[[0.0162, -0.0061, -0.0053, -0.0107,
0.0087, -0.0230, 0.0174, -0.0076],
[-0.0017, -0.0088, 0.0004, 0.0058,
-0.0110, 0.0114, 0.0090, 0.0050]],
[[0.0032, -0.0025, 0.0146, -0.0206,
-0.0032, -0.0038, 0.0113, -0.0110],
[0.0090, -0.0068, -0.0012, -0.0094,
-0.0027, 0.0053, -0.0069, -0.0040]]],
dtype=np.float32)
}
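
The GOLDEN_DATA and GPU_DATA arrays above act as the two references for the double-benchmark comparison used throughout these tests. The comparator itself (tests/utils/double_benchmark.py) is not part of this change set, so the snippet below is only a minimal numpy sketch of the idea behind such a check, with a hypothetical double_benchmark_ok helper and illustrative tolerances: the NPU result should track the golden data at least as closely as the GPU reference does.

import numpy as np

def double_benchmark_ok(npu, gpu, golden, rel_slack=1.5, atol=1e-8):
    # Illustrative only: accept the NPU output if its error against the golden
    # data does not exceed the GPU reference's error by more than `rel_slack`.
    npu_err = np.abs(npu.astype(np.float64) - golden.astype(np.float64))
    gpu_err = np.abs(gpu.astype(np.float64) - golden.astype(np.float64))
    return bool(np.all(npu_err <= rel_slack * gpu_err + atol))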

+91 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_shared_kv_cross_attention/test_shared_kv_cross_attention.py

@@ -0,0 +1,91 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test module for testing SharedKVCrossAttention used for mindformers."""
import pytest
import mindspore as ms
from tests.st.test_ut.test_parallel_core.test_training_graph.test_transformer.test_attention.test_shared_kv_cross_attention.data_gen_utils import get_init_params, GOLDEN_DATA, GPU_DATA
from tests.utils.double_benchmark import DoubleBenchmarkComparator
from mindformers.parallel_core.training_graph.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from mindformers.parallel_core.training_graph.transformer.attention import SharedKVCrossAttention, SharedKVCrossAttentionSubmodules
from mindformers.parallel_core.training_graph.transformer.flash_attention import FlashAttention
from mindformers.parallel_core.transformer_config import TransformerConfig


class TestSharedKVCrossAttention:
"""A test class for testing SharedKVCrossAttention"""

def setup_method(self):
"""Setup method to prepare test environment"""
self.config = TransformerConfig(
compute_dtype='bfloat16',
use_flash_attention=True,
num_query_groups=2,
data_parallel_size=1,
tensor_model_parallel_size=1,
hidden_size=8,
num_attention_heads=2,
add_bias_linear=True,
add_qkv_bias=True,
num_layers=1,
params_dtype='float32',
attention_dropout=0.0,
context_parallel_size=1,
model_architecture="yoco",
num_encoder_layers=0,
num_decoder_layers=1
)

ms.set_context(mode=ms.GRAPH_MODE)

submodules = SharedKVCrossAttentionSubmodules(
linear_q=ColumnParallelLinear,
core_attention=FlashAttention,
linear_proj=RowParallelLinear,
)

self.net = SharedKVCrossAttention(
config=self.config,
submodules=submodules,
layer_number=1
)

self.inputs, weight_dict = get_init_params(self.config)
self.net.load_state_dict(weight_dict, strict=False)

def run_test(self, accuracy=True, compare_type=None):
"""Helper function to run test and check results"""

output, _ = self.net(**self.inputs)
npu_output = output.asnumpy()
if accuracy:
gpu_output = GPU_DATA[compare_type]
golden_output = GOLDEN_DATA[compare_type]
assert DoubleBenchmarkComparator.check_pass_or_not(npu_output, gpu_output, golden_output), (
f"SharedKVCrossAttention compare_type={compare_type} test failed.\n"
f"NPU output:\n{npu_output}\n\n"
f"GPU output:\n{gpu_output}\n\n"
f"Golden output:\n{golden_output}"
)


@pytest.mark.level1
@pytest.mark.platform_arm_ascend910b_training
@pytest.mark.env_onecard
def test_shared_kv_cross_attention(self):
"""
Feature: SharedKVCrossAttention
Description: Test Case: SharedKVCrossAttention
"""
self.run_test(compare_type="output_attn")
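
For reference, this case can be run on its own with pytest. The snippet below is just one possible invocation (the path is the file added above; an Ascend 910B environment with MindSpore and MindFormers installed is assumed):

import pytest

# Select only the level1-marked SharedKVCrossAttention UT added in this change set.
pytest.main([
    "tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/"
    "test_attention/test_shared_kv_cross_attention/test_shared_kv_cross_attention.py",
    "-m", "level1",
    "-v",
])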

+477 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_sliding_window_attention/data_gen_utils.py

@@ -0,0 +1,477 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Generate data for test."""
import numpy as np
import mindspore as ms
from mindspore.ops import auto_generate as aclnn_ops
from mindformers.parallel_core.training_graph.base_models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
from mindformers.parallel_core.transformer_config import MLATransformerConfig

reshape = aclnn_ops.Reshape()

def get_init_params(config: MLATransformerConfig, seq_length=8, batch_size=2):
"""Generate SBND-format input tensors for FlashAttention."""
np.random.seed(1)
shape = (seq_length, batch_size, config.num_attention_heads, config.kv_channels)
attn_mask = ms.Tensor(np.triu(np.ones((2048, 2048), dtype=np.int8), k=1), dtype=ms.uint8)

return {
"query": ms.tensor(0.01 * np.random.randn(*shape), ms.bfloat16),
"key": ms.tensor(0.01 * np.random.randn(*shape), ms.bfloat16),
"value": ms.tensor(0.01 * np.random.randn(*shape), ms.bfloat16),
"attention_mask": ms.tensor(attn_mask)
}

def get_init_tnd_params(config: MLATransformerConfig, seq_length=8, batch_size=2):
"""Generate TND-format input tensors for FlashAttention."""
np.random.seed(1)
shape = (seq_length, batch_size, config.num_attention_heads, config.kv_channels)
new_shape = (batch_size * seq_length, config.num_attention_heads, config.kv_channels)
attn_mask = ms.Tensor(np.triu(np.ones((2048, 2048), dtype=np.int8), k=1), dtype=ms.uint8)
query = ms.tensor(0.01 * np.random.randn(*shape), ms.bfloat16)
key = ms.tensor(0.01 * np.random.randn(*shape), ms.bfloat16)
value = ms.tensor(0.01 * np.random.randn(*shape), ms.bfloat16)
return {
"query": reshape(query.transpose(1, 0, 2, 3), new_shape),
"key": reshape(key.transpose(1, 0, 2, 3), new_shape),
"value": reshape(value.transpose(1, 0, 2, 3), new_shape),
"attention_mask": ms.tensor(attn_mask),
"actual_seq_qlen": ms.Tensor([8, 16], ms.int),
"actual_seq_kvlen": ms.Tensor([8, 16], ms.int)
}


def get_init_attn_params(config: MLATransformerConfig, seq_length=2, batch_size=2, hidden_size=8, q_hidden_size=8):
"""Generate initial parameters for SlidingWindowAttention"""
np.random.seed(1)

rotary_pos_emb = RotaryEmbedding(
kv_channels=4,
use_eod_reset=config.use_eod_reset
)
attn_mask = ms.Tensor(np.triu(np.ones((2048, 2048), dtype=np.int8), k=1), dtype=ms.uint8)
params = {
"hidden_states": ms.Tensor(0.01 * np.random.randn(seq_length, batch_size, config.hidden_size), ms.bfloat16),
"attention_mask": ms.tensor(attn_mask),
"rotary_pos_emb": rotary_pos_emb(seq_length)
}
weight_shape = (hidden_size, q_hidden_size)
linear_q_weight = 0.01 * np.random.randn(*weight_shape)
linear_k_weight = 0.01 * np.random.randn(*weight_shape)
linear_v_weight = 0.01 * np.random.randn(*weight_shape)
weight_dict = {
"linear_qkv.weight": ms.Tensor(np.concatenate(
(linear_q_weight, linear_k_weight, linear_v_weight), axis=0),
ms.bfloat16),
"linear_proj.weight": ms.Tensor(0.01 * np.random.randn(*weight_shape), ms.bfloat16)
}
return params, weight_dict

def get_init_block_params(config: MLATransformerConfig, seq_length=2, batch_size=2, hidden_size=8, q_hidden_size=8):
"""Generate initial parameters for TransformerBlock with SlidingWindowAttention"""
np.random.seed(1)

rotary_pos_emb = RotaryEmbedding(
kv_channels=4,
use_eod_reset=config.use_eod_reset
)
attn_mask = ms.Tensor(np.triu(np.ones((2048, 2048), dtype=np.int8), k=1), dtype=ms.uint8)
params = {
"hidden_states": ms.Tensor(0.01 * np.random.randn(seq_length, batch_size, config.hidden_size), ms.bfloat16),
"attention_mask": ms.tensor(attn_mask),
"rotary_pos_emb": rotary_pos_emb(seq_length)
}
weight_shape = (hidden_size, q_hidden_size)
linear_q_weight = 0.01 * np.random.randn(*weight_shape)
linear_k_weight = 0.01 * np.random.randn(*weight_shape)
linear_v_weight = 0.01 * np.random.randn(*weight_shape)
linear_proj_weight = ms.Tensor(0.01 * np.random.randn(*weight_shape), ms.bfloat16)
linear_x0_weight = 0.01 * np.random.randn(32, 8)
linear_x1_weight = 0.01 * np.random.randn(32, 8)
weight_dict = {
"layers.0.self_attention.linear_qkv.weight": ms.Tensor(np.concatenate(
(linear_q_weight, linear_k_weight, linear_v_weight), axis=0),
ms.bfloat16),
"layers.0.self_attention.linear_proj.weight": linear_proj_weight,
"layers.0.mlp.linear_fc1.weight": ms.Tensor(np.concatenate(
(linear_x0_weight, linear_x1_weight), axis=0),
ms.bfloat16),
"layers.0.mlp.linear_fc2.weight": ms.Tensor(0.01 * np.random.randn(8, 32), ms.bfloat16),
"layers.0.input_layernorm.weight": ms.Tensor(0.01 * np.random.randn(8), ms.bfloat16),
"final_layernorm.weight": ms.Tensor(0.01 * np.random.randn(8), ms.bfloat16)

}
return params, weight_dict


def get_golden() -> dict[str, np.ndarray]:
"""Generate golden data for test."""
output_bnsd = np.array(
[[[-0.011963, 0.008606, - 0.001808, - 0.006042],
[-0.012329, 0.005493, 0.007935, - 0.006226]],
[[-0.003372, - 0.001404, 0.003098, - 0.002792],
[-0.007111, 0.002243, 0.008301, 0.000641]],
[[-0.000488, - 0.000479, 0.002319, 0.000206],
[-0.003967, 0.003769, 0.004517, - 0.007660]],
[[0.002228, 0.005096, 0.002853, - 0.000095],
[-0.003311, 0.002533, 0.003418, - 0.008545]],
[[0.000751, 0.002090, 0.002777, - 0.000668],
[-0.001663, 0.001678, 0.004700, - 0.006409]],
[[0.004272, - 0.001411, 0.001236, 0.000950],
[0.002823, 0.000984, 0.003998, - 0.005737]],
[[0.005554, - 0.001617, 0.002029, 0.000355],
[0.000610, 0.001289, 0.004150, - 0.003067]],
[[0.004730, - 0.002182, 0.002472, 0.000614],
[0.000885, 0.001038, 0.005066, - 0.002213]]], dtype=np.float32)
output_tnd = np.array(
[[[-0.011963, 0.008606],
[-0.001808, -0.006042]],
[[-0.003372, -0.001404],
[0.003098, -0.002792]],
[[-0.000488, -0.000479],
[0.002319, 0.000206]],
[[0.002228, 0.005096],
[0.002853, -0.000095]],
[[0.000751, 0.002090],
[0.002777, -0.000668]],
[[0.004272, -0.001411],
[0.001236, 0.000950]],
[[0.005554, -0.001617],
[0.002029, 0.000355]],
[[0.004730, -0.002182],
[0.002472, 0.000614]],
[[-0.012329, 0.005493],
[0.007935, -0.006226]],
[[-0.007111, 0.002243],
[0.008301, 0.000641]],
[[-0.003967, 0.003769],
[0.004517, -0.007660]],
[[-0.003311, 0.002533],
[0.003418, -0.008545]],
[[-0.001663, 0.001678],
[0.004700, -0.006409]],
[[0.002823, 0.000984],
[0.003998, -0.005737]],
[[0.000610, 0.001289],
[0.004150, -0.003067]],
[[0.000885, 0.001038],
[0.005066, -0.002213]]]
, dtype=np.float32)
output_attn = np.array(
[[[0.000012, 0.000015, - 0.000013, - 0.000025,
0.000009, 0.000016, 0.000003, - 0.000003],
[-0.000010, - 0.000001, - 0.000011, - 0.000019,
0.000009, 0.000007, - 0.000010, 0.000000]],
[[0.000003, 0.000011, - 0.000010, - 0.000013,
0.000005, 0.000007, - 0.000000, - 0.000001],
[-0.000003, 0.000002, - 0.000006, - 0.000015,
0.000006, 0.000004, - 0.000007, - 0.000000]]], dtype=np.float32)
output_block = np.array(
[[[0.016235, - 0.006042, - 0.005280, - 0.010742,
0.008667, - 0.023071, 0.017456, - 0.007599],
[0.003159, - 0.002487, 0.014648, - 0.020630,
- 0.003220, - 0.003830, 0.011353, - 0.010986]],
[[-0.001701, - 0.008789, 0.000431, 0.005829,
- 0.010986, 0.011475, 0.009033, 0.005035],
[0.009033, - 0.006805, - 0.001228, - 0.009338,
- 0.002701, 0.005310, - 0.006958, - 0.003967]]], dtype=np.float32)
grad_query = np.array(
[[[[0.000000, 0.000000],
[0.000000, 0.000000]],
[[0.000000, 0.000000],
[0.000000, 0.000000]]],
[[[0.000002, - 0.000005],
[-0.000023, - 0.000103]],
[[-0.000017, 0.000015],
[0.000043, 0.000032]]],
[[[-0.000009, - 0.000014],
[-0.000021, - 0.000062]],
[[-0.000028, 0.000019],
[0.000032, 0.000008]]],
[[[-0.000007, 0.000030],
[-0.000015, - 0.000046]],
[[-0.000018, 0.000016],
[0.000013, 0.000007]]],
[[[0.000006, 0.000034],
[-0.000007, - 0.000039]],
[[-0.000018, 0.000020],
[0.000025, 0.000014]]],
[[[0.000005, 0.000028],
[-0.000006, - 0.000033]],
[[-0.000027, 0.000016],
[0.000021, 0.000011]]],
[[[0.000011, 0.000016],
[-0.000005, - 0.000030]],
[[-0.000021, 0.000033],
[0.000044, 0.000012]]],
[[[0.000010, 0.000023],
[-0.000001, - 0.000017]],
[[-0.000018, 0.000029],
[0.000059, 0.000011]]]], dtype=np.float32)
grad_key = np.array(
[[[[-0.000012, 0.000021],
[-0.000029, 0.000038]],
[[0.000048, - 0.000022],
[-0.000019, 0.000042]]],
[[[-0.000018, 0.000035],
[0.000033, - 0.000046]],
[[0.000019, - 0.000012],
[0.000093, 0.000048]]],
[[[-0.000005, - 0.000018],
[-0.000007, 0.000010]],
[[-0.000056, 0.000039],
[-0.000098, - 0.000112]]],
[[[0.000030, - 0.000054],
[-0.000002, 0.000001]],
[[0.000013, - 0.000001],
[-0.000021, - 0.000032]]],
[[[0.000006, 0.000014],
[0.000003, - 0.000007]],
[[-0.000007, 0.000002],
[0.000037, 0.000052]]],
[[[0.000000, - 0.000000],
[-0.000000, 0.000000]],
[[-0.000037, 0.000001],
[-0.000002, - 0.000001]]],
[[[0.000007, 0.000011],
[0.000000, - 0.000000]],
[[0.000021, - 0.000009],
[0.000003, 0.000006]]],
[[[-0.000007, - 0.000008],
[0.000001, 0.000004]],
[[-0.000000, 0.000000],
[0.000006, - 0.000003]]]], dtype=np.float32)
grad_value = np.array(
[[[[2.718750, 2.718750],
[2.718750, 2.718750]],
[[2.718750, 2.718750],
[2.718750, 2.718750]]],
[[[1.718750, 1.718750],
[1.718750, 1.718750]],
[[1.718750, 1.718750],
[1.718750, 1.718750]]],
[[[1.218750, 1.218750],
[1.218750, 1.218750]],
[[1.218750, 1.218750],
[1.218750, 1.218750]]],
[[[0.886719, 0.886719],
[0.886719, 0.886719]],
[[0.886719, 0.886719],
[0.886719, 0.886719]]],
[[[0.636719, 0.636719],
[0.636719, 0.636719]],
[[0.636719, 0.636719],
[0.636719, 0.636719]]],
[[[0.435547, 0.435547],
[0.435547, 0.435547]],
[[0.435547, 0.435547],
[0.435547, 0.435547]]],
[[[0.267578, 0.267578],
[0.267578, 0.267578]],
[[0.267578, 0.267578],
[0.267578, 0.267578]]],
[[[0.125000, 0.125000],
[0.125000, 0.125000]],
[[0.125000, 0.125000],
[0.125000, 0.125000]]]], dtype=np.float32)
return {
'bnsd': output_bnsd,
'tnd': output_tnd,
"attn": output_attn,
"query": grad_query,
"key": grad_key,
"value": grad_value,
"block": output_block
}


def get_gpu_datas() -> dict[str, np.ndarray]:
"""Generate gpu data for test."""
output_bnsd = np.array(
[[[-1.1963e-02, 8.6060e-03, -1.8082e-03, -6.0425e-03],
[-1.2329e-02, 5.4932e-03, 7.9346e-03, -6.2256e-03]],
[[-3.3722e-03, -1.4038e-03, 3.0975e-03, -2.7924e-03],
[-7.1106e-03, 2.2430e-03, 8.3008e-03, 6.4087e-04]],
[[-4.8828e-04, -4.7874e-04, 2.3193e-03, 2.0599e-04],
[-3.9673e-03, 3.7689e-03, 4.5166e-03, -7.6599e-03]],
[[2.2278e-03, 5.0964e-03, 2.8534e-03, -9.5367e-05],
[-3.3112e-03, 2.5330e-03, 3.4180e-03, -8.5449e-03]],
[[7.5150e-04, 2.0905e-03, 2.7771e-03, -6.6757e-04],
[-1.6632e-03, 1.6785e-03, 4.6997e-03, -6.4087e-03]],
[[4.2725e-03, -1.4114e-03, 1.2360e-03, 9.4986e-04],
[2.8229e-03, 9.8419e-04, 3.9978e-03, -5.7373e-03]],
[[5.5542e-03, -1.6174e-03, 2.0294e-03, 3.5477e-04],
[6.1035e-04, 1.2894e-03, 4.1504e-03, -3.0670e-03]],
[[4.7302e-03, -2.1820e-03, 2.4719e-03, 6.1417e-04],
[8.8501e-04, 1.0376e-03, 5.0659e-03, -2.2125e-03]]], dtype=np.float16)
output_tnd = np.array(
[[[-1.1963e-02, 8.6060e-03],
[-1.8082e-03, -6.0425e-03]],
[[-3.3722e-03, -1.4038e-03],
[3.0975e-03, -2.7924e-03]],
[[-4.8828e-04, -4.7874e-04],
[2.3193e-03, 2.0599e-04]],
[[2.2278e-03, 5.0964e-03],
[2.8534e-03, -9.5367e-05]],
[[7.5150e-04, 2.0905e-03],
[2.7771e-03, -6.6757e-04]],
[[4.2725e-03, -1.4114e-03],
[1.2360e-03, 9.4986e-04]],
[[5.5542e-03, -1.6174e-03],
[2.0294e-03, 3.5477e-04]],
[[4.7302e-03, -2.1820e-03],
[2.4719e-03, 6.1417e-04]],
[[-1.2329e-02, 5.4932e-03],
[7.9346e-03, -6.2256e-03]],
[[-7.1106e-03, 2.2430e-03],
[8.3008e-03, 6.4087e-04]],
[[-3.9673e-03, 3.7689e-03],
[4.5166e-03, -7.6599e-03]],
[[-3.3112e-03, 2.5330e-03],
[3.4180e-03, -8.5449e-03]],
[[-1.6632e-03, 1.6785e-03],
[4.6997e-03, -6.4087e-03]],
[[2.8229e-03, 9.8419e-04],
[3.9978e-03, -5.7373e-03]],
[[6.1035e-04, 1.2894e-03],
[4.1504e-03, -3.0670e-03]],
[[8.8501e-04, 1.0376e-03],
[5.0659e-03, -2.2125e-03]]], dtype=np.float16)
output_attn = np.array(
[[[1.1813e-05, 1.4700e-05, -1.3472e-05, -2.5206e-05,
9.1753e-06, 1.5654e-05, 2.7061e-06, -2.9343e-06],
[-9.8812e-06, -1.4772e-06, -1.0940e-05, -1.9328e-05,
8.5110e-06, 7.1337e-06, -1.0376e-05, 5.1637e-07]],
[[2.8103e-06, 1.1080e-05, -9.7287e-06, -1.3028e-05,
4.7935e-06, 6.7433e-06, -4.5341e-08, -9.6254e-07],
[-3.2683e-06, 1.5758e-06, -6.3239e-06, -1.5027e-05,
5.9035e-06, 4.4412e-06, -7.2456e-06, -5.4673e-07]]], dtype=np.float32)
output_block = np.array(
[[[0.0162, -0.0061, -0.0053, -0.0107,
0.0087, -0.0230, 0.0174, -0.0076],
[0.0032, -0.0025, 0.0146, -0.0206,
-0.0032, -0.0038, 0.0113, -0.0110]],
[[-0.0017, -0.0088, 0.0004, 0.0058,
-0.0110, 0.0114, 0.0090, 0.0050],
[0.0090, -0.0068, -0.0012, -0.0094,
-0.0027, 0.0053, -0.0069, -0.0040]]], dtype=np.float32)
grad_query = np.array(
[[[[0.0000e+00, 0.0000e+00],
[0.0000e+00, 0.0000e+00]],
[[0.0000e+00, 0.0000e+00],
[0.0000e+00, 0.0000e+00]]],
[[[1.6466e-06, -4.7982e-06],
[-2.3484e-05, -1.0252e-04]],
[[-1.7405e-05, 1.5497e-05],
[4.2915e-05, 3.2425e-05]]],
[[[-8.7023e-06, -1.4007e-05],
[-2.1219e-05, -6.1512e-05]],
[[-2.7895e-05, 1.9193e-05],
[3.2187e-05, 7.5102e-06]]],
[[[-7.3910e-06, 3.0398e-05],
[-1.5199e-05, -4.5776e-05]],
[[-1.7524e-05, 1.5974e-05],
[1.3113e-05, 6.9439e-06]]],
[[[6.2585e-06, 3.4332e-05],
[-7.3314e-06, -3.9101e-05]],
[[-1.8001e-05, 1.9550e-05],
[2.5392e-05, 1.3709e-05]]],
[[[5.0962e-06, 2.8372e-05],
[-5.9307e-06, -3.2902e-05]],
[[-2.6584e-05, 1.6451e-05],
[2.1219e-05, 1.1444e-05]]],
[[[1.0908e-05, 1.6451e-05],
[-5.2750e-06, -2.9922e-05]],
[[-2.0504e-05, 3.3140e-05],
[4.4346e-05, 1.2159e-05]]],
[[[9.7156e-06, 2.2650e-05],
[-1.0654e-06, -1.6570e-05]],
[[-1.8001e-05, 2.9087e-05],
[5.8651e-05, 1.1146e-05]]]], dtype=np.float32)
grad_key = np.array(
[[[[-1.2279e-05, 2.1100e-05],
[-2.8610e-05, 3.8147e-05]],
[[4.8399e-05, -2.1815e-05],
[-1.8954e-05, 4.1962e-05]]],
[[[-1.7881e-05, 3.5048e-05],
[3.3379e-05, -4.6492e-05]],
[[1.8597e-05, -1.1504e-05],
[9.2983e-05, 4.8399e-05]]],
[[[-5.3644e-06, -1.8358e-05],
[-6.9439e-06, 1.0014e-05]],
[[-5.6267e-05, 3.9101e-05],
[-9.7752e-05, -1.1206e-04]]],
[[[2.9922e-05, -5.3644e-05],
[-2.0862e-06, 1.2517e-06]],
[[1.2636e-05, -1.1325e-06],
[-2.0504e-05, -3.2187e-05]]],
[[[6.0797e-06, 1.3649e-05],
[3.0249e-06, -7.0333e-06]],
[[-7.4208e-06, 2.4736e-06],
[3.7193e-05, 5.2452e-05]]],
[[[1.4435e-07, -3.8370e-07],
[-4.0978e-07, 2.8312e-07]],
[[-3.6955e-05, 1.4529e-06],
[-1.5199e-06, -6.9663e-07]]],
[[[6.6161e-06, 1.0550e-05],
[4.9919e-07, -2.9802e-08]],
[[2.0862e-05, -8.6427e-06],
[3.2932e-06, 6.0201e-06]]],
[[[-7.2718e-06, -8.1062e-06],
[1.2442e-06, 3.8743e-06]],
[[-1.0151e-07, 1.6857e-07],
[5.6326e-06, -3.2783e-06]]]], dtype=np.float32)
grad_value = np.array(
[[[[2.7188, 2.7188],
[2.7188, 2.7188]],
[[2.7188, 2.7188],
[2.7188, 2.7188]]],
[[[1.7188, 1.7188],
[1.7188, 1.7188]],
[[1.7188, 1.7188],
[1.7188, 1.7188]]],
[[[1.2188, 1.2188],
[1.2188, 1.2188]],
[[1.2188, 1.2188],
[1.2188, 1.2188]]],
[[[0.8828, 0.8828],
[0.8828, 0.8828]],
[[0.8828, 0.8828],
[0.8828, 0.8828]]],
[[[0.6328, 0.6328],
[0.6328, 0.6328]],
[[0.6328, 0.6328],
[0.6328, 0.6328]]],
[[[0.4336, 0.4336],
[0.4336, 0.4336]],
[[0.4336, 0.4336],
[0.4336, 0.4336]]],
[[[0.2676, 0.2676],
[0.2676, 0.2676]],
[[0.2676, 0.2676],
[0.2676, 0.2676]]],
[[[0.1250, 0.1250],
[0.1250, 0.1250]],
[[0.1250, 0.1250],
[0.1250, 0.1250]]]], dtype=np.float32)
return {
'bnsd': output_bnsd,
'tnd': output_tnd,
"attn": output_attn,
"query": grad_query,
"key": grad_key,
"value": grad_value,
"block":output_block
}
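
get_init_tnd_params above packs the (S, B, N, D) inputs into the TND layout by moving the batch axis first and merging it with the sequence axis, with actual_seq_qlen/actual_seq_kvlen holding the cumulative end offsets of each packed sequence ([8, 16] for two sequences of length 8). A plain numpy sketch of the same shape manipulation, using sizes that match the defaults here (seq_length=8, batch_size=2, 2 heads; a head dim of 2 is assumed from kv_channels):

import numpy as np

seq_length, batch_size, num_heads, head_dim = 8, 2, 2, 2
x_sbnd = np.random.randn(seq_length, batch_size, num_heads, head_dim)

# (S, B, N, D) -> (B, S, N, D) -> (B*S, N, D): all tokens of sample 0, then sample 1.
x_tnd = x_sbnd.transpose(1, 0, 2, 3).reshape(batch_size * seq_length, num_heads, head_dim)

# Cumulative end offsets of the packed sequences, i.e. [8, 16].
actual_seq_lens = np.cumsum([seq_length] * batch_size)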

+97 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_attention/test_sliding_window_attention/test_sliding_window_attention.py

@@ -0,0 +1,97 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test module for testing SlidingWindowAttention used for mindformers."""
import pytest
import mindspore as ms
from tests.st.test_ut.test_parallel_core.test_training_graph.test_transformer.test_attention.test_sliding_window_attention.data_gen_utils import get_init_attn_params, get_gpu_datas, get_golden
from tests.utils.double_benchmark import DoubleBenchmarkComparator
from mindformers.parallel_core.training_graph.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from mindformers.parallel_core.training_graph.transformer.attention import SelfAttentionSubmodules, SelfAttentionContiguous
from mindformers.parallel_core.training_graph.transformer.flash_attention import FlashAttention
from mindformers.parallel_core.transformer_config import TransformerConfig


class TestSlidingWindowAttention:
"""A test class for testing SlidingWindowAttention"""

def setup_method(self):
"""Setup method to prepare test environment"""
self.config = TransformerConfig(
compute_dtype='bfloat16',
use_flash_attention=True,
num_query_groups=2,
data_parallel_size=1,
tensor_model_parallel_size=1,
hidden_size=8,
num_attention_heads=2,
add_bias_linear=False,
add_qkv_bias=False,
num_layers=1,
params_dtype='float32',
attention_dropout=0.0,
context_parallel_size=1,
window_size=(10, 0),
model_architecture="yoco",
num_encoder_layers=0,
num_decoder_layers=1
)


ms.set_context(mode=ms.GRAPH_MODE)

submodules = SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=FlashAttention,
linear_proj=RowParallelLinear,
)

self.net = SelfAttentionContiguous(
config=self.config,
submodules=submodules,
layer_number=1,
)

self.inputs, weight_dict = get_init_attn_params(self.config)
self.net.load_state_dict(weight_dict, strict=False)

def run_test(self, accuracy=True, compare_type=None):
"""Helper function to run test and check results"""

output, _ = self.net(**self.inputs)
npu_output = output.asnumpy()
if accuracy:
gpu_output = GPU_DATA[compare_type]
golden_output = GOLDEN_DATA[compare_type]
assert DoubleBenchmarkComparator.check_pass_or_not(npu_output, gpu_output, golden_output), (
f"SlidingWindowAttention compare_type={compare_type} test failed.\n"
f"NPU output:\n{npu_output}\n\n"
f"GPU output:\n{gpu_output}\n\n"
f"Golden output:\n{golden_output}"
)


@pytest.mark.level1
@pytest.mark.platform_arm_ascend910b_training
@pytest.mark.env_onecard
def test_bnsd_case(self):
"""
Feature: SlidingWindowAttention
Description: Test Case: SlidingWindowAttention
"""
self.run_test(compare_type="attn")


GOLDEN_DATA = get_golden()
GPU_DATA = get_gpu_datas()
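
The configs in this directory pass window_size=(10, 0). Assuming the usual (left, right) convention for sliding-window attention, each query token may attend to itself, at most 10 preceding tokens, and no future tokens. A small numpy sketch of the corresponding mask, in the same 1-means-blocked convention as the np.triu masks used above (illustrative, not taken from the FlashAttention implementation):

import numpy as np

def sliding_window_mask(seq_len, left, right):
    # 1 = blocked position, 0 = visible position.
    q = np.arange(seq_len)[:, None]
    k = np.arange(seq_len)[None, :]
    visible = (k >= q - left) & (k <= q + right)
    return (~visible).astype(np.uint8)

mask = sliding_window_mask(seq_len=16, left=10, right=0)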

+107 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_flash_attention/test_flash_attention_sliding_window.py

@@ -0,0 +1,107 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test module for testing FlashAttention in SlidingWindowAttention used for mindformers."""
import pytest
from mindspore import nn, ParameterTuple
from tests.st.test_ut.test_parallel_core.test_training_graph.test_transformer.test_attention.test_sliding_window_attention.data_gen_utils import get_init_params, get_init_tnd_params, get_gpu_datas, get_golden
from tests.utils.double_benchmark import DoubleBenchmarkComparator
from mindformers.parallel_core.training_graph.transformer.flash_attention import FlashAttention
from mindformers.parallel_core.training_graph.device_matrix import layout
from mindformers.parallel_core.transformer_config import MLATransformerConfig


def compare_value(compare_type=None, npu_output=None):
"""Check the accuracy results"""
gpu_output = GPU_DATA[compare_type]
golden_output = GOLDEN_DATA[compare_type]
assert DoubleBenchmarkComparator.check_pass_or_not(npu_output, gpu_output, golden_output), (
f"FlashAttention compare_type={compare_type} test failed.\n"
f"NPU output:\n{npu_output}\n\n"
f"GPU output:\n{gpu_output}\n\n"
f"Golden output:\n{golden_output}"
)


class TestFlashAttention:
"""A test class for testing FlashAttention in SlidingWindowAttention"""

def run_test(self, attention_dropout=0.0, soft_max_scale=None, accuracy=True, compare_type=None):
"""Helper function to run test and check results"""
self.flash_attention = FlashAttention(config=self.config, layer_number=0, attention_dropout=attention_dropout,
softmax_scale=soft_max_scale)
if compare_type == "bnsd":
weights = ParameterTuple(self.flash_attention.trainable_params())
train_network = nn.ForwardValueAndGrad(self.flash_attention, weights=weights, get_all=True,
get_by_list=True)
output, grads = train_network(self.inputs["query"], self.inputs["key"], self.inputs["value"],
self.inputs["attention_mask"])
npu_output = output.asnumpy()
query_grad = grads[0][0].asnumpy()
key_grad = grads[0][1].asnumpy()
value_grad = grads[0][2].asnumpy()
if accuracy:
compare_value(compare_type, npu_output)
compare_value("query", query_grad)
compare_value("key", key_grad)
compare_value("value", value_grad)
if compare_type == "tnd":
output = self.flash_attention(**self.inputs)
npu_output = output.asnumpy()
if accuracy:
compare_value(compare_type, npu_output)

@pytest.mark.level1
@pytest.mark.platform_arm_ascend910b_training
@pytest.mark.env_onecard
def test_bnsd_case(self):
"""
Feature: FlashAttention
Description: Test Case: input_layout=bnsd
"""
self.config = MLATransformerConfig(multi_latent_attention=False,
hidden_size=4,
num_attention_heads=2,
num_layers=1,
window_size=(10,0),
model_architecture="yoco"
)

layout.init_layout(self.config)
self.inputs = get_init_params(self.config)
self.run_test(compare_type="bnsd")

@pytest.mark.level1
@pytest.mark.platform_arm_ascend910b_training
@pytest.mark.env_onecard
def test_tnd_case(self):
"""
Feature: FlashAttention
Description: Test Case: input_layout=tnd
"""
self.config = MLATransformerConfig(multi_latent_attention=False,
hidden_size=4,
num_attention_heads=2,
num_layers=1,
window_size=(10,0),
use_eod_attn_mask_compression=True,
model_architecture="yoco"
)

layout.init_layout(self.config)
self.inputs = get_init_tnd_params(self.config)
self.run_test(compare_type="tnd")

GOLDEN_DATA = get_golden()
GPU_DATA = get_gpu_datas()
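
In the bnsd branch above, gradients are read as grads[0][i]. With get_all=True and get_by_list=True, mindspore.nn.ForwardValueAndGrad groups the result as (gradients of inputs, gradients of weights), so grads[0] holds the query/key/value gradients that are compared against grad_query/grad_key/grad_value. A minimal standalone sketch of the same pattern, using a toy cell rather than FlashAttention:

import mindspore as ms
from mindspore import nn, ParameterTuple

class Scale(nn.Cell):
    """Toy cell: multiply the input by a trainable scalar."""
    def __init__(self):
        super().__init__()
        self.w = ms.Parameter(ms.Tensor(2.0, ms.float32), name="w")

    def construct(self, x):
        return self.w * x

net = Scale()
grad_net = nn.ForwardValueAndGrad(net, weights=ParameterTuple(net.trainable_params()),
                                  get_all=True, get_by_list=True)
value, grads = grad_net(ms.Tensor([1.0, 2.0], ms.float32))
input_grads, weight_grads = grads[0], grads[1]  # same grouping as grads[0][...] above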

+101 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/data_gen_utils_yoco.py

@@ -0,0 +1,101 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Data for SharedKVCrossAttention UT."""

import numpy as np
import mindspore as ms
from mindformers.parallel_core.transformer_config import TransformerConfig
from mindformers.parallel_core.training_graph.base_models.common.embeddings.rotary_pos_embedding import RotaryEmbedding


def get_init_params(config: TransformerConfig, seq_length=2, batch_size=2, hidden_size=8, q_hidden_size=8):
"""Generate initial parameters for TransformerBlock with SharedKVCrossAttention"""
np.random.seed(1)
rotary_pos_emb = RotaryEmbedding(
kv_channels=4,
use_eod_reset=config.use_eod_reset
)
params = {
"hidden_states": ms.Tensor(0.01 * np.random.randn(seq_length, batch_size, config.hidden_size), ms.bfloat16),
"attention_mask": ms.Tensor(np.triu(np.ones((2, 2, 2, 2), dtype=np.int8), k=1), dtype=ms.uint8),
"rotary_pos_emb": rotary_pos_emb(seq_length)
}
linear_x0_weight = 0.01 * np.random.randn(32, 8)
linear_x1_weight = 0.01 * np.random.randn(32, 8)
weight_shape = (hidden_size, q_hidden_size)
linear_q_weight = 0.01 * np.random.randn(*weight_shape)
linear_k_weight = 0.01 * np.random.randn(*weight_shape)
linear_v_weight = 0.01 * np.random.randn(*weight_shape)
linear_proj_weight = ms.Tensor(0.01 * np.random.randn(*weight_shape), ms.bfloat16)
cross_linear_q_weight = 0.01 * np.random.randn(*weight_shape)
cross_linear_k_weight = 0.01 * np.random.randn(*weight_shape)
cross_linear_v_weight = 0.01 * np.random.randn(*weight_shape)
linear_fc2_weight = ms.Tensor(0.01 * np.random.randn(8, 32), ms.bfloat16)
kv_layer_norm_weight = ms.Tensor(0.01 * np.random.randn(8), ms.bfloat16)
input_layernorm_weight = ms.Tensor(0.01 * np.random.randn(8), ms.bfloat16)
pre_mlp_layernorm_weight = ms.Tensor(0.01 * np.random.randn(8), ms.bfloat16)
weight_dict = {
"layers.0.self_attention.linear_qkv.weight": ms.Tensor(np.concatenate(
(linear_q_weight, linear_k_weight, linear_v_weight), axis=0),
ms.bfloat16),
"layers.0.self_attention.linear_proj.weight": linear_proj_weight,
"layers.1.cross_attention.linear_q.weight": ms.Tensor(cross_linear_q_weight, ms.bfloat16),
"layers.1.cross_attention.linear_proj.weight": linear_proj_weight,
"layers.0.pre_mlp_layernorm.weight": pre_mlp_layernorm_weight,
"layers.1.pre_mlp_layernorm.weight": pre_mlp_layernorm_weight,
"adapter.k_proj.weight": ms.Tensor(cross_linear_k_weight, ms.bfloat16),
"adapter.v_proj.weight": ms.Tensor(cross_linear_v_weight, ms.bfloat16),
"layers.0.mlp.linear_fc1.weight": ms.Tensor(np.concatenate(
(linear_x0_weight, linear_x1_weight), axis=0), ms.bfloat16
),
"layers.1.mlp.linear_fc1.weight": ms.Tensor(np.concatenate(
(linear_x0_weight, linear_x1_weight), axis=0), ms.bfloat16
),
"layers.0.mlp.linear_fc2.weight": linear_fc2_weight,
"layers.1.mlp.linear_fc2.weight": linear_fc2_weight,
"adapter.kv_layer_norm.weight": kv_layer_norm_weight,
"layers.0.input_layernorm.weight": input_layernorm_weight,
"layers.1.pre_cross_attn_layernorm.weight": input_layernorm_weight,
"final_layernorm.weight": ms.Tensor(0.01 * np.random.randn(8), ms.bfloat16)
}
return params, weight_dict



GOLDEN_DATA = {
"output": np.array(
[[[-0.003723, -0.000454, 0.004303, 0.009277,
0.010315, -0.019531, 0.011353, -0.010132],
[-0.000889, -0.000224, -0.014465, 0.021729,
-0.004639, -0.003937, 0.008972, -0.017822]],
[[0.000641, -0.001076, -0.000565, -0.008301,
-0.021484, 0.015869, 0.009705, 0.010986],
[-0.004028, -0.000984, 0.001953, 0.015747,
-0.006165, 0.008728, -0.008789, -0.010254]]],
dtype=np.float32)
}

GPU_DATA = {
"output": np.array(
[[[-0.0037, -0.0005, 0.0043, 0.0093,
0.0103, -0.0195, 0.0114, -0.0102],
[-0.0009, -0.0002, -0.0145, 0.0218,
-0.0046, -0.0040, 0.0090, -0.0179]],
[[0.0006, -0.0011, -0.0006, -0.0083,
-0.0215, 0.0159, 0.0096, 0.0110],
[-0.0040, -0.0010, 0.0020, 0.0158,
-0.0062, 0.0087, -0.0087, -0.0103]]],
dtype=np.float32)
}
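
Because the tests call load_state_dict(weight_dict, strict=False), a misspelled key in a dictionary like the one above (for example a wrong layer index) would be silently ignored rather than flagged. A small helper one might use while maintaining these data generators, assuming the block exposes MindSpore's standard Cell.parameters_dict():

def report_unmatched_keys(net, weight_dict):
    # Print weight_dict keys that do not correspond to any parameter of `net`.
    param_names = set(net.parameters_dict().keys())
    missing = [key for key in weight_dict if key not in param_names]
    if missing:
        print("keys without a matching parameter:", missing)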

+130 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/test_transformers_block_shared_kv_cross_attention.py

@@ -0,0 +1,130 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test module for testing FlashAttention used for mindformers."""
import pytest
import mindspore as ms
from tests.st.test_ut.test_parallel_core.test_training_graph.test_transformer.test_attention.test_shared_kv_cross_attention.data_gen_utils import get_init_block_params, GOLDEN_DATA, GPU_DATA
from tests.utils.double_benchmark import DoubleBenchmarkComparator
from mindformers.parallel_core.training_graph.transformer.identity_op import IdentityOp
from mindformers.parallel_core.training_graph.transformer.mlp import MLP, MLPSubmodules
from mindformers.parallel_core.training_graph.transformer.transformer_block import TransformerBlock
from mindformers.parallel_core.training_graph.transformer.transformer_layer import TransformerLayerSubmodules, \
TransformerLayer
from mindformers.parallel_core.training_graph.device_matrix import layout
from mindformers.parallel_core.utils.spec_utils import ModuleSpec
from mindformers.parallel_core.training_graph.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from mindformers.parallel_core.training_graph.transformer.attention import SharedKVCrossAttention, SharedKVCrossAttentionSubmodules
from mindformers.parallel_core.training_graph.transformer.flash_attention import FlashAttention
from mindformers.parallel_core.training_graph.transformer.norm import RMSNorm
from mindformers.parallel_core.transformer_config import TransformerConfig
from mindformers.core.context.build_context import build_context


class TestTransFormersBlock:
"""A test class for testing TransFormBlock with SharedKVCrossAttention"""

def setup_method(self):
"""Setup method to prepare test environment"""
self.config = TransformerConfig(
compute_dtype='bfloat16',
use_flash_attention=True,
num_query_groups=2,
data_parallel_size=1,
tensor_model_parallel_size=1,
hidden_size=8,
num_attention_heads=2,
add_bias_linear=True,
add_qkv_bias=True,
num_layers=1,
params_dtype='float32',
attention_dropout=0.0,
normalization="RMSNorm",
model_architecture="yoco",
num_encoder_layers=0,
num_decoder_layers=1,
hidden_act="swiglu",
gated_linear_unit=True
)

build_context({"use_legacy": False})
ms.context.set_context(deterministic="ON")
ms.set_context(mode=ms.GRAPH_MODE)

submodules = SharedKVCrossAttentionSubmodules(
linear_q=ColumnParallelLinear,
core_attention=FlashAttention,
linear_proj=RowParallelLinear,
)
layout.init_layout(self.config)
layer_submodules = TransformerLayerSubmodules(
input_layernorm=IdentityOp,
cross_attention=ModuleSpec(
module=SharedKVCrossAttention,
submodules=submodules,
),
pre_cross_attn_layernorm=RMSNorm,
self_attention=IdentityOp,
pre_mlp_layernorm=RMSNorm,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear)
)
)

self.submodules_spec = ModuleSpec(
module=TransformerLayer,
submodules=layer_submodules
)

self.net = TransformerBlock(
config=self.config,
spec=self.submodules_spec,
post_layer_norm=False
)

self.inputs, weight_dict = get_init_block_params(self.config)
self.net.load_state_dict(weight_dict, strict=False)
self.hidden_states = self.inputs.get("hidden_states")
self.attention_mask = self.inputs.get("attention_mask")
self.rotary_pos_emb = self.inputs.get("rotary_pos_emb")

def run_test(self, accuracy=True, compare_type=None):
"""Helper function to run test and check results"""

output, _ = self.net(self.hidden_states,
attention_mask=self.attention_mask,
rotary_pos_emb=self.rotary_pos_emb
)
npu_output = output.asnumpy()
if accuracy:
gpu_output = GPU_DATA[compare_type]
golden_output = GOLDEN_DATA[compare_type]
assert DoubleBenchmarkComparator.check_pass_or_not(npu_output, gpu_output, golden_output), (
f"TransformerBlock with SharedKVCrossAttention compare_type={compare_type} test failed.\n"
f"NPU output:\n{npu_output}\n\n"
f"GPU output:\n{gpu_output}\n\n"
f"Golden output:\n{golden_output}"
)


@pytest.mark.level1
@pytest.mark.platform_arm_ascend910b_training
@pytest.mark.env_onecard
def test_transformer_block_with_shared_kv_cross_attention(self):
"""
Feature: TransformerBlock
Description: Test Case: TransformerBlock with SharedKVCrossAttention
"""
self.run_test(compare_type="output_block")

+130 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/test_transformers_block_sliding_window_attention.py

@@ -0,0 +1,130 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test module for testing TransformerBlock with SlidingWindowAttention used for mindformers."""
import pytest
import mindspore as ms
from tests.st.test_ut.test_parallel_core.test_training_graph.test_transformer.test_attention.test_sliding_window_attention.data_gen_utils import get_init_block_params, get_gpu_datas, get_golden
from tests.utils.double_benchmark import DoubleBenchmarkComparator
from mindformers.parallel_core.training_graph.transformer.identity_op import IdentityOp
from mindformers.parallel_core.training_graph.transformer.mlp import MLPSubmodules, MLP
from mindformers.parallel_core.training_graph.transformer.transformer_block import TransformerBlock
from mindformers.parallel_core.training_graph.transformer.transformer_layer import TransformerLayerSubmodules, \
TransformerLayer
from mindformers.parallel_core.training_graph.device_matrix import layout
from mindformers.parallel_core.utils.spec_utils import ModuleSpec
from mindformers.parallel_core.training_graph.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from mindformers.parallel_core.training_graph.transformer.attention import SelfAttentionSubmodules, SelfAttentionContiguous
from mindformers.parallel_core.training_graph.transformer.flash_attention import FlashAttention
from mindformers.parallel_core.training_graph.transformer.norm import RMSNorm
from mindformers.parallel_core.transformer_config import TransformerConfig
from mindformers.core.context.build_context import build_context


class TestTransFormersBlock:
"""A test class for testing TransformerBlock with SlidingWindowAttention"""

def setup_method(self):
"""Setup method to prepare test environment"""
self.config = TransformerConfig(
compute_dtype='bfloat16',
use_flash_attention=True,
num_query_groups=2,
data_parallel_size=1,
tensor_model_parallel_size=1,
hidden_size=8,
num_attention_heads=2,
add_bias_linear=True,
add_qkv_bias=True,
num_layers=1,
params_dtype='float32',
attention_dropout=0.0,
normalization="RMSNorm",
window_size=(10, 0),
hidden_act="swiglu",
gated_linear_unit=True
)

build_context({"use_legacy": False})
ms.context.set_context(deterministic="ON")
ms.set_context(mode=ms.GRAPH_MODE)

submodules = SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=FlashAttention,
linear_proj=RowParallelLinear
)
layout.init_layout(self.config)
layer_submodules = TransformerLayerSubmodules(
input_layernorm=RMSNorm,
cross_attention=IdentityOp,
pre_cross_attn_layernorm=IdentityOp,
self_attention=ModuleSpec(
module=SelfAttentionContiguous,
submodules=submodules,
),
pre_mlp_layernorm=RMSNorm,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear)
)
)

self.submodules_spec = ModuleSpec(
module=TransformerLayer,
submodules=layer_submodules
)

self.net = TransformerBlock(
config=self.config,
spec=self.submodules_spec,
post_layer_norm=False
)

self.inputs, weight_dict = get_init_block_params(self.config)
self.net.load_state_dict(weight_dict, strict=False)
self.hidden_states = self.inputs.get("hidden_states")
self.attention_mask = self.inputs.get("attention_mask")
self.rotary_pos_emb = self.inputs.get("rotary_pos_emb")

def run_test(self, accuracy=True, compare_type=None):
"""Helper function to run test and check results"""

output, _ = self.net(self.hidden_states,
attention_mask=self.attention_mask,
rotary_pos_emb=self.rotary_pos_emb
)
npu_output = output.asnumpy()
if accuracy:
gpu_output = GPU_DATA[compare_type]
golden_output = GOLDEN_DATA[compare_type]
assert DoubleBenchmarkComparator.check_pass_or_not(npu_output, gpu_output, golden_output), (
f"TransformerBlock with SlidingWindowAttention compare_type={compare_type} test failed.\n"
f"NPU output:\n{npu_output}\n\n"
f"GPU output:\n{gpu_output}\n\n"
f"Golden output:\n{golden_output}"
)

@pytest.mark.level1
@pytest.mark.platform_arm_ascend910b_training
@pytest.mark.env_onecard
def test_transformer_block_with_sliding_window_attention(self):
"""
Feature: TransformerBlock
Description: Test Case: TransformerBlock with SlidingWindowAttention
"""
self.run_test(compare_type="block")

GOLDEN_DATA = get_golden()
GPU_DATA = get_gpu_datas()

+162 -0  tests/st/test_ut/test_parallel_core/test_training_graph/test_transformer/test_transformer_block/test_transformers_block_yoco.py

@@ -0,0 +1,162 @@
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test module for testing TransformerBlock with SlidingWindowAttention used for mindformers."""
import pytest
import mindspore as ms

from tests.st.test_ut.test_parallel_core.test_training_graph.test_transformer.test_transformer_block.data_gen_utils_yoco import get_init_params, GOLDEN_DATA, GPU_DATA
from tests.utils.double_benchmark import DoubleBenchmarkComparator
from mindformers.parallel_core.training_graph.transformer.identity_op import IdentityOp
from mindformers.parallel_core.training_graph.transformer.mlp import MLPSubmodules, MLP
from mindformers.parallel_core.training_graph.transformer.transformer_block import TransformerBlock, \
TransformerBlockSubmodules
from mindformers.parallel_core.training_graph.transformer.transformer_layer import TransformerLayerSubmodules, \
TransformerLayer
from mindformers.parallel_core.training_graph.device_matrix import layout
from mindformers.parallel_core.utils.spec_utils import ModuleSpec
from mindformers.parallel_core.training_graph.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from mindformers.parallel_core.training_graph.transformer.attention import SelfAttentionSubmodules, \
SelfAttentionContiguous, SharedKVCrossAttentionSubmodules, SharedKVCrossAttention
from mindformers.parallel_core.training_graph.transformer.flash_attention import FlashAttention
from mindformers.parallel_core.training_graph.transformer.norm import RMSNorm
from mindformers.parallel_core.transformer_config import TransformerConfig
from mindformers.core.context.build_context import build_context


class TestTransFormersBlock:
"""A test class for testing TransformerBlock with yoco"""

def setup_method(self):
"""Setup method to prepare test environment"""
self.config = TransformerConfig(
compute_dtype='bfloat16',
use_flash_attention=True,
num_query_groups=2,
data_parallel_size=1,
tensor_model_parallel_size=1,
hidden_size=8,
num_attention_heads=2,
add_bias_linear=True,
add_qkv_bias=True,
num_layers=2,
params_dtype='float32',
attention_dropout=0.0,
normalization="RMSNorm",
window_size=(10, 0),
hidden_act="swiglu",
gated_linear_unit=True,
model_architecture="yoco",
num_encoder_layers=1,
num_decoder_layers=1,
)

build_context({"use_legacy": False})
ms.context.set_context(deterministic="ON")
ms.set_context(mode=ms.GRAPH_MODE)

self_attn_submodules = SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=FlashAttention,
linear_proj=RowParallelLinear
)
cross_attn_submodules = SharedKVCrossAttentionSubmodules(
linear_q=ColumnParallelLinear,
core_attention=FlashAttention,
linear_proj=RowParallelLinear,
)
layout.init_layout(self.config)
layer_submodules = []
self_attn_layer_submodule = TransformerLayerSubmodules(
input_layernorm=RMSNorm,
cross_attention=IdentityOp,
pre_cross_attn_layernorm=IdentityOp,
self_attention=ModuleSpec(
module=SelfAttentionContiguous,
submodules=self_attn_submodules,
),
pre_mlp_layernorm=RMSNorm,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear)
)
)
cross_attn_layer_submodule = TransformerLayerSubmodules(
input_layernorm=IdentityOp,
cross_attention=ModuleSpec(
module=SharedKVCrossAttention,
submodules=cross_attn_submodules,
),
pre_cross_attn_layernorm=RMSNorm,
self_attention=IdentityOp,
pre_mlp_layernorm=RMSNorm,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear)
)
)
self_attn_submodules_spec = ModuleSpec(
module=TransformerLayer,
submodules=self_attn_layer_submodule
)
cross_attn_submodules_spec = ModuleSpec(
module=TransformerLayer,
submodules=cross_attn_layer_submodule
)
layer_submodules.append(self_attn_submodules_spec)
layer_submodules.append(cross_attn_submodules_spec)
self.submodules_spec = TransformerBlockSubmodules(
layer_specs=layer_submodules,
layer_norm=RMSNorm
)

self.net = TransformerBlock(
config=self.config,
spec=self.submodules_spec,
post_layer_norm=True
)

self.inputs, weight_dict = get_init_params(self.config)
self.net.load_state_dict(weight_dict, strict=False)
self.hidden_states = self.inputs.get("hidden_states")
self.attention_mask = self.inputs.get("attention_mask")
self.rotary_pos_emb = self.inputs.get("rotary_pos_emb")

def run_test(self, accuracy=True, compare_type=None):
"""Helper function to run test and check results"""

output, _ = self.net(self.hidden_states,
attention_mask=self.attention_mask,
rotary_pos_emb=self.rotary_pos_emb
)
npu_output = output.asnumpy()
if accuracy:
gpu_output = GPU_DATA[compare_type]
golden_output = GOLDEN_DATA[compare_type]
assert DoubleBenchmarkComparator.check_pass_or_not(npu_output, gpu_output, golden_output), (
f"TransformerBlock with SlidingWindowAttention compare_type={compare_type} test failed.\n"
f"NPU output:\n{npu_output}\n\n"
f"GPU output:\n{gpu_output}\n\n"
f"Golden output:\n{golden_output}"
)

@pytest.mark.level1
@pytest.mark.platform_arm_ascend910b_training
@pytest.mark.env_onecard
def test_transformer_block_with_yoco(self):
"""
Feature: TransformerBlock
Description: Test Case: TransformerBlock with YOCO (self-attention and shared-KV cross-attention layers)
"""
self.run_test(compare_type="output")
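
The two-layer block tested above mirrors the YOCO layout: the first num_encoder_layers use sliding-window self-attention, and the remaining num_decoder_layers reuse a single shared key/value projection (the adapter.k_proj / adapter.v_proj weights) through SharedKVCrossAttention. A schematic sketch of how such a spec list could be assembled for a deeper model (layer counts are illustrative, not part of this change set):

def build_yoco_layer_specs(num_encoder_layers, num_decoder_layers,
                           self_attn_spec, cross_attn_spec):
    # Self-attention layers first, then shared-KV cross-attention layers.
    return [self_attn_spec] * num_encoder_layers + [cross_attn_spec] * num_decoder_layers

# build_yoco_layer_specs(1, 1, self_attn_submodules_spec, cross_attn_submodules_spec)
# reproduces the layer_specs arrangement used in setup_method above.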
