10 Commits

Author SHA1 Message Date
  i-robot b13b1f16e7
!17993 add custom pass tutorial 16 hours ago
  yuchaojie 027aeade1b add custom pass tutorial 1 day ago
  i-robot 8c1666eed5
!17986 fix docs for custom backend 16 hours ago
  i-robot 4f971c41f4
!17987 [master][bugfix][ckpt] Replace llama with qwen2.5 in related examples 18 hours ago
  i-robot 9665fb8909
!17985 Revise the accuracy documentation 18 hours ago
  i-robot c59348aff6
!17991 Fix content generation errors on some Lite documentation pages 18 hours ago
  SaiYao f1c8ffe45f [master][bugfix][ckpt] Replace llama with qwen2.5 in related examples 1 day ago
  myprigitee 254ac25e82 Fix content generation errors on some Lite documentation pages 21 hours ago
  hb_hubin 9f57d19ec5 fix docs for custom backend 1 day ago
  zhangyihuiben 0e525eeb1c Revise the accuracy documentation 1 day ago
12 changed files with 1024 additions and 267 deletions
  1. CODEOWNERS (+1 -0)
  2. docs/lite/api/source_en/conf.py (+43 -26)
  3. docs/mindformers/docs/source_en/advanced_development/accuracy_comparison.md (+69 -69)
  4. docs/mindformers/docs/source_en/feature/ckpt.md (+84 -68)
  5. docs/mindformers/docs/source_zh_cn/advanced_development/accuracy_comparison.md (+1 -1)
  6. docs/mindformers/docs/source_zh_cn/feature/ckpt.md (+95 -79)
  7. tutorials/source_en/custom_program/custom_backend.md (+21 -12)
  8. tutorials/source_en/custom_program/custom_pass.md (+345 -0)
  9. tutorials/source_en/index.rst (+1 -0)
  10. tutorials/source_zh_cn/custom_program/custom_backend.md (+21 -12)
  11. tutorials/source_zh_cn/custom_program/custom_pass.md (+342 -0)
  12. tutorials/source_zh_cn/index.rst (+1 -0)

+ 1
- 0
CODEOWNERS

@@ -17,6 +17,7 @@
/tutorials/source_zh_cn/beginner/ @zwiori @gemini524
/tutorials/source_zh_cn/compile/ @ginfung @gemini524
/tutorials/source_zh_cn/custom_program/custom_backend.md @limingqi107 @gemini524
/tutorials/source_zh_cn/custom_program/custom_pass.md @limingqi107 @gemini524
/tutorials/source_zh_cn/custom_program/op_custom.rst @chujinjin @gemini524
/tutorials/source_zh_cn/custom_program/operation/ @chujinjin @gemini524
/tutorials/source_zh_cn/custom_program/hook_program.ipynb @zwiori @gemini524


+ 43
- 26
docs/lite/api/source_en/conf.py

@@ -578,37 +578,17 @@ folder_converter = '../include/converter/include'
# Find files with the same name and delete the duplicates under converter
find_common_files2del(folder_runtime, folder_converter)

def replace_key_struct_in_type_h(file_path):
    # Original code block
    original_code = '''
using Key = struct MS_API Key {
  size_t max_key_len = 32;
  size_t len = 0;
  unsigned char key[32] = {0};
  Key() : len(0) {}
  explicit Key(const char *dec_key, size_t key_len);
};'''

    # New code block
    new_code = '''
struct MS_API Key {
  size_t max_key_len = 32;
  size_t len = 0;
  unsigned char key[32] = {0};
  Key() : len(0) {}
  explicit Key(const char *dec_key, size_t key_len);
};

using Key = Key;'''
# Fix content generation errors on some Lite pages
def code_content_replace(original_code, new_code, file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        print(f"Failed to read file: {e}")
        return False

    modified_content = content.replace(original_code, new_code)
    print("Found an exact matching code block; performing precise replacement")
    if modified_content:
        print("Found an exact matching code block; performing precise replacement")

        try:
            with open(file_path, "w", encoding="utf-8") as f:
@@ -618,9 +598,46 @@ using Key = Key;'''
        except Exception as e:
            print(f"Failed to write file: {e}")
            return False

original_code = '''
using Key = struct MS_API Key {
  size_t max_key_len = 32;
  size_t len = 0;
  unsigned char key[32] = {0};
  Key() : len(0) {}
  explicit Key(const char *dec_key, size_t key_len);
};'''
new_code = '''
struct MS_API Key {
  size_t max_key_len = 32;
  size_t len = 0;
  unsigned char key[32] = {0};
  Key() : len(0) {}
  explicit Key(const char *dec_key, size_t key_len);
};

using Key = Key;'''
types_h_path = "../include/runtime/include/api/types.h"
replace_key_struct_in_type_h(types_h_path)
code_content_replace(original_code, new_code, types_h_path)

original_code = '''
/// \\brief Get the value with the given type from a node if it is a ValueNode.'''
new_code = '''
/// \\overload // Tell Doxygen that this is an independent overload and generate a separate entry
/// \\brief Get the value with the given type from a node if it is a ValueNode.'''
anf_h_path = "../include/converter/include/mindapi/ir/anf.h"
code_content_replace(original_code, new_code, anf_h_path)

original_code = '''using CreateKernel = std::function<std::shared_ptr<kernel::Kernel>(
  const std::vector<MSTensor> &inputs, const std::vector<MSTensor> &outputs, const schema::Primitive *primitive,
  const mindspore::Context *ctx)>;'''
new_code = '''std::shared_ptr<kernel::Kernel> CreateKernel(
  const std::vector<MSTensor> &inputs, const std::vector<MSTensor> &outputs, const schema::Primitive *primitive,
  const mindspore::Context *ctx);

using CreateKernel = std::function<decltype(CreateKernel)>;'''
register_kernel_h_path = "../include/runtime/include/registry/register_kernel.h"
code_content_replace(original_code, new_code, register_kernel_h_path)

# for file_name in fileList:
# file_data = ''
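
# A hedged sketch of a stricter code_content_replace variant: it writes the
# file and reports a match only when original_code actually occurs in the
# content, so the log cannot claim a replacement that never happened. The name
# code_content_replace_strict is illustrative, not part of this conf.py.
def code_content_replace_strict(original_code, new_code, file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except OSError as e:
        print(f"Failed to read file: {e}")
        return False
    if original_code not in content:
        print("No exact matching code block found; file left unchanged")
        return False
    try:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content.replace(original_code, new_code))
    except OSError as e:
        print(f"Failed to write file: {e}")
        return False
    print("Found an exact matching code block; performed precise replacement")
    return True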


+ 69
- 69
docs/mindformers/docs/source_en/advanced_development/accuracy_comparison.md

@@ -55,75 +55,75 @@ The following tables describe the configuration comparison with Megatron-LM.

This document supports only the precision comparison of the mcore model. Therefore, `--use-mcore-model` must be configured for Megatron-LM, and `use_legacy: False` must be configured for MindSpore Transformers.

| Megatron-LM | Description | MindSpore Transformers | Description |
|--------------------------------------------|---------------------------------------------|--------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| `use-legacy-model` and `use-mcore-model` | Specifies whether to use the mcore model. | `use_legacy` | Specifies whether to use the mcore model. `use_legacy: False` is equivalent to `--use-mcore-model`. |
| `num-layers` | Number of network layers, that is, number of transformer layers. | `num_layers` | Number of network layers, that is, number of transformer layers. |
| `encoder-num-layers` | Number of encoder layers. | Not supported. | |
| `decoder-num-layers` | Number of decoder layers. | Not supported. | |
| `hidden-size` | Size of the hidden layer, which is the dimension in the hidden state. | `hidden_size` | Size of the hidden layer, which is the dimension in the hidden state. |
| `ffn-hidden-size` | Size of the hidden layer in the feedforward network. | `intermediate_size` | Size of the hidden layer in the feedforward network. |
| `num-attention-heads` | Number of attention heads. | `num_heads` | Number of attention heads. |
| `kv-channels` | Number of key/value tensor channels. | `head_dim` | Number of key/value tensor channels. |
| `group-query-attention` | Specifies whether to enable group query attention. | `use_gqa` | Specifies whether to enable group query attention. |
| `num-query-groups` | Number of query groups. | `n_kv_heads` | Number of query groups. |
| `max-position-embeddings` | Maximum position encoding length. | `max_position_embeddings` | Maximum position encoding length. |
| `position-embedding-type` | Position encoding type, such as learned_absolute and rope. | `position_embedding_type` | Position encoding type, such as learned_absolute and rope. |
| `use-rotary-position-embeddings` | Specifies whether to use rotary position embedding (RoPE). | Specified by `position_embedding_type`==`rope` | Specifies whether to use RoPE. |
| `rotary-base` | Rotary base used for RoPE. | `rotary_base` | Rotary base used for RoPE. |
| `rotary-percent` | RoPE usage ratio. | `rotary_percent` | RoPE usage ratio. |
| `rotary-interleaved` | Specifies whether to use interleaved RoPE. | `rotary_interleaved` | Specifies whether to use interleaved RoPE. |
| `rotary-seq-len-interpolation-factor` | Rotary sequence length interpolation factor. | `rotary_seq_len_interpolation_factor` | Rotary sequence length interpolation factor. |
| `use-rope-scaling` | Specifies whether to enable RoPE scaling. | `use_rope_scaling` | Specifies whether to enable RoPE scaling. |
| `rope-scaling-factor` | RoPE scaling factor. | `scaling_factor` | RoPE scaling factor. |
| `no-position-embedding` | Specifies whether to disable location encoding. | `no-position-embedding` | Specifies whether to disable location encoding. |
| `disable-bias-linear` | Disables bias in linear layers. | `add_bias_linear` | Enables bias in linear layers. |
| `mrope-section` | Information of multiple RoPE sections. | Not supported. | |
| `make-vocab-size-divisible-by` | Divides the size of the word table by a specified number. | Not supported. | By default, the dictionary size is not changed. |
| `init-method-std` | Standard deviation of the normal distribution used during model parameter initialization. | `init_method_std` | Standard deviation of the normal distribution used during model parameter initialization. |
| `attention-dropout` | Dropout probability applied in the multi-head self-attention mechanism. | `attention_dropout` | Dropout probability applied in the multi-head self-attention mechanism. |
| `hidden-dropout` | Dropout probability in the hidden layer. | `hidden_dropout` | Dropout probability in the hidden layer. |
| `normalization` | Normalization method, which can be LayerNorm or RMSNorm. | `normalization` | Normalization method, which can be LayerNorm or RMSNorm. |
| `norm-epsilon` | Normalized stability factor (epsilon). | `rms_norm_eps` | RMSNorm stability factor. |
| `apply-layernorm-1p` | Specifies whether to add 1 after LayerNorm. | Not supported. | |
| `apply-residual-connection-post-layernorm` | Specifies whether the residual connection is applied after LayerNorm. | `apply_residual_connection_post_layernorm` | Specifies whether the residual connection is applied after LayerNorm. |
| `openai-gelu` | Specifies whether to use the GELU activation function of the OpenAI version. | Not supported. | |
| `squared-relu` | Specifies whether to use the square ReLU activation function. | Not supported. | |
| Specified by `swiglu`, `openai-gelu`, and `squared-relu` | The default value is **torch.nn.functional.gelu**. | `hidden_act` | Activation function type. |
| `gated_linear_unit` | Specifies whether to use gate linear unit in multi-layer perceptron (MLP). | `gated_linear_unit` | Specifies whether to use gate linear unit in MLP. |
| `swiglu` | Specifies whether to use the SwiGLU activation function. | `hidden_act`==`silu` and `gated_linear_unit`| Specifies whether to use the SwiGLU activation function. |
| `no-persist-layer-norm` | Disables persistence layer normalization. | Not supported. | |
| `untie-embeddings-and-output-weights` | Specifies whether to decouple the weights of the input embedding layer and output layer. | `untie_embeddings_and_output_weights` | Specifies whether to decouple the weights of the input embedding layer and output layer. |
| Specified by `fp16` and `bf16` | Tensor compute precision during training. | `compute_dtype` | Tensor compute precision during training. |
| `grad-reduce-in-bf16` | Gradient reduction using BFloat16. | Not supported. | |
| Not supported. | By default, the initialization tensor is generated in BFloat16 format. | `param_init_type` | Initial precision of the weight tensor. The default value is **Float32**, which ensures that the backward gradient is updated in Float32. |
| Not supported. | By default, layer normalization is calculated in Float32. | `layernorm_compute_type` | Layer normalization tensor calculation precision. |
| `attention-softmax-in-fp32` | Executes **attention softmax** in Float32. | `softmax_compute_type` | Softmax tensor calculation precision. |
| Not supported. | | `rotary_dtype` | Position encoding tensor calculation precision. |
| `loss-scale` | Overall loss scaling factor. | `loss_scale_value` | Overall loss scaling factor, which is configured in **runner_wrapper**. If `compute_dtype` is set to **BFloat16**, the value is usually set to **1.0**. |
| `initial-loss-scale` | Initial loss scaling factor. | Not supported. | |
| `min-loss-scale` | Minimum loss scaling factor. | Not supported. | |
| `loss-scale-window` | Dynamic window size scaling. | `loss_scale_window` | Dynamic window size scaling. |
| `hysteresis` | Loss scale hysteresis parameter. | Not supported. | |
| `fp32-residual-connection` | Uses Float32 for residual connection. | Not supported. | |
| `accumulate-allreduce-grads-in-fp32` | Accumulates and reduces gradients using Float32. | Not supported. | Accumulates and reduces gradients using Float32 by default. |
| `fp16-lm-cross-entropy` | Uses Float16 to execute the cross entropy of the LLM. | Not supported. | Uses Float32 to execute the cross entropy of the LLM by default. |
| `q-lora-rank` | LoRA rank of the query projection layer, which is used when Q-LoRA is enabled. | `q_lora_rank` | LoRA rank of the query projection layer, which is used when Q-LoRA is enabled. |
| `kv-lora-rank` | LoRA rank of the key/value projection layer, which is used when KV-LoRA is enabled. | `kv_lora_rank` | LoRA rank of the key/value projection layer, which is used when KV-LoRA is enabled. |
| `qk-head-dim` | Number of dimensions per Q/K head. | `qk_nope_head_dim` | Number of dimensions per Q/K head. |
| `qk-pos-emb-head-dim` | Number of relative position embedding dimensions per Q/K head. | `qk_rope_head_dim` | Number of relative position embedding dimensions per Q/K head. |
| `v-head-dim` | Number of dimensions per value projection (V head). | `v_head_dim` | Number of dimensions per value projection (V head). |
| `rotary-scaling-factor` | RoPE scaling coefficient.| `scaling_factor` | RoPE scaling coefficient. |
| `use-precision-aware-optimizer` | Enables the optimizer with precision awareness to automatically manage parameter updates of different data types. | Not supported. | |
| `main-grads-dtype` | Data type of the main gradient. | Not supported. | By default, Float32 is used as the data type of the main gradient. |
| `main-params-dtype` | Data type of the main parameter. | Not supported. | By default, Float32 is used as the data type of the main parameter. |
| `exp-avg-dtype` | Data type of the exponential moving average (EMA). | Not supported. | |
| `exp-avg-sq-dtype` | Data type of the EMA square item. | Not supported. | |
| `first-last-layers-bf16` | Specifies whether to forcibly use BFloat16 at the first and last layers. | Not supported. | |
| `num-layers-at-start-in-bf16` | Number of layers that start with BFloat16. | Not supported. | |
| `num-layers-at-end-in-bf16` | Number of layers that end with BFloat16. | Not supported. | |
| `multi-latent-attention` | Specifies whether to enable the multi-hidden variable attention mechanism. | `multi_latent_attention` | Specifies whether to enable the multi-hidden variable attention mechanism. |
| `qk-layernorm` | Enables query/key layer normalization. | `qk-layernorm` | Enables query/key layer normalization. |
| Megatron-LM | Description | MindSpore Transformers | Description |
|--------------------------------------------|-------------------------------------------------------------------------------------------------------------------|------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| `use-legacy-model` and `use-mcore-model` | Specifies whether to use the mcore model. | `use_legacy` | Specifies whether to use the mcore model. `use_legacy: False` is equivalent to `--use-mcore-model`. |
| `num-layers` | Number of network layers, that is, number of transformer layers. | `num_layers` | Number of network layers, that is, number of transformer layers. |
| `encoder-num-layers` | Number of encoder layers. | Not supported. | |
| `decoder-num-layers` | Number of decoder layers. | Not supported. | |
| `hidden-size` | Size of the hidden layer, which is the dimension in the hidden state. | `hidden_size` | Size of the hidden layer, which is the dimension in the hidden state. |
| `ffn-hidden-size` | Size of the hidden layer in the feedforward network. | `intermediate_size` | Size of the hidden layer in the feedforward network. |
| `num-attention-heads` | Number of attention heads. | `num_heads` | Number of attention heads. |
| `kv-channels` | Number of key/value tensor channels. | `head_dim` | Number of key/value tensor channels. |
| `group-query-attention` | Specifies whether to enable group query attention. | `use_gqa` | Specifies whether to enable group query attention. |
| `num-query-groups` | Number of query groups. | `n_kv_heads` | Number of query groups. |
| `max-position-embeddings` | Maximum position encoding length. | `max_position_embeddings` | Maximum position encoding length. |
| `position-embedding-type` | Position encoding type, such as learned_absolute and rope. | `position_embedding_type` | Position encoding type, such as learned_absolute and rope. |
| `use-rotary-position-embeddings` | Specifies whether to use rotary position embedding (RoPE). | Specified by `position_embedding_type`==`rope` | Specifies whether to use RoPE. |
| `rotary-base` | Rotary base used for RoPE. | `rotary_base` | Rotary base used for RoPE. |
| `rotary-percent` | RoPE usage ratio. | `rotary_percent` | RoPE usage ratio. |
| `rotary-interleaved` | Specifies whether to use interleaved RoPE. | `rotary_interleaved` | Specifies whether to use interleaved RoPE. |
| `rotary-seq-len-interpolation-factor` | Rotary sequence length interpolation factor. | `rotary_seq_len_interpolation_factor` | Rotary sequence length interpolation factor. |
| `use-rope-scaling` | Specifies whether to enable RoPE scaling. | `use_rope_scaling` | Specifies whether to enable RoPE scaling. |
| `rope-scaling-factor` | RoPE scaling factor. | `scaling_factor` | RoPE scaling factor. |
| `no-position-embedding`                    | Specifies whether to disable position encoding.                                                                    | `no_position_embedding`                        | Specifies whether to disable position encoding.                                                                                                           |
| `disable-bias-linear` | Disables bias in linear layers. | `add_bias_linear` | Enables bias in linear layers. |
| `mrope-section` | Information of multiple RoPE sections. | Not supported. | |
| `make-vocab-size-divisible-by`             | Pads the vocabulary size so that it is divisible by the specified value.                                           | Not supported.                                 | By default, the vocabulary size is not changed.                                                                                                           |
| `init-method-std` | Standard deviation of the normal distribution used during model parameter initialization. | `init_method_std` | Standard deviation of the normal distribution used during model parameter initialization. |
| `attention-dropout` | Dropout probability applied in the multi-head self-attention mechanism. | `attention_dropout` | Dropout probability applied in the multi-head self-attention mechanism. |
| `hidden-dropout` | Dropout probability in the hidden layer. | `hidden_dropout` | Dropout probability in the hidden layer. |
| `normalization` | Normalization method, which can be LayerNorm or RMSNorm. | `normalization` | Normalization method, which can be LayerNorm or RMSNorm. |
| `norm-epsilon` | Normalized stability factor (epsilon). | `rms_norm_eps` | RMSNorm stability factor. |
| `apply-layernorm-1p` | Specifies whether to add 1 after LayerNorm. | Not supported. | |
| `apply-residual-connection-post-layernorm` | Specifies whether the residual connection is applied after LayerNorm. | `apply_residual_connection_post_layernorm` | Specifies whether the residual connection is applied after LayerNorm. |
| `openai-gelu` | Specifies whether to use the GELU activation function of the OpenAI version. | Not supported. | |
| `squared-relu` | Specifies whether to use the square ReLU activation function. | Not supported. | |
| Specified by `swiglu`, `openai-gelu`, and `squared-relu` | The default value is **torch.nn.functional.gelu**. | `hidden_act` | Activation function type. |
| `gated_linear_unit` | Specifies whether to use gate linear unit in multi-layer perceptron (MLP). | `gated_linear_unit` | Specifies whether to use gate linear unit in MLP. |
| `swiglu` | Specifies whether to use the SwiGLU activation function. | `hidden_act` == `silu` and `gated_linear_unit` | Specifies whether to use the SwiGLU activation function. |
| `no-persist-layer-norm` | Disables persistence layer normalization. | Not supported. | |
| `untie-embeddings-and-output-weights` | Specifies whether to decouple the weights of the input embedding layer and output layer. | `untie_embeddings_and_output_weights` | Specifies whether to decouple the weights of the input embedding layer and output layer. |
| Specified by `fp16` and `bf16` | Tensor compute precision during training. | `compute_dtype` | Tensor compute precision during training. |
| `grad-reduce-in-bf16` | Gradient reduction using BFloat16. | Not supported. | |
| Not supported. | By default, the initialization tensor is generated in BFloat16 format. | `param_init_type` | Initial precision of the weight tensor. The default value is **Float32**, which ensures that the backward gradient is updated in Float32. |
| Not supported. | By default, layer normalization is calculated in Float32. | `layernorm_compute_type` | Layer normalization tensor calculation precision. |
| `attention-softmax-in-fp32` | Executes **attention softmax** in Float32. | `softmax_compute_type` | Softmax tensor calculation precision. |
| Not supported. | | `rotary_dtype` | Position encoding tensor calculation precision. |
| `loss-scale` | Overall loss scaling factor. | `loss_scale_value` | Overall loss scaling factor, which is configured in **runner_wrapper**. If `compute_dtype` is set to **BFloat16**, the value is usually set to **1.0**. |
| `initial-loss-scale` | Initial loss scaling factor. | Not supported. | |
| `min-loss-scale` | Minimum loss scaling factor. | Not supported. | |
| `loss-scale-window` | Dynamic window size scaling. | `loss_scale_window` | Dynamic window size scaling. |
| `hysteresis` | Loss scale hysteresis parameter. | Not supported. | |
| `fp32-residual-connection` | Uses Float32 for residual connection. | `fp32_residual_connection` | Uses Float32 for residual connection. |
| `accumulate-allreduce-grads-in-fp32` | Accumulates and reduces gradients using Float32. | Not supported. | Accumulates and reduces gradients using Float32 by default. |
| `fp16-lm-cross-entropy` | Uses Float16 to execute the cross entropy of the LLM. | Not supported. | Uses Float32 to execute the cross entropy of the LLM by default. |
| `q-lora-rank` | LoRA rank of the query projection layer, which is used when Q-LoRA is enabled. | `q_lora_rank` | LoRA rank of the query projection layer, which is used when Q-LoRA is enabled. |
| `kv-lora-rank` | LoRA rank of the key/value projection layer, which is used when KV-LoRA is enabled. | `kv_lora_rank` | LoRA rank of the key/value projection layer, which is used when KV-LoRA is enabled. |
| `qk-head-dim` | Number of dimensions per Q/K head. | `qk_nope_head_dim` | Number of dimensions per Q/K head. |
| `qk-pos-emb-head-dim` | Number of relative position embedding dimensions per Q/K head. | `qk_rope_head_dim` | Number of relative position embedding dimensions per Q/K head. |
| `v-head-dim` | Number of dimensions per value projection (V head). | `v_head_dim` | Number of dimensions per value projection (V head). |
| `rotary-scaling-factor` | RoPE scaling coefficient. | `scaling_factor` | RoPE scaling coefficient. |
| `use-precision-aware-optimizer` | Enables the optimizer with precision awareness to automatically manage parameter updates of different data types. | Not supported. | |
| `main-grads-dtype` | Data type of the main gradient. | Not supported. | By default, Float32 is used as the data type of the main gradient. |
| `main-params-dtype` | Data type of the main parameter. | Not supported. | By default, Float32 is used as the data type of the main parameter. |
| `exp-avg-dtype` | Data type of the exponential moving average (EMA). | Not supported. | |
| `exp-avg-sq-dtype` | Data type of the EMA square item. | Not supported. | |
| `first-last-layers-bf16` | Specifies whether to forcibly use BFloat16 at the first and last layers. | Not supported. | |
| `num-layers-at-start-in-bf16` | Number of layers that start with BFloat16. | Not supported. | |
| `num-layers-at-end-in-bf16` | Number of layers that end with BFloat16. | Not supported. | |
| `multi-latent-attention`                   | Specifies whether to enable multi-latent attention (MLA).                                                          | `multi_latent_attention`                       | Specifies whether to enable multi-latent attention (MLA).                                                                                                 |
| `qk-layernorm`                             | Enables query/key layer normalization.                                                                             | `qk_layernorm`                                 | Enables query/key layer normalization.                                                                                                                    |
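
For a quick sanity check of how the table reads in practice, here is a small sketch of ours (not an API of either framework) that renames a handful of Megatron-LM argument keys to the MindSpore Transformers configuration keys listed above:

```python
# Illustrative mapping covering a few rows of the comparison table above.
MEGATRON_TO_MINDFORMERS = {
    "num-layers": "num_layers",
    "hidden-size": "hidden_size",
    "ffn-hidden-size": "intermediate_size",
    "num-attention-heads": "num_heads",
    "num-query-groups": "n_kv_heads",
    "norm-epsilon": "rms_norm_eps",
    "rotary-base": "rotary_base",
}

def to_mindformers(megatron_args: dict) -> dict:
    """Return the MindSpore Transformers keys for the supported subset."""
    return {MEGATRON_TO_MINDFORMERS[k]: v
            for k, v in megatron_args.items() if k in MEGATRON_TO_MINDFORMERS}

print(to_mindformers({"num-layers": 32, "hidden-size": 4096, "norm-epsilon": 1e-5}))
# {'num_layers': 32, 'hidden_size': 4096, 'rms_norm_eps': 1e-05}
```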

- Optimizer and learning rate scheduling configurations



+ 84
- 68
docs/mindformers/docs/source_en/feature/ckpt.md

@@ -36,10 +36,10 @@ python convert_weight.py [-h] --model MODEL [--reversed] --input_path INPUT_PATH

### Conversion Example

Assume that you have downloaded the [Llama3.1 model weight](https://gitee.com/mindspore/mindformers/blob/master/research/llama3_1/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD) and saved it in the `/home/user/torch_weights` path, to convert it to the MindSpore Transformers weight and save it in the `/home/user/ms_weights` path, run the following command:
Assume that you have downloaded the [Qwen2.5 model weight](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD) and saved it in the `/home/user/torch_weights` path, to convert it to the MindSpore Transformers weight and save it in the `/home/user/ms_weights` path, run the following command:

```bash
python convert_weight.py --model llama --input_path /home/user/torch_weights --output_path /home/user/ms_weights/llama.ckpt
python convert_weight.py --model qwen2_5 --input_path /home/user/torch_weights --output_path /home/user/ms_weights/qwen2_5.ckpt
```

After the preceding steps are performed, the HuggingFace weight is successfully converted to a MindSpore Transformers weight, facilitating model training or inference on MindSpore Transformers.
@@ -48,11 +48,9 @@ After the preceding steps are performed, the HuggingFace weight is successfully

| Parameter Value | Supported models |
|-----------------|------------------------------|
| llama | Llama3.1 |
| glm-n | GLM4 |
| qwen | Qwen2.5 |
| qwen2_5 | Qwen2.5 |
| mixtral | Mixtral |
| deepseek | DeepSeekV3 |

### Developing Weight Conversion for Unsupported Models

@@ -63,60 +61,78 @@ After the preceding steps are performed, the HuggingFace weight is successfully

### Example of Developing Model Weight Conversion

Llama is used as an example. To convert a HuggingFace weight to a MindSpore Transformers one, define the `convert_pt_to_ms` function in [convert_weight.py](https://gitee.com/mindspore/mindformers/blob/master/mindformers/models/llama/convert_weight.py).
[GLM-4](https://gitee.com/mindspore/mindformers/blob/master/docs/model_cards/glm4.md) is used as an example. To convert a HuggingFace weight to a MindSpore Transformers one, define the `convert_pt_to_ms` function in [convert_weight.py](https://gitee.com/mindspore/mindformers/blob/master/mindformers/models/glm2/convert_weight.py).

```python
def convert_pt_to_ms(input_path, output_path, dtype=None, **kwargs):
    """convert hf weight to ms."""
    print(f"Trying to convert huggingface checkpoint in '{input_path}'.", flush=True)
    try:
        from transformers import LlamaForCausalLM
    except:
        raise ImportError(f"Failed to load huggingface checkpoint. Please make sure transformers is available.")

    try:
        model_hf = LlamaForCausalLM.from_pretrained(os.path.dirname(input_path))
    except Exception as e:
        print(f"Do not find huggingface checkpoint in '{os.path.dirname(input_path)}', Error {e.message}.", flush=True)
        return False
    ckpt_list = []
    for name, value in model_hf.state_dict().items():
        name = name_replace(name)
        if name == 'norm.weight':
            name = 'norm_out.weight'
        if name[:7] == 'layers.':
            name = name[7:]

        print(f'\rprocessing parameter: {name} {value.shape} ', end='', flush=True)
        ckpt_list.append({'name': name, 'data': pt2ms(value, dtype)})

    ms.save_checkpoint(ckpt_list, output_path)
    print(f"\rConvert huggingface checkpoint finished, the mindspore checkpoint is saved in '{output_path}'.",
          flush=True)
    return True
def convert_pt_to_ms(input_path, output_path, config, dtype=ms.float32, **kwargs):
    """ Convert pytorch model file to MindSpore model file. """
    config: ChatGLM2Config = MindFormerConfig(config)['model']['model_config']
    config = ChatGLM2Config(**config)
    model = AutoModel.from_pretrained(input_path)

    print('parameter convert....')
    ms_param = []
    for k, v in tqdm(model.state_dict().items()):
        if "word_embeddings.weight" in k:
            k = k.replace("word_embeddings.weight", "embedding_weight")
        ms_param.append({"name": k, "data": v})
    # qkv weight split
    if not config.qkv_concat or config.use_rearrange_rope:
        attn_split(ms_param, config, dtype)

    # mlp weight split
    if not config.mlp_concat:
        mlp_split(ms_param, config, dtype)

    tmp_list = []
    pop_list = []
    for i, item in enumerate(ms_param):
        k, v = item["name"], item["data"]
        if not isinstance(v, ms.Tensor):
            tmp_list.append({"name": k, "data": pt2ms(v, dtype)})
            pop_list.append(i)
    for i in reversed(pop_list):
        ms_param.pop(i)
    ms_param += tmp_list

    ms.save_checkpoint(ms_param, output_path)
    print(f"Convert finished, the output is saved to {output_path}")
```

To convert a MindSpore Transformers weight to a HuggingFace one, define the `convert_ms_to_pt` function in [convert_reversed.py](https://gitee.com/mindspore/mindformers/blob/master/mindformers/models/llama/convert_reversed.py).
To convert a MindSpore Transformers weight to a HuggingFace one, define the `convert_ms_to_pt` function in [convert_reversed.py](https://gitee.com/mindspore/mindformers/blob/master/mindformers/models/glm2/convert_reversed.py).

```python
def convert_ms_to_pt(input_path, output_path, dtype=None, **kwargs):
    """convert ms weight to hf."""
    print(f"Trying to convert mindspore checkpoint in '{input_path}'.", flush=True)
    model_ms = ms.load_checkpoint(input_path)

    state_dict = {}
    for name, value in model_ms.items():
        name = name_replace(name)
        print(f'\rprocessing parameter: {name} {value.shape} ', end='', flush=True)
        if is_lora_param(name):
            name = name.replace('.tk_delta_lora_a', '.lora_A.weight')
            name = name.replace('.tk_delta_lora_b', 'lora_B.weight')
        state_dict[name] = ms2pt(value, dtype)

    torch.save(state_dict, output_path)
    print(f"\rConvert mindspore checkpoint finished, the huggingface checkpoint is saved in '{output_path}'.",
          flush=True)
    return True
def convert_ms_to_pt(input_path, output_path, config, dtype=torch.float32, **kwargs):
    """ Convert MindSpore model file to pytorch model file. """
    ckpt_dict = ms.load_checkpoint(input_path)
    print('parameter convert....')
    pt_param = {}
    for k, v in tqdm(ckpt_dict.items()):
        v = ms2pt(v, dtype)
        if "embedding_weight" in k:
            k = k.replace("embedding_weight", "word_embeddings.weight")
        if is_lora_param(k):
            k = k.replace(".tk_delta_lora_a", ".lora_A.weight")
            k = k.replace(".tk_delta_lora_b", ".lora_B.weight")
        pt_param[k] = v

    # Load the model config to decide how the weights were stored.
    config: ChatGLM2Config = MindFormerConfig(config)['model']['model_config']
    config = ChatGLM2Config(**config)

    # qkv weight merge
    if not config.qkv_concat:
        attn_merge(pt_param, config)
    else:
        attn_rearange(pt_param, config)

    # mlp weight merge
    if not config.mlp_concat:
        mlp_merge(pt_param)

    print('saving pt ckpt....')
    torch.save(pt_param, output_path)
    print(f"Convert finished, the output is saved to {output_path}")
```
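
Both directions rely on the `pt2ms` and `ms2pt` tensor helpers imported by these scripts. As a rough sketch of what such helpers do, assuming a plain NumPy round trip (the real implementations may handle more dtypes and edge cases):

```python
import mindspore as ms
import torch

def pt2ms(value: torch.Tensor, dtype) -> ms.Tensor:
    """Convert a PyTorch tensor to a MindSpore tensor, optionally casting dtype."""
    if value.dtype == torch.bfloat16:
        # NumPy has no bfloat16, so go through float32 first.
        value = value.to(torch.float32)
    np_value = value.detach().cpu().numpy()
    return ms.Tensor(np_value, dtype=dtype) if dtype is not None else ms.Tensor(np_value)

def ms2pt(value: ms.Tensor, dtype) -> torch.Tensor:
    """Convert a MindSpore tensor to a PyTorch tensor, optionally casting dtype."""
    pt_value = torch.from_numpy(value.asnumpy())
    return pt_value.to(dtype) if dtype is not None else pt_value
```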

## Distributed Weight Slicing and Merging
@@ -149,7 +165,7 @@ Parameters in the `yaml` file related to **automatic weight conversion** are des

```yaml
# load_checkpoint: specifies path of the pre-trained weight file.
load_checkpoint: "/worker/llama3_8b/llama3_8b.ckpt"
load_checkpoint: "/worker/qwen2_5-7b/qwen2_5-7b.ckpt"

# auto_trans_ckpt: specifies whether to enable automatic conversion.
auto_trans_ckpt: True
@@ -159,10 +175,10 @@ auto_trans_ckpt: True

```yaml
# load_checkpoint: specifies the path of the multi-device weight folder.
load_checkpoint: "/worker/checkpoint/llama3-8b-2layer-dp2mp2pp2"
load_checkpoint: "/worker/checkpoint/qwen2_5-7b-2layer-dp2mp2pp2"

# src_strategy_path_or_dir: specifies the path of the distributed strategy file.
src_strategy_path_or_dir: "/worker/checkpoint/llama3-8b-2layer-dp2mp2pp2/strategy/merged_ckpt_strategy.ckpt"
src_strategy_path_or_dir: "/worker/checkpoint/qwen2_5-7b-2layer-dp2mp2pp2/strategy/merged_ckpt_strategy.ckpt"

# auto_trans_ckpt: specifies whether to enable automatic conversion.
auto_trans_ckpt: True
@@ -172,10 +188,10 @@ auto_trans_ckpt: True

```yaml
# load_checkpoint: specifies the path of the multi-device weight folder.
load_checkpoint: "/worker/checkpoint/llama3-8b-2layer-dp1mp2pp2"
load_checkpoint: "/worker/checkpoint/qwen2_5-7b-2layer-dp1mp2pp2"

# src_strategy_path_or_dir: specifies the path of the distributed strategy file.
src_strategy_path_or_dir: "/worker/checkpoint/llama3-8b-2layer-dp1mp2pp2/strategy/merged_ckpt_strategy.ckpt"
src_strategy_path_or_dir: "/worker/checkpoint/qwen2_5-7b-2layer-dp1mp2pp2/strategy/merged_ckpt_strategy.ckpt"

# auto_trans_ckpt: specifies whether to enable automatic conversion.
auto_trans_ckpt: True
@@ -221,8 +237,8 @@ Use [mindformers/tools/ckpt_transform/transform_checkpoint.py](https://gitee.com

```shell
python transform_checkpoint.py \
--src_checkpoint /worker/checkpoint/llama3-8b-2layer/rank_0/llama3_8b.ckpt \
--dst_checkpoint /worker/transform_ckpt/llama3_8b_1to8/ \
--src_checkpoint /worker/checkpoint/qwen2_5-7b-2layer/rank_0/qwen2_5-7b.ckpt \
--dst_checkpoint /worker/transform_ckpt/qwen2_5-7b_1to8/ \
--dst_strategy /worker/mindformers/output/strategy/ \
--prefix "checkpoint_"
```
@@ -235,9 +251,9 @@ Use [mindformers/tools/ckpt_transform/transform_checkpoint.sh](https://gitee.com

```shell
bash transform_checkpoint.sh \
/worker/checkpoint/llama3-8b-2layer/rank_0/llama3_8b.ckpt \
/worker/checkpoint/qwen2_5-7b-2layer/rank_0/qwen2_5-7b.ckpt \
None \
/worker/transform_ckpt/llama3_8b_1to8/ \
/worker/transform_ckpt/qwen2_5-7b_1to8/ \
/worker/mindformers/output/strategy/ \
8 2 "checkpoint_"
```
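
The script is a thin wrapper over MindSpore's checkpoint transformation interfaces, such as `mindspore.transform_checkpoints`. A hedged sketch of calling the interface directly, with illustrative placeholder paths:

```python
import mindspore as ms

# Transform a complete single-device checkpoint into distributed slices.
ms.transform_checkpoints(
    src_checkpoints_dir="/worker/checkpoint/qwen2_5-7b-2layer/",    # source in model_dir/rank_x/xxx.ckpt layout
    dst_checkpoints_dir="/worker/transform_ckpt/qwen2_5-7b_1to8/",  # destination for converted slices
    ckpt_prefix="checkpoint_",                                      # prefix for output ckpt files
    src_strategy_file=None,                                         # None: source is a complete weight
    dst_strategy_file="/worker/mindformers/output/strategy/merged_ckpt_strategy.ckpt",  # assumed merged strategy file
)
```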
@@ -281,7 +297,7 @@ If a unified shared storage path (such as the NFS-mounted /worker directory) is

```yaml
# Set the path of the pre-trained weight file to an absolute path.
load_checkpoint: "/worker/checkpoint/llama3-8b/rank_0/llama3_8b.ckpt"
load_checkpoint: "/worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt"

# Set auto_trans_ckpt to True to enable automatic weight conversion.
auto_trans_ckpt: True
@@ -374,8 +390,8 @@ If there is no shared path between servers, you need to use the offline weight c

```shell
python mindformers/tools/ckpt_transform/transform_checkpoint.py \
--src_checkpoint /worker/checkpoint/llama3-8b/rank_0/llama_7b.ckpt \
--dst_checkpoint ./output/llama3_8b_dp2mp4pp2 \
--src_checkpoint /worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt \
--dst_checkpoint ./output/qwen2_5-7b_dp2mp4pp2 \
--dst_strategy ./output/strategy
```

@@ -384,9 +400,9 @@ If there is no shared path between servers, you need to use the offline weight c
```shell
# Use two processes for conversion.
bash mindformers/tools/ckpt_transform/transform_checkpoint.sh \
/worker/checkpoint/llama3-8b/rank_0/llama_7b.ckpt \
/worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt \
None \
./output/llama3_8b_dp2mp4pp2 \
./output/qwen2_5-7b_dp2mp4pp2 \
./output/strategy \
16 2
```
@@ -419,7 +435,7 @@ If there is no shared path between servers, you need to use the offline weight c

```yaml
# Set the pre-trained weight path to model_dir, the distributed weight folder path.
load_checkpoint: "/worker/checkpoint/llama3_8b_dp2mp4pp2"
load_checkpoint: "/worker/checkpoint/qwen2_5-7b_dp2mp4pp2"

# Change only_save_strategy to False.
only_save_strategy: False


+ 1
- 1
docs/mindformers/docs/source_zh_cn/advanced_development/accuracy_comparison.md

@@ -105,7 +105,7 @@ Megatron-LM is a mature framework for large-scale training tasks, featuring a high
| `min-loss-scale` | Minimum loss scaling factor | Not supported | |
| `loss-scale-window` | Dynamic scaling window size | `loss_scale_window` | Dynamic scaling window size |
| `hysteresis` | Loss scaling hysteresis parameter | Not supported | |
| `fp32-residual-connection` | Uses Float32 for residual connections | Not supported | |
| `fp32-residual-connection` | Uses Float32 for residual connections | `fp32_residual_connection` | Uses Float32 for residual connections |
| `accumulate-allreduce-grads-in-fp32` | Accumulates and all-reduces gradients in Float32 | Not supported | Accumulates and all-reduces gradients in Float32 by default |
| `fp16-lm-cross-entropy` | Computes the language-model cross entropy in Float16 | Not supported | Computes the language-model cross entropy in Float32 by default |
| `q-lora-rank` | LoRA rank of the query projection layer, used when Q-LoRA is enabled | `q_lora_rank` | LoRA rank of the query projection layer, used when Q-LoRA is enabled |


+ 95
- 79
docs/mindformers/docs/source_zh_cn/feature/ckpt.md

@@ -36,23 +36,21 @@ python convert_weight.py [-h] --model MODEL [--reversed] --input_path INPUT_PATH

### Conversion Example

Assume that you have downloaded the [Llama3.1 model weights](https://gitee.com/mindspore/mindformers/blob/master/research/llama3_1/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD) and saved them in `/home/user/torch_weights`. To convert them to MindSpore Transformers weights and save them in `/home/user/ms_weights`, run the following command:
Assume that you have downloaded the [Qwen2.5 model weights](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD) and saved them in `/home/user/torch_weights`. To convert them to MindSpore Transformers weights and save them in `/home/user/ms_weights`, run the following command:

```bash
python convert_weight.py --model llama --input_path /home/user/torch_weights --output_path /home/user/ms_weights/llama.ckpt
python convert_weight.py --model qwen2_5 --input_path /home/user/torch_weights --output_path /home/user/ms_weights/qwen2_5.ckpt
```

Through the preceding steps, the HuggingFace weights are converted to MindSpore Transformers weights, ready for further model training or inference in MindSpore Transformers.

### Supported Models

| Parameter Value | Supported Models |
|----------|------------------------------|
| llama | Llama3.1 |
| glm-n | GLM4 |
| qwen | Qwen2.5 |
| mixtral | Mixtral |
| deepseek | DeepSeekV3 |
| Parameter Value | Supported Models |
|----------|---------|
| glm-n | GLM4 |
| qwen2_5 | Qwen2.5 |
| mixtral | Mixtral |

### Developing Weight Conversion for Unsupported Models

@@ -63,60 +61,78 @@ python convert_weight.py --model llama --input_path /home/user/torch_weights --o

### Example of Developing Model Weight Conversion

This example uses Llama. To convert HuggingFace weights to MindSpore Transformers weights, define the `convert_pt_to_ms` function in [convert_weight.py](https://gitee.com/mindspore/mindformers/blob/master/mindformers/models/llama/convert_weight.py):
This example uses [GLM-4](https://gitee.com/mindspore/mindformers/blob/master/docs/model_cards/glm4.md). To convert HuggingFace weights to MindSpore Transformers weights, define the `convert_pt_to_ms` function in [convert_weight.py](https://gitee.com/mindspore/mindformers/blob/master/mindformers/models/glm2/convert_weight.py):

```python
def convert_pt_to_ms(input_path, output_path, dtype=None, **kwargs):
    """convert hf weight to ms."""
    print(f"Trying to convert huggingface checkpoint in '{input_path}'.", flush=True)
    try:
        from transformers import LlamaForCausalLM
    except:
        raise ImportError(f"Failed to load huggingface checkpoint. Please make sure transformers is available.")

    try:
        model_hf = LlamaForCausalLM.from_pretrained(os.path.dirname(input_path))
    except Exception as e:
        print(f"Do not find huggingface checkpoint in '{os.path.dirname(input_path)}', Error {e.message}.", flush=True)
        return False
    ckpt_list = []
    for name, value in model_hf.state_dict().items():
        name = name_replace(name)
        if name == 'norm.weight':
            name = 'norm_out.weight'
        if name[:7] == 'layers.':
            name = name[7:]

        print(f'\rprocessing parameter: {name} {value.shape} ', end='', flush=True)
        ckpt_list.append({'name': name, 'data': pt2ms(value, dtype)})

    ms.save_checkpoint(ckpt_list, output_path)
    print(f"\rConvert huggingface checkpoint finished, the mindspore checkpoint is saved in '{output_path}'.",
          flush=True)
    return True
def convert_pt_to_ms(input_path, output_path, config, dtype=ms.float32, **kwargs):
    """ Convert pytorch model file to MindSpore model file. """
    config: ChatGLM2Config = MindFormerConfig(config)['model']['model_config']
    config = ChatGLM2Config(**config)
    model = AutoModel.from_pretrained(input_path)

    print('parameter convert....')
    ms_param = []
    for k, v in tqdm(model.state_dict().items()):
        if "word_embeddings.weight" in k:
            k = k.replace("word_embeddings.weight", "embedding_weight")
        ms_param.append({"name": k, "data": v})
    # qkv weight split
    if not config.qkv_concat or config.use_rearrange_rope:
        attn_split(ms_param, config, dtype)

    # mlp weight split
    if not config.mlp_concat:
        mlp_split(ms_param, config, dtype)

    tmp_list = []
    pop_list = []
    for i, item in enumerate(ms_param):
        k, v = item["name"], item["data"]
        if not isinstance(v, ms.Tensor):
            tmp_list.append({"name": k, "data": pt2ms(v, dtype)})
            pop_list.append(i)
    for i in reversed(pop_list):
        ms_param.pop(i)
    ms_param += tmp_list

    ms.save_checkpoint(ms_param, output_path)
    print(f"Convert finished, the output is saved to {output_path}")
```

To convert MindSpore Transformers weights back to HuggingFace weights, define the `convert_ms_to_pt` function in [convert_reversed.py](https://gitee.com/mindspore/mindformers/blob/master/mindformers/models/llama/convert_reversed.py):
To convert MindSpore Transformers weights back to HuggingFace weights, define the `convert_ms_to_pt` function in [convert_reversed.py](https://gitee.com/mindspore/mindformers/blob/master/mindformers/models/glm2/convert_reversed.py):

```python
def convert_ms_to_pt(input_path, output_path, dtype=None, **kwargs):
    """convert ms weight to hf."""
    print(f"Trying to convert mindspore checkpoint in '{input_path}'.", flush=True)
    model_ms = ms.load_checkpoint(input_path)

    state_dict = {}
    for name, value in model_ms.items():
        name = name_replace(name)
        print(f'\rprocessing parameter: {name} {value.shape} ', end='', flush=True)
        if is_lora_param(name):
            name = name.replace('.tk_delta_lora_a', '.lora_A.weight')
            name = name.replace('.tk_delta_lora_b', 'lora_B.weight')
        state_dict[name] = ms2pt(value, dtype)

    torch.save(state_dict, output_path)
    print(f"\rConvert mindspore checkpoint finished, the huggingface checkpoint is saved in '{output_path}'.",
          flush=True)
    return True
def convert_ms_to_pt(input_path, output_path, config, dtype=torch.float32, **kwargs):
    """ Convert MindSpore model file to pytorch model file. """
    ckpt_dict = ms.load_checkpoint(input_path)
    print('parameter convert....')
    pt_param = {}
    for k, v in tqdm(ckpt_dict.items()):
        v = ms2pt(v, dtype)
        if "embedding_weight" in k:
            k = k.replace("embedding_weight", "word_embeddings.weight")
        if is_lora_param(k):
            k = k.replace(".tk_delta_lora_a", ".lora_A.weight")
            k = k.replace(".tk_delta_lora_b", ".lora_B.weight")
        pt_param[k] = v

    # Load the model config to decide how the weights were stored.
    config: ChatGLM2Config = MindFormerConfig(config)['model']['model_config']
    config = ChatGLM2Config(**config)

    # qkv weight merge
    if not config.qkv_concat:
        attn_merge(pt_param, config)
    else:
        attn_rearange(pt_param, config)

    # mlp weight merge
    if not config.mlp_concat:
        mlp_merge(pt_param)

    print('saving pt ckpt....')
    torch.save(pt_param, output_path)
    print(f"Convert finished, the output is saved to {output_path}")
```

## Weight Slicing and Merging
@@ -135,13 +151,13 @@ def convert_ms_to_pt(input_path, output_path, dtype=None, **kwargs):

The `yaml` parameters related to **automatic weight conversion** are described below:

| Parameter | Description |
|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| load_checkpoint | Absolute path or folder path of the weights to preload.<br> - For complete weights, set the absolute file path;<br> - For distributed weights, set the folder path. Distributed weights must be stored in the `model_dir/rank_x/xxx.ckpt` layout, with the folder path set to `model_dir`.<br>**If multiple ckpt files exist in a rank_x folder, the ckpt file that sorts last by file name is used for the conversion.** |
| src_strategy_path_or_dir | Path of the [distributed strategy file](#离线转换配置说明) corresponding to the preloaded weights.<br> - If the preloaded weights are complete weights, leave this **unset**;<br> - If the preloaded weights are distributed and were saved with pipeline parallelism, set the **merged strategy file path** or the **distributed strategy folder path**;<br> - If the preloaded weights are distributed and were saved without pipeline parallelism, set the path of any **ckpt_strategy_rank_x.ckpt**; |
| auto_trans_ckpt | Switch for automatic weight conversion; set to `True` to enable, default `False`. |
| transform_process_num | Number of processes used for automatic weight conversion, default 1.<br> - If transform_process_num = 1, **single-process conversion** is used: only rank_0 performs the conversion while the other processes wait for rank_0 to finish;<br> - If transform_process_num > 1, **multi-process conversion** is used. For example, in an 8-device task with transform_process_num = 2, rank_0 converts the slice weights of rank_0/1/2/3 and rank_4 converts the slice weights of rank_4/5/6/7, while the other processes wait for rank_0/4 to finish;<br>**Note**:<br> 1. The larger transform_process_num is, the shorter the conversion takes and **the more host memory the conversion occupies**; when host memory runs short, reduce transform_process_num.<br> 2. transform_process_num must evenly divide the number of NPUs and must not exceed it. |
| transform_by_rank | Whether to use the mindspore.transform_checkpoint_by_rank interface for weight conversion.<br> - Automatically set to `True` when transform_process_num > 1;<br> - When transform_process_num = 1, if the target weights are distributed, mindspore.transform_checkpoint_by_rank is called in a loop to convert each rank slice weight serially.<br>- When transform_process_num = 1, if the target weights are complete, this is automatically set to `False` and the mindspore.transform_checkpoints interface is used for the conversion; |

#### yaml Configuration in Different Scenarios

@@ -149,7 +165,7 @@ def convert_ms_to_pt(input_path, output_path, dtype=None, **kwargs):

```yaml
# load_checkpoint: set to the path of the pre-trained weight file
load_checkpoint: "/worker/llama3_8b/llama3_8b.ckpt"
load_checkpoint: "/worker/qwen2_5-7b/qwen2_5-7b.ckpt"

# auto_trans_ckpt: enable automatic conversion
auto_trans_ckpt: True
@@ -159,10 +175,10 @@ auto_trans_ckpt: True

```yaml
# load_checkpoint: set to the multi-device weight folder path
load_checkpoint: "/worker/checkpoint/llama3-8b-2layer-dp2mp2pp2"
load_checkpoint: "/worker/checkpoint/qwen2_5-7b-2layer-dp2mp2pp2"

# src_strategy_path_or_dir: set to the distributed strategy file path
src_strategy_path_or_dir: "/worker/checkpoint/llama3-8b-2layer-dp2mp2pp2/strategy/merged_ckpt_strategy.ckpt"
src_strategy_path_or_dir: "/worker/checkpoint/qwen2_5-7b-2layer-dp2mp2pp2/strategy/merged_ckpt_strategy.ckpt"

# auto_trans_ckpt: enable automatic conversion
auto_trans_ckpt: True
@@ -172,10 +188,10 @@ auto_trans_ckpt: True

```yaml
# load_checkpoint: set to the multi-device weight folder path
load_checkpoint: "/worker/checkpoint/llama3-8b-2layer-dp1mp2pp2"
load_checkpoint: "/worker/checkpoint/qwen2_5-7b-2layer-dp1mp2pp2"

# src_strategy_path_or_dir: set to the distributed strategy file path
src_strategy_path_or_dir: "/worker/checkpoint/llama3-8b-2layer-dp1mp2pp2/strategy/merged_ckpt_strategy.ckpt"
src_strategy_path_or_dir: "/worker/checkpoint/qwen2_5-7b-2layer-dp1mp2pp2/strategy/merged_ckpt_strategy.ckpt"

# auto_trans_ckpt: enable automatic conversion
auto_trans_ckpt: True
@@ -221,8 +237,8 @@ Each time MindSpore runs a distributed task, it generates under the `output/strategy` folder

```shell
python transform_checkpoint.py \
--src_checkpoint /worker/checkpoint/llama3-8b-2layer/rank_0/llama3_8b.ckpt \
--dst_checkpoint_dir /worker/transform_ckpt/llama3_8b_1to8/ \
--src_checkpoint /worker/checkpoint/qwen2_5-7b-2layer/rank_0/qwen2_5-7b.ckpt \
--dst_checkpoint_dir /worker/transform_ckpt/qwen2_5-7b_1to8/ \
--dst_strategy /worker/mindformers/output/strategy/ \
--prefix "checkpoint_"
```
@@ -235,9 +251,9 @@ python transform_checkpoint.py \

```shell
bash transform_checkpoint.sh \
/worker/checkpoint/llama3-8b-2layer/rank_0/llama3_8b.ckpt \
/worker/checkpoint/qwen2_5-7b-2layer/rank_0/qwen2_5-7b.ckpt \
None \
/worker/transform_ckpt/llama3_8b_1to8/ \
/worker/transform_ckpt/qwen2_5-7b_1to8/ \
/worker/mindformers/output/strategy/ \
8 2 "checkpoint_"
```
@@ -281,7 +297,7 @@ bash transform_checkpoint.sh \

```yaml
# Set the pre-trained weight path to the absolute path of the weight file
load_checkpoint: "/worker/checkpoint/llama3-8b/rank_0/llama3_8b.ckpt"
load_checkpoint: "/worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt"

# Set auto_trans_ckpt to True to enable automatic weight conversion
auto_trans_ckpt: True
@@ -374,8 +390,8 @@ bash transform_checkpoint.sh \

```shell
python mindformers/tools/ckpt_transform/transform_checkpoint.py \
--src_checkpoint /worker/checkpoint/llama3-8b/rank_0/llama_7b.ckpt \
--dst_checkpoint ./output/llama3_8b_dp2mp4pp2 \
--src_checkpoint /worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt \
--dst_checkpoint ./output/qwen2_5-7b_dp2mp4pp2 \
--dst_strategy ./output/strategy
```

@@ -384,9 +400,9 @@ bash transform_checkpoint.sh \
```shell
# Use two processes for conversion
bash mindformers/tools/ckpt_transform/transform_checkpoint.sh \
/worker/checkpoint/llama3-8b/rank_0/llama_7b.ckpt \
/worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt \
None \
./output/llama3_8b_dp2mp4pp2 \
./output/qwen2_5-7b_dp2mp4pp2 \
./output/strategy \
16 2
```
@@ -419,7 +435,7 @@ bash transform_checkpoint.sh \

```yaml
# Set the pre-trained weight path to model_dir, the distributed weight folder path
load_checkpoint: "/worker/checkpoint/llama3_8b_dp2mp4pp2"
load_checkpoint: "/worker/checkpoint/qwen2_5-7b_dp2mp4pp2"

# Change only_save_strategy to False
only_save_strategy: False


+ 21
- 12
tutorials/source_en/custom_program/custom_backend.md

@@ -39,7 +39,7 @@ class MSCustomBackendBase : public BackendBase {
  // The backend graph Run interface, using the graph_id generated by the graph Build interface above.
  RunningStatus Run(BackendGraphId graph_id, const VectorRef &inputs, VectorRef *outputs) {
    MS_LOG(WARNING) << "MSCustomBackendBase use the origin ms_backend to run the graph.";
    mindspore::backend::BackendManager::GetInstance().Run(BackendType::kMsBackend, graph_id, inputs, outputs);
    mindspore::backend::BackendManager::GetInstance().Run(BackendType::kMSBackend, graph_id, inputs, outputs);
  }
};
MS_REGISTER_BACKEND(kCustomBackendName, MSCustomBackendBase)
@@ -49,7 +49,7 @@ MS_REGISTER_BACKEND(kCustomBackendName, MSCustomBackendBase)

## Compiling Custom Backend

Save the above example code as `custom_backend.cpp` and compile it into `libcustom_backend.so`. The compilation command is as follows:
Save the above example code as `custom_backend.cpp` and compile it into `libcustom_backend.so`. The CMake script is as follows:

```cmake
cmake_minimum_required(VERSION 3.16)
@@ -59,7 +59,7 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Use specified MindSpore path
set(MINDSPORE_INCLUDE_DIRS ${MINDSPORE_ROOT}/include)
set(MINDSPORE_INCLUDE_DIR ${MINDSPORE_ROOT}/include)
set(MINDSPORE_LIB_DIRS ${MINDSPORE_ROOT}/lib)
message(STATUS "Using MindSpore from: ${MINDSPORE_ROOT}")

@@ -70,20 +70,20 @@ set(CMAKE_BUILD_TYPE "Release")
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Handle MindSpore include directories
if(MINDSPORE_INCLUDE_DIRS)
    include_directories(${include_dir})
if(MINDSPORE_INCLUDE_DIR)
    include_directories(${MINDSPORE_INCLUDE_DIR})
    # Add complete MindSpore include paths to ensure all dependency headers are found
    include_directories(${include_dir}/mindspore)
    include_directories(${include_dir}/mindspore/core/include)
    include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore)
    include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/core/include)
    # Add MindSpore ccsrc path, contains mindspore/ccsrc/include/
    include_directories(${include_dir}/mindspore/ccsrc/include)
    include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ccsrc/include)
    # Add MindSpore csrc path, contains mindspore/ccsrc/
    include_directories(${include_dir}/mindspore/ccsrc)
    include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ccsrc)
    # Add third_party path, contains securec.h
    include_directories(${include_dir}/third_party)
    include_directories(${include_dir}/third_party/include)
    include_directories(${MINDSPORE_INCLUDE_DIR}/third_party)
    include_directories(${MINDSPORE_INCLUDE_DIR}/third_party/include)
    # Add specific pybind11 path
    include_directories(${include_dir}/third_party/pybind11)
    include_directories(${MINDSPORE_INCLUDE_DIR}/third_party/pybind11)
endif()

# Find Python
@@ -145,6 +145,15 @@ install(TARGETS custom_backend
)
```

The compilation command is as follows:

```bash
cmake . -DMINDSPORE_ROOT=/path/to/mindspore
make
```

Here, `/path/to/mindspore` is the MindSpore installation path.

## Using Custom Backend

Use [mindspore.graph.register_custom_backend](https://www.mindspore.cn/docs/en/master/api_python/graph/mindspore.graph.register_custom_backend.html) to register the backend and [mindspore.jit](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.jit.html) to enable it:
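
A minimal usage sketch follows. It assumes `register_custom_backend` takes the backend name and the path to the compiled plugin library, and that `mindspore.jit` accepts the registered name as its `backend` argument; check the linked API pages for the exact signatures.

```python
import numpy as np
import mindspore as ms
from mindspore import graph

# Assumed signature: backend name as registered by the plugin, plus the .so path.
graph.register_custom_backend("custom_backend", "/path/to/libcustom_backend.so")

@ms.jit(backend="custom_backend")  # route graph compilation to the custom backend
def add(x, y):
    return x + y

out = add(ms.Tensor(np.ones((2, 2), np.float32)), ms.Tensor(np.ones((2, 2), np.float32)))
print(out)
```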


+ 345
- 0
tutorials/source_en/custom_program/custom_pass.md

@@ -0,0 +1,345 @@
# Custom Pass

[![View Source File](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_en/custom_program/custom_pass.md)

## Overview

When you need to change the structure of the computation graph, you can use MindSpore's custom pass feature: write the pass logic, implement and register a custom pass plugin, and let it optimize the graph structure.

This tutorial provides a simple custom pass case as a demonstration. For more comprehensive examples, please refer to the [examples](https://gitee.com/mindspore/mindspore/blob/master/tests/st/backend/custom_pass/test_custom_passes.py).

## Implementing Custom Pass

Implementing a custom pass involves the following steps:

1. Include the `mindspore/include/custom_pass_api.h` header file.
2. Inherit from the `PatternToPatternPass` class and implement the `DefineSrcPattern`, `DefineDstPattern`, and `CheckMatchedDAG` interfaces.
3. Inherit from the `CustomPassPlugin` class and implement the `GetPluginName`, `GetAvailablePassNames`, and `CreatePass` interfaces.
4. Register the custom pass plugin by using the `EXPORT_CUSTOM_PASS_PLUGIN` macro.

Here, we implement a simple AddNegFusionPass and a custom pass plugin that fuse an Add operator and a Neg operator into a single Sub operator.

```c++
// add_neg_fusion_pass.h
// header file of AddNegFusionPass
#ifndef MINDSPORE_CUSTOM_PASS_ADD_NEG_FUSION_PASS_H_
#define MINDSPORE_CUSTOM_PASS_ADD_NEG_FUSION_PASS_H_

#include "mindspore/include/custom_pass_api.h"

namespace mindspore {
namespace opt {
/**
 * @brief Pass to fuse Add and Neg operations into Sub
 *
 * Transforms Add(x, Neg(y)) into Sub(x, y)
 * This is a standard algebraic optimization that eliminates unnecessary Neg operations
 * Works on CPU/GPU/Ascend since all platforms support Add, Neg, and Sub operations
 * Inherits from PatternToPatternPass to comply with MindSpore plugin system requirements
 */
class AddNegFusionPass : public PatternToPatternPass {
 public:
  AddNegFusionPass() : PatternToPatternPass("AddNegFusionPass") {}

  void DefineSrcPattern(SrcPattern *src_pattern) override;
  void DefineDstPattern(DstPattern *dst_pattern) override;
  bool CheckMatchedDAG(const PatternMap &pattern_map, const FuncGraphPtr &func_graph,
                       const AnfNodePtr &node) const override;

 private:
  static bool IsAddNode(const AnfNodePtr &node);
  static bool IsNegNode(const AnfNodePtr &node);

  static AnfNodePtr BuildSub(const PatternMap &m, const AnfNodePtr &default_node);
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CUSTOM_PASS_ADD_NEG_FUSION_PASS_H_
```

```c++
// add_neg_fusion_pass.cc
// cpp file of AddNegFusionPass
#include "add_neg_fusion_pass.h"

namespace mindspore {
namespace opt {
void AddNegFusionPass::DefineSrcPattern(SrcPattern *src_pattern) {
  MS_LOG(INFO) << "Defining source pattern for AddNegFusionPass";
  MS_EXCEPTION_IF_NULL(src_pattern);

  // Pattern: Add(x, Neg(y))
  (*src_pattern)
    .AddVar("x")
    .AddVar("y")
    .AddCNode("neg", {std::make_shared<Primitive>("Neg"), "y"})
    .AddCNode("add", {std::make_shared<Primitive>("Add"), "x", "neg"});

  MS_LOG(INFO) << "Source pattern defined: Add(x, Neg(y))";
}

AnfNodePtr AddNegFusionPass::BuildSub(const PatternMap &m, const AnfNodePtr &default_node) {
  auto add_node = m.Get("add")->cast<CNodePtr>();
  auto neg_node = m.Get("neg")->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(add_node);
  MS_EXCEPTION_IF_NULL(neg_node);

  auto sub_node = default_node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(sub_node);

  // Copy Add node's scope to maintain execution context
  sub_node->set_scope(add_node->scope());

  // Set abstract same as Add output
  auto add_abstract = add_node->abstract();
  if (add_abstract != nullptr) {
    sub_node->set_abstract(add_abstract->Clone());
  } else {
    MS_LOG(EXCEPTION) << "Failed to create Sub abstract from Add node";
  }

  return sub_node;
}

void AddNegFusionPass::DefineDstPattern(DstPattern *dst_pattern) {
  MS_LOG(INFO) << "Defining destination pattern for AddNegFusionPass";
  MS_EXCEPTION_IF_NULL(dst_pattern);

  // Replace with Sub(x, y) - directly subtract y instead of adding its negation
  (*dst_pattern).AddCNode("sub", {std::make_shared<Primitive>("Sub"), "x", "y"}, BuildSub);

  MS_LOG(INFO) << "Destination pattern defined: Sub(x, y)";
}

bool AddNegFusionPass::CheckMatchedDAG(const PatternMap &pattern_map, const FuncGraphPtr &func_graph,
                                       const AnfNodePtr &node) const {
  auto add_node = pattern_map.Get("add");
  if (!add_node) {
    MS_LOG(ERROR) << "Add node not found in pattern match";
    return false;
  }

  auto neg_node = pattern_map.Get("neg");
  if (!neg_node) {
    MS_LOG(ERROR) << "Neg node not found in pattern match";
    return false;
  }

  auto x_node = pattern_map.Get("x");
  if (!x_node) {
    MS_LOG(ERROR) << "x node not found in pattern match";
    return false;
  }

  auto y_node = pattern_map.Get("y");
  if (!y_node) {
    MS_LOG(ERROR) << "y node not found in pattern match";
    return false;
  }

  MS_LOG(INFO) << "AddNeg fusion pattern matched successfully";
  return true;
}
} // namespace opt
} // namespace mindspore
```

```c++
// ms_custom_pass_plugin.cc
// cpp file of Custom Pass Plugin
#include <string>
#include <memory>
#include <vector>
#include "mindspore/ccsrc/include/backend/common/custom_pass/custom_pass_plugin.h"
#include "add_neg_fusion_pass.h"

namespace mindspore {
namespace opt {

class MSCustomPassPlugin : public CustomPassPlugin {
public:
std::string GetPluginName() const override { return "ms_custom_pass_plugin"; }

std::vector<std::string> GetAvailablePassNames() const override {
return {"AddNegFusionPass"};
}

std::shared_ptr<Pass> CreatePass(const std::string &pass_name) const override {
if (pass_name == "AddNegFusionPass") {
auto pass = std::make_shared<AddNegFusionPass>();
MS_LOG(INFO) << "Created pass '" << pass_name << "' successfully";
return pass;
} else {
MS_LOG(WARNING) << "Pass '" << pass_name << "' not found, available: AddNegFusionPass";
return nullptr;
}
}
};
} // namespace opt
} // namespace mindspore

EXPORT_CUSTOM_PASS_PLUGIN(mindspore::opt::MSCustomPassPlugin)
```

## Compiling Custom Pass Plugin

Compile the above example code into `libcustom_pass.so`. The CMake script is as follows:

```cmake
cmake_minimum_required(VERSION 3.16)
project(pass VERSION 1.0.0 LANGUAGES CXX)

# Set C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Use specified MindSpore path
set(MINDSPORE_INCLUDE_DIR ${MINDSPORE_ROOT}/include)
set(MINDSPORE_LIB_DIRS ${MINDSPORE_ROOT}/lib)
message(STATUS "Using MindSpore from: ${MINDSPORE_ROOT}")

# Build options configuration (simplified)
set(CMAKE_BUILD_TYPE "Release")

# Set CMake module path - adjusted for mindspore test location
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

# Include directories
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Handle multiple MindSpore include directories
if(MINDSPORE_INCLUDE_DIR)
# Add complete MindSpore include paths to ensure all dependency headers are found
include_directories(${MINDSPORE_INCLUDE_DIR})
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/core/include)
# Add MindSpore ccsrc path
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ccsrc/)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ccsrc/include/)
# Add MindSpore ops path
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ops)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ops/include)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ops/kernel/include)
# Add third_party path, contains securec.h
include_directories(${MINDSPORE_INCLUDE_DIR}/third_party)
# Add specific securec path
include_directories(${MINDSPORE_INCLUDE_DIR}/third_party/securec/include)
endif()

# Automatically find all source files
file(GLOB_RECURSE PASS_SOURCES "*.cc")
file(GLOB_RECURSE PASS_HEADERS "*.h")

# Create dynamic library (based on installed MindSpore)
add_library(custom_pass SHARED ${PASS_SOURCES})

# Link MindSpore libraries (based on actual requirements)
target_link_libraries(custom_pass
${MINDSPORE_LIB_DIRS}/libmindspore_backend_common.so
${MINDSPORE_LIB_DIRS}/libmindspore_core.so
${MINDSPORE_LIB_DIRS}/libmindspore_common.so
)

# Default settings
option(ENABLE_GLIBCXX "enable_glibcxx" OFF)

# System-related overrides
if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
set(ENABLE_GLIBCXX ON)
endif()

# Environment variable overrides
if(DEFINED ENV{ENABLE_GLIBCXX})
set(ENABLE_GLIBCXX $ENV{ENABLE_GLIBCXX})
endif()

# ABI flag settings
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
if(NOT ENABLE_GLIBCXX)
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)
endif()
endif()

# Set compilation options
target_compile_options(custom_pass PRIVATE
-fPIC
-std=c++17
-Wall
-Wextra
)

# Use ABI settings consistent with MindSpore
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
if(NOT ENABLE_GLIBCXX)
target_compile_definitions(custom_pass PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
endif()
endif()

# Set compilation definitions
target_compile_definitions(custom_pass PRIVATE
-DPASS_PLUGIN_EXPORTS
-DMINDSPORE_PASS
)

# Set dynamic library properties
set_target_properties(custom_pass PROPERTIES
VERSION ${PROJECT_VERSION}
SOVERSION ${PROJECT_VERSION_MAJOR}
PREFIX "lib"
OUTPUT_NAME "custom_pass"
)

# Installation rules
install(TARGETS custom_pass
LIBRARY DESTINATION lib
RUNTIME DESTINATION bin
)
```

The compilation command is as follows:

```bash
cmake . -DMINDSPORE_ROOT=/path/to/mindspore
make
```

`/path/to/mindspore` represents the installation path of MindSpore.
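
If MindSpore was installed with pip and you are unsure where that path is, the snippet below is one quick way to locate the package directory (a convenience sketch, not part of the tutorial; it assumes the pip package ships its `include/` and `lib/` directories under the package root):

```bash
# Print the MindSpore package directory; use it as MINDSPORE_ROOT if it
# contains the include/ and lib/ directories referenced by the CMake script.
python -c "import mindspore, os; print(os.path.dirname(mindspore.__file__))"
```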

## Using Custom Pass

Use [mindspore.graph.register_custom_pass](https://www.mindspore.cn/docs/en/master/api_python/graph/mindspore.graph.register_custom_pass.html) to register and enable the custom pass:

```python
import numpy as np
import mindspore
from mindspore import jit, ops, nn, context, Tensor

custom_path = "/data1/libcustom_pass.so"
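# Register the pass by name; the arguments are the pass name, the path to the compiled plugin, and the target device.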
success = mindspore.graph.register_custom_pass("AddNegFusionPass", custom_path, "cpu")
assert success, "Plugin registration failed"

class AddNegNetwork(nn.Cell):
def __init__(self):
super().__init__()
self.neg = ops.Neg()

@jit(backend="ms_backend")
def construct(self, x1, x2):
# Neg operation: -x2
neg_x2 = self.neg(x2)
# Add operation: x1 + (-x2) = x1 - x2
output = x1 + neg_x2
return output

context.set_context(device_target="CPU")
net = AddNegNetwork()
x1 = Tensor(np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]).astype(np.float32))
x2 = Tensor(np.array([[[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]],
[[4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6]]]).astype(np.float32))
output = net(x1, x2)

# Verify functional correctness
expected = x1.asnumpy() - x2.asnumpy() # x1 + (-x2) = x1 - x2
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
```
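
To confirm that the pass actually fired, one option is to raise MindSpore's C++ log level before running the script, so that the `MS_LOG(INFO)` messages emitted by `AddNegFusionPass` (for example, "AddNeg fusion pattern matched successfully") show up on the console:

```bash
# GLOG_v controls MindSpore's C++ log level: 0=DEBUG, 1=INFO, 2=WARNING, 3=ERROR.
export GLOG_v=1
python add_neg_example.py   # the script above, saved under a name of your choice
```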

+ 1
- 0
tutorials/source_en/index.rst View File

@@ -83,6 +83,7 @@ MindSpore Tutorial

custom_program/op_custom
custom_program/custom_backend
custom_program/custom_pass
custom_program/hook_program

.. toctree::


+ 21
- 12
tutorials/source_zh_cn/custom_program/custom_backend.md View File

@@ -39,7 +39,7 @@ class MSCustomBackendBase : public BackendBase {
// Run the backend graph by the graph_id generated through the graph Build interface above.
RunningStatus Run(BackendGraphId graph_id, const VectorRef &inputs, VectorRef *outputs) {
MS_LOG(WARNING) << "MSCustomBackendBase uses the original ms_backend to run the graph.";
mindspore::backend::BackendManager::GetInstance().Run(BackendType::kMsBackend, graph_id, inputs, outputs);
mindspore::backend::BackendManager::GetInstance().Run(BackendType::kMSBackend, graph_id, inputs, outputs);
}
};
MS_REGISTER_BACKEND(kCustomBackendName, MSCustomBackendBase)
@@ -49,7 +49,7 @@ MS_REGISTER_BACKEND(kCustomBackendName, MSCustomBackendBase)

## Compiling Custom Backend

Save the above example code as `custom_backend.cpp` and compile it into the `libcustom_backend.so` dynamic library. The build command is as follows:
Save the above example code as `custom_backend.cpp` and compile it into the `libcustom_backend.so` dynamic library. The CMake script is as follows:

```cmake
cmake_minimum_required(VERSION 3.16)
@@ -59,7 +59,7 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Use specified MindSpore path
set(MINDSPORE_INCLUDE_DIRS ${MINDSPORE_ROOT}/include)
set(MINDSPORE_INCLUDE_DIR ${MINDSPORE_ROOT}/include)
set(MINDSPORE_LIB_DIRS ${MINDSPORE_ROOT}/lib)
message(STATUS "Using MindSpore from: ${MINDSPORE_ROOT}")

@@ -70,20 +70,20 @@ set(CMAKE_BUILD_TYPE "Release")
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Handle MindSpore include directories
if(MINDSPORE_INCLUDE_DIRS)
include_directories(${include_dir})
if(MINDSPORE_INCLUDE_DIR)
include_directories(${MINDSPORE_INCLUDE_DIR})
# Add complete MindSpore include paths to ensure all dependency headers are found
include_directories(${include_dir}/mindspore)
include_directories(${include_dir}/mindspore/core/include)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/core/include)
# Add MindSpore ccsrc path, contains mindspore/ccsrc/include/
include_directories(${include_dir}/mindspore/ccsrc/include)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ccsrc/include)
# Add MindSpore csrc path, contains mindspore/ccsrc/
include_directories(${include_dir}/mindspore/ccsrc)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ccsrc)
# Add third_party path, contains securec.h
include_directories(${include_dir}/third_party)
include_directories(${include_dir}/third_party/include)
include_directories(${MINDSPORE_INCLUDE_DIR}/third_party)
include_directories(${MINDSPORE_INCLUDE_DIR}/third_party/include)
# Add specific pybind11 path
include_directories(${include_dir}/third_party/pybind11)
include_directories(${MINDSPORE_INCLUDE_DIR}/third_party/pybind11)
endif()

# Find Python
@@ -145,6 +145,15 @@ install(TARGETS custom_backend
)
```

The compilation command is as follows:

```bash
cmake . -DMINDSPORE_ROOT=/path/to/mindspore
make
```

`/path/to/mindspore` is the installation path of MindSpore.

## Using Custom Backend

Use [mindspore.graph.register_custom_backend](https://www.mindspore.cn/docs/zh-CN/master/api_python/graph/mindspore.graph.register_custom_backend.html) to register the backend, and select it through the [mindspore.jit](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.jit.html) interface:


+ 342
- 0
tutorials/source_zh_cn/custom_program/custom_pass.md View File

@@ -0,0 +1,342 @@
# Custom Pass

[![View Source File](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_zh_cn/custom_program/custom_pass.md)

## Overview

When you need to change the structure of the computation graph, you can use MindSpore's custom pass feature: write the pass logic, then implement and register a custom pass plugin that transforms and optimizes the graph.

This tutorial walks through a simple custom pass as a demonstration. For more complete examples, see the [examples](https://gitee.com/mindspore/mindspore/blob/master/tests/st/backend/custom_pass/test_custom_passes.py) in the MindSpore source tree.

## Implementing Custom Pass

Implementing a custom pass involves the following steps:

1. Include the `mindspore/include/custom_pass_api.h` header file.
2. Inherit from the `PatternToPatternPass` class and implement the `DefineSrcPattern`, `DefineDstPattern`, and `CheckMatchedDAG` interfaces.
3. Inherit from the `CustomPassPlugin` class and implement the `GetPluginName`, `GetAvailablePassNames`, and `CreatePass` interfaces.
4. Register the custom pass plugin with the `EXPORT_CUSTOM_PASS_PLUGIN` macro.

Here, we implement a simple `AddNegFusionPass` and a custom pass plugin that replace an Add operator and a Neg operator with a single Sub operator.

```c++
// add_neg_fusion_pass.h
#ifndef MINDSPORE_CUSTOM_PASS_ADD_NEG_FUSION_PASS_H_
#define MINDSPORE_CUSTOM_PASS_ADD_NEG_FUSION_PASS_H_

#include "mindspore/include/custom_pass_api.h"

namespace mindspore {
namespace opt {
/**
* @brief Pass to fuse Add and Neg operations into Sub
*
* Transforms Add(x, Neg(y)) into Sub(x, y)
* This is a standard algebraic optimization that eliminates unnecessary Neg operations
* Works on CPU/GPU/Ascend since all platforms support Add, Neg, and Sub operations
* Inherits from PatternToPatternPass to comply with MindSpore plugin system requirements
*/
class AddNegFusionPass : public PatternToPatternPass {
public:
AddNegFusionPass() : PatternToPatternPass("AddNegFusionPass") {}

void DefineSrcPattern(SrcPattern *src_pattern) override;
void DefineDstPattern(DstPattern *dst_pattern) override;
bool CheckMatchedDAG(const PatternMap &pattern_map, const FuncGraphPtr &func_graph,
const AnfNodePtr &node) const override;

private:
static AnfNodePtr BuildSub(const PatternMap &m, const AnfNodePtr &default_node);
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CUSTOM_PASS_ADD_NEG_FUSION_PASS_H_
```

```c++
// add_neg_fusion_pass.cc
#include "add_neg_fusion_pass.h"

namespace mindspore {
namespace opt {
void AddNegFusionPass::DefineSrcPattern(SrcPattern *src_pattern) {
MS_LOG(INFO) << "Defining source pattern for AddNegFusionPass";
MS_EXCEPTION_IF_NULL(src_pattern);

// Pattern: Add(x, Neg(y))
(*src_pattern)
.AddVar("x")
.AddVar("y")
.AddCNode("neg", {std::make_shared<Primitive>("Neg"), "y"})
.AddCNode("add", {std::make_shared<Primitive>("Add"), "x", "neg"});

MS_LOG(INFO) << "Source pattern defined: Add(x, Neg(y))";
}

AnfNodePtr AddNegFusionPass::BuildSub(const PatternMap &m, const AnfNodePtr &default_node) {
auto add_node = m.Get("add")->cast<CNodePtr>();
auto neg_node = m.Get("neg")->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(add_node);
MS_EXCEPTION_IF_NULL(neg_node);

auto sub_node = default_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(sub_node);

// Copy Add node's scope to maintain execution context
sub_node->set_scope(add_node->scope());

// Set abstract same as Add output
auto add_abstract = add_node->abstract();
if (add_abstract != nullptr) {
sub_node->set_abstract(add_abstract->Clone());
} else {
MS_LOG(EXCEPTION) << "Failed to create Sub abstract from Add node";
}

return sub_node;
}

void AddNegFusionPass::DefineDstPattern(DstPattern *dst_pattern) {
MS_LOG(INFO) << "Defining destination pattern for AddNegFusionPass";
MS_EXCEPTION_IF_NULL(dst_pattern);

// Replace with Sub(x, y) - directly subtract y instead of adding its negation
(*dst_pattern).AddCNode("sub", {std::make_shared<Primitive>("Sub"), "x", "y"}, BuildSub);

MS_LOG(INFO) << "Destination pattern defined: Sub(x, y)";
}

bool AddNegFusionPass::CheckMatchedDAG(const PatternMap &pattern_map, const FuncGraphPtr &func_graph,
const AnfNodePtr &node) const {
auto add_node = pattern_map.Get("add");
if (!add_node) {
MS_LOG(ERROR) << "Add node not found in pattern match";
return false;
}

auto neg_node = pattern_map.Get("neg");
if (!neg_node) {
MS_LOG(ERROR) << "Neg node not found in pattern match";
return false;
}

auto x_node = pattern_map.Get("x");
if (!x_node) {
MS_LOG(ERROR) << "x node not found in pattern match";
return false;
}

auto y_node = pattern_map.Get("y");
if (!y_node) {
MS_LOG(ERROR) << "y node not found in pattern match";
return false;
}

MS_LOG(INFO) << "AddNeg fusion pattern matched successfully";
return true;
}
} // namespace opt
} // namespace mindspore
```

```c++
// ms_custom_pass_plugin.cc
#include <string>
#include <memory>
#include <vector>
#include "mindspore/ccsrc/include/backend/common/custom_pass/custom_pass_plugin.h"
#include "add_neg_fusion_pass.h"

namespace mindspore {
namespace opt {

class MSCustomPassPlugin : public CustomPassPlugin {
public:
std::string GetPluginName() const override { return "ms_custom_pass_plugin"; }

std::vector<std::string> GetAvailablePassNames() const override {
return {"AddNegFusionPass"};
}

std::shared_ptr<Pass> CreatePass(const std::string &pass_name) const override {
if (pass_name == "AddNegFusionPass") {
auto pass = std::make_shared<AddNegFusionPass>();
MS_LOG(INFO) << "Created pass '" << pass_name << "' successfully";
return pass;
} else {
MS_LOG(WARNING) << "Pass '" << pass_name << "' not found, available: AddNegFusionPass";
return nullptr;
}
}
};
} // namespace opt
} // namespace mindspore

EXPORT_CUSTOM_PASS_PLUGIN(mindspore::opt::MSCustomPassPlugin)
```

## Compiling Custom Pass Plugin

Compile the above example code into the `libcustom_pass.so` dynamic library. The CMake script is as follows:

```cmake
cmake_minimum_required(VERSION 3.16)
project(pass VERSION 1.0.0 LANGUAGES CXX)

# Set C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Use specified MindSpore path
set(MINDSPORE_INCLUDE_DIR ${MINDSPORE_ROOT}/include)
set(MINDSPORE_LIB_DIRS ${MINDSPORE_ROOT}/lib)
message(STATUS "Using MindSpore from: ${MINDSPORE_ROOT}")

# Build options configuration (simplified)
set(CMAKE_BUILD_TYPE "Release")

# Set CMake module path - adjusted for mindspore test location
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

# Include directories
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Handle multiple MindSpore include directories
if(MINDSPORE_INCLUDE_DIR)
# Add complete MindSpore include paths to ensure all dependency headers are found
include_directories(${MINDSPORE_INCLUDE_DIR})
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/core/include)
# Add MindSpore ccsrc path
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ccsrc/)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ccsrc/include/)
# Add MindSpore ops path
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ops)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ops/include)
include_directories(${MINDSPORE_INCLUDE_DIR}/mindspore/ops/kernel/include)
# Add third_party path, contains securec.h
include_directories(${MINDSPORE_INCLUDE_DIR}/third_party)
# Add specific securec path
include_directories(${MINDSPORE_INCLUDE_DIR}/third_party/securec/include)
endif()

# Automatically find all source files
file(GLOB_RECURSE PASS_SOURCES "*.cc")
file(GLOB_RECURSE PASS_HEADERS "*.h")

# Create dynamic library (based on installed MindSpore)
add_library(custom_pass SHARED ${PASS_SOURCES})

# Link MindSpore libraries (based on actual requirements)
target_link_libraries(custom_pass
${MINDSPORE_LIB_DIRS}/libmindspore_backend_common.so
${MINDSPORE_LIB_DIRS}/libmindspore_core.so
${MINDSPORE_LIB_DIRS}/libmindspore_common.so
)

# Default settings
option(ENABLE_GLIBCXX "enable_glibcxx" OFF)

# System-related overrides
if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
set(ENABLE_GLIBCXX ON)
endif()

# Environment variable overrides
if(DEFINED ENV{ENABLE_GLIBCXX})
set(ENABLE_GLIBCXX $ENV{ENABLE_GLIBCXX})
endif()

# ABI flag settings
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
if(NOT ENABLE_GLIBCXX)
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)
endif()
endif()

# Set compilation options
target_compile_options(custom_pass PRIVATE
-fPIC
-std=c++17
-Wall
-Wextra
)

# Use ABI settings consistent with MindSpore
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
if(NOT ENABLE_GLIBCXX)
target_compile_definitions(custom_pass PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
endif()
endif()

# Set compilation definitions
target_compile_definitions(custom_pass PRIVATE
-DPASS_PLUGIN_EXPORTS
-DMINDSPORE_PASS
)

# Set dynamic library properties
set_target_properties(custom_pass PROPERTIES
VERSION ${PROJECT_VERSION}
SOVERSION ${PROJECT_VERSION_MAJOR}
PREFIX "lib"
OUTPUT_NAME "custom_pass"
)

# Installation rules
install(TARGETS custom_pass
LIBRARY DESTINATION lib
RUNTIME DESTINATION bin
)
```

The compilation command is as follows:

```bash
cmake . -DMINDSPORE_ROOT=/path/to/mindspore
make
```

`/path/to/mindspore` is the installation path of MindSpore.

## Using Custom Pass

Use [mindspore.graph.register_custom_pass](https://www.mindspore.cn/docs/zh-CN/master/api_python/graph/mindspore.graph.register_custom_pass.html) to register and enable the custom pass:

```python
import numpy as np
import mindspore
from mindspore import jit, ops, nn, context, Tensor

custom_path = "/data1/libcustom_pass.so"
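# Register the pass by name; the arguments are the pass name, the path to the compiled plugin, and the target device.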
success = mindspore.graph.register_custom_pass("AddNegFusionPass", custom_path, "cpu")
assert success, "Plugin registration failed"

class AddNegNetwork(nn.Cell):
def __init__(self):
super().__init__()
self.neg = ops.Neg()

@jit(backend="ms_backend")
def construct(self, x1, x2):
# Neg operation: -x2
neg_x2 = self.neg(x2)
# Add operation: x1 + (-x2) = x1 - x2
output = x1 + neg_x2
return output

context.set_context(device_target="CPU")
net = AddNegNetwork()
x1 = Tensor(np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]).astype(np.float32))
x2 = Tensor(np.array([[[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]],
[[4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6]]]).astype(np.float32))
output = net(x1, x2)

# Verify functional correctness
expected = x1.asnumpy() - x2.asnumpy() # x1 + (-x2) = x1 - x2
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
```
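
Another way to verify the rewrite (a sketch that relies on the `MS_DEV_SAVE_GRAPHS` and `MS_DEV_SAVE_GRAPHS_PATH` environment variables, which control IR dumping in recent MindSpore versions) is to dump the compiled graph IR and check that the Add/Neg pair has been replaced by a Sub node:

```bash
# Dump intermediate IR files while running the script above
# (saved here under the hypothetical name add_neg_example.py).
export MS_DEV_SAVE_GRAPHS=1
export MS_DEV_SAVE_GRAPHS_PATH=./irs
python add_neg_example.py
# The final graph IR should contain a Sub node instead of Add/Neg.
grep -l "Sub" ./irs/*.ir
```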

+ 1
- 0
tutorials/source_zh_cn/index.rst View File

@@ -83,6 +83,7 @@ MindSpore Tutorial

custom_program/op_custom
custom_program/custom_backend
custom_program/custom_pass
custom_program/hook_program

.. toctree::

