|
|
|
The following tables describe the configuration comparison with Megatron-LM.
|
|
|
|
|
|
|
This document covers precision comparison only for the mcore model. Therefore, `--use-mcore-model` must be set for Megatron-LM, and `use_legacy: False` must be set for MindSpore Transformers.
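
For reference, the following is a minimal, hypothetical sketch of how the MindSpore Transformers side of this requirement might look in a training YAML file; the exact placement of the key can vary between configurations, and the Megatron-LM side is simply the `--use-mcore-model` launch argument.

```yaml
# Hypothetical excerpt of a MindSpore Transformers training YAML.
# Only the key name comes from this document; its placement may differ.
use_legacy: False   # equivalent to passing --use-mcore-model to Megatron-LM
```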
|
|
|
|
|
|
|
|
|
|
| Megatron-LM | Description | MindSpore Transformers | Description |
|-------------|-------------|------------------------|-------------|
| `use-legacy-model` and `use-mcore-model` | Specifies whether to use the mcore model. | `use_legacy` | Specifies whether to use the mcore model. `use_legacy: False` is equivalent to `--use-mcore-model`. |
| `num-layers` | Number of network layers, that is, the number of transformer layers. | `num_layers` | Number of network layers, that is, the number of transformer layers. |
| `encoder-num-layers` | Number of encoder layers. | Not supported. | |
| `decoder-num-layers` | Number of decoder layers. | Not supported. | |
| `hidden-size` | Size of the hidden layer, that is, the dimension of the hidden state. | `hidden_size` | Size of the hidden layer, that is, the dimension of the hidden state. |
| `ffn-hidden-size` | Size of the hidden layer in the feedforward network. | `intermediate_size` | Size of the hidden layer in the feedforward network. |
| `num-attention-heads` | Number of attention heads. | `num_heads` | Number of attention heads. |
| `kv-channels` | Number of key/value tensor channels. | `head_dim` | Number of key/value tensor channels. |
| `group-query-attention` | Specifies whether to enable grouped query attention (GQA). | `use_gqa` | Specifies whether to enable grouped query attention (GQA). |
| `num-query-groups` | Number of query groups. | `n_kv_heads` | Number of query groups. |
| `max-position-embeddings` | Maximum position encoding length. | `max_position_embeddings` | Maximum position encoding length. |
| `position-embedding-type` | Position encoding type, such as `learned_absolute` or `rope`. | `position_embedding_type` | Position encoding type, such as `learned_absolute` or `rope`. |
| `use-rotary-position-embeddings` | Specifies whether to use rotary position embedding (RoPE). | Specified by `position_embedding_type` == `rope` | Specifies whether to use RoPE. |
| `rotary-base` | Rotary base used for RoPE. | `rotary_base` | Rotary base used for RoPE. |
| `rotary-percent` | RoPE usage ratio. | `rotary_percent` | RoPE usage ratio. |
| `rotary-interleaved` | Specifies whether to use interleaved RoPE. | `rotary_interleaved` | Specifies whether to use interleaved RoPE. |
| `rotary-seq-len-interpolation-factor` | Rotary sequence length interpolation factor. | `rotary_seq_len_interpolation_factor` | Rotary sequence length interpolation factor. |
| `use-rope-scaling` | Specifies whether to enable RoPE scaling. | `use_rope_scaling` | Specifies whether to enable RoPE scaling. |
| `rope-scaling-factor` | RoPE scaling factor. | `scaling_factor` | RoPE scaling factor. |
| `no-position-embedding` | Specifies whether to disable position encoding. | `no_position_embedding` | Specifies whether to disable position encoding. |
| `disable-bias-linear` | Disables bias in linear layers. | `add_bias_linear` | Enables bias in linear layers. Note that the semantics are inverted: `--disable-bias-linear` corresponds to `add_bias_linear: False`. |
| `mrope-section` | Section sizes for multi-section RoPE (M-RoPE). | Not supported. | |
| `make-vocab-size-divisible-by` | Pads the vocabulary size so that it is divisible by the specified value. | Not supported. | By default, the vocabulary size is not changed. |
| `init-method-std` | Standard deviation of the normal distribution used for model parameter initialization. | `init_method_std` | Standard deviation of the normal distribution used for model parameter initialization. |
| `attention-dropout` | Dropout probability applied in the multi-head self-attention mechanism. | `attention_dropout` | Dropout probability applied in the multi-head self-attention mechanism. |
| `hidden-dropout` | Dropout probability in the hidden layer. | `hidden_dropout` | Dropout probability in the hidden layer. |
| `normalization` | Normalization method, which can be LayerNorm or RMSNorm. | `normalization` | Normalization method, which can be LayerNorm or RMSNorm. |
| `norm-epsilon` | Numerical stability factor (epsilon) for normalization. | `rms_norm_eps` | Epsilon used by RMSNorm. |
| `apply-layernorm-1p` | Specifies whether to add 1 to the LayerNorm weight. | Not supported. | |
| `apply-residual-connection-post-layernorm` | Specifies whether the residual connection is applied after LayerNorm. | `apply_residual_connection_post_layernorm` | Specifies whether the residual connection is applied after LayerNorm. |
| `openai-gelu` | Specifies whether to use the OpenAI version of the GELU activation function. | Not supported. | |
| `squared-relu` | Specifies whether to use the squared ReLU activation function. | Not supported. | |
| Specified by `swiglu`, `openai-gelu`, and `squared-relu` | Activation function; defaults to **torch.nn.functional.gelu** when none of these is set. | `hidden_act` | Activation function type. |
| `gated_linear_unit` | Specifies whether to use a gated linear unit in the multi-layer perceptron (MLP). | `gated_linear_unit` | Specifies whether to use a gated linear unit in the MLP. |
| `swiglu` | Specifies whether to use the SwiGLU activation function. | `hidden_act` == `silu` and `gated_linear_unit` | Specifies whether to use the SwiGLU activation function. |
| `no-persist-layer-norm` | Disables the persistent fused LayerNorm kernel. | Not supported. | |
| `untie-embeddings-and-output-weights` | Specifies whether to untie the weights of the input embedding layer and the output layer. | `untie_embeddings_and_output_weights` | Specifies whether to untie the weights of the input embedding layer and the output layer. |
| Specified by `fp16` and `bf16` | Tensor compute precision during training. | `compute_dtype` | Tensor compute precision during training. |
| `grad-reduce-in-bf16` | Reduces gradients in BFloat16. | Not supported. | |
| Not supported. | By default, the initialization tensor is generated in BFloat16 format. | `param_init_type` | Initial precision of the weight tensor. The default value is **Float32**, which ensures that the backward gradient is updated in Float32. |
| Not supported. | By default, layer normalization is calculated in Float32. | `layernorm_compute_type` | Layer normalization tensor calculation precision. |
| `attention-softmax-in-fp32` | Executes attention softmax in Float32. | `softmax_compute_type` | Softmax tensor calculation precision. |
| Not supported. | | `rotary_dtype` | Position encoding tensor calculation precision. |
| `loss-scale` | Overall loss scaling factor. | `loss_scale_value` | Overall loss scaling factor, which is configured in **runner_wrapper**. If `compute_dtype` is set to **BFloat16**, the value is usually set to **1.0**. |
| `initial-loss-scale` | Initial loss scaling factor. | Not supported. | |
| `min-loss-scale` | Minimum loss scaling factor. | Not supported. | |
| `loss-scale-window` | Window size for dynamic loss scaling. | `loss_scale_window` | Window size for dynamic loss scaling. |
| `hysteresis` | Loss scale hysteresis parameter. | Not supported. | |
| `fp32-residual-connection` | Uses Float32 for residual connections. | `fp32_residual_connection` | Uses Float32 for residual connections. |
| `accumulate-allreduce-grads-in-fp32` | Accumulates and all-reduces gradients in Float32. | Not supported. | Accumulates and all-reduces gradients in Float32 by default. |
| `fp16-lm-cross-entropy` | Computes the LM cross-entropy loss in Float16. | Not supported. | Computes the LM cross-entropy loss in Float32 by default. |
| `q-lora-rank` | LoRA rank of the query projection layer, which is used when Q-LoRA is enabled. | `q_lora_rank` | LoRA rank of the query projection layer, which is used when Q-LoRA is enabled. |
| `kv-lora-rank` | LoRA rank of the key/value projection layer, which is used when KV-LoRA is enabled. | `kv_lora_rank` | LoRA rank of the key/value projection layer, which is used when KV-LoRA is enabled. |
| `qk-head-dim` | Number of dimensions per Q/K head. | `qk_nope_head_dim` | Number of dimensions per Q/K head. |
| `qk-pos-emb-head-dim` | Number of rotary position embedding dimensions per Q/K head. | `qk_rope_head_dim` | Number of rotary position embedding dimensions per Q/K head. |
| `v-head-dim` | Number of dimensions per value projection (V head). | `v_head_dim` | Number of dimensions per value projection (V head). |
| `rotary-scaling-factor` | RoPE scaling coefficient. | `scaling_factor` | RoPE scaling coefficient. |
| `use-precision-aware-optimizer` | Enables the precision-aware optimizer, which automatically manages parameter updates of different data types. | Not supported. | |
| `main-grads-dtype` | Data type of the main gradients. | Not supported. | By default, Float32 is used as the data type of the main gradients. |
| `main-params-dtype` | Data type of the main parameters. | Not supported. | By default, Float32 is used as the data type of the main parameters. |
| `exp-avg-dtype` | Data type of the exponential moving average (EMA) in the optimizer. | Not supported. | |
| `exp-avg-sq-dtype` | Data type of the squared-gradient EMA in the optimizer. | Not supported. | |
| `first-last-layers-bf16` | Specifies whether to forcibly use BFloat16 for the first and last layers. | Not supported. | |
| `num-layers-at-start-in-bf16` | Number of layers at the start of the model kept in BFloat16. | Not supported. | |
| `num-layers-at-end-in-bf16` | Number of layers at the end of the model kept in BFloat16. | Not supported. | |
| `multi-latent-attention` | Specifies whether to enable multi-latent attention (MLA). | `multi_latent_attention` | Specifies whether to enable multi-latent attention (MLA). |
| `qk-layernorm` | Enables query/key layer normalization. | `qk_layernorm` | Enables query/key layer normalization. |
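
To make the mapping more concrete, the following sketch collects several of the MindSpore Transformers keys from the table above into a single configuration fragment. Only the key names and the fact that `loss_scale_value` is configured in **runner_wrapper** come from the table; the nesting under `model.model_config`, the exact nesting inside `runner_wrapper`, and all example values are illustrative assumptions rather than a reference configuration.

```yaml
# Illustrative fragment only: key names follow the table above; nesting and
# values are assumptions and should be taken from a real model configuration.
model:
  model_config:
    num_layers: 32                    # --num-layers
    hidden_size: 4096                 # --hidden-size
    intermediate_size: 11008          # --ffn-hidden-size
    num_heads: 32                     # --num-attention-heads
    use_gqa: True                     # --group-query-attention
    n_kv_heads: 8                     # --num-query-groups
    position_embedding_type: rope     # --position-embedding-type rope
    rotary_base: 10000                # --rotary-base
    normalization: RMSNorm            # --normalization RMSNorm
    rms_norm_eps: 1.0e-5              # --norm-epsilon
    hidden_act: silu                  # together with gated_linear_unit, matches --swiglu
    gated_linear_unit: True
    add_bias_linear: False            # --disable-bias-linear
    untie_embeddings_and_output_weights: True  # --untie-embeddings-and-output-weights
    compute_dtype: bfloat16           # --bf16
    param_init_type: float32          # keep Float32 for the precision comparison
    layernorm_compute_type: float32
    softmax_compute_type: float32     # --attention-softmax-in-fp32
    rotary_dtype: float32
runner_wrapper:
  loss_scale_value: 1.0               # --loss-scale; 1.0 is typical when compute_dtype is BFloat16
  loss_scale_window: 1000             # --loss-scale-window
```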
|
|
|
|
|
|
|
- Optimizer and learning rate scheduling configurations |
|
|
|
|
|
|
|
|