3 Commits

55 changed files with 16700 additions and 824 deletions
  1. README.md (+1, -1)
  2. mindnlp/__init__.py (+3, -2)
  3. mindnlp/core/_C/__init__.py (+4, -1)
  4. mindnlp/core/_apis/__init__.py (+0, -0)
  5. mindnlp/core/_apis/cpu.py (+1223, -0)
  6. mindnlp/core/_apis/gpu.py (+1123, -0)
  7. mindnlp/core/_apis/meta.py (+379, -0)
  8. mindnlp/core/_apis/npu.py (+1596, -0)
  9. mindnlp/core/_dtype.py (+10, -2)
  10. mindnlp/core/_jit_internal.py (+3, -0)
  11. mindnlp/core/_op_prim/__init__.py (+0, -0)
  12. mindnlp/core/_op_prim/ascend/__init__.py (+0, -0)
  13. mindnlp/core/_op_prim/ascend/legacy.py (+3511, -0)
  14. mindnlp/core/_op_prim/ascend/pyboost.py (+877, -0)
  15. mindnlp/core/_op_prim/cpu/__init__.py (+0, -0)
  16. mindnlp/core/_op_prim/cpu/legacy.py (+3511, -0)
  17. mindnlp/core/_op_prim/gpu/__init__.py (+0, -0)
  18. mindnlp/core/_op_prim/gpu/legacy.py (+3511, -0)
  19. mindnlp/core/_prims/ascend/__init__.py (+2, -0)
  20. mindnlp/core/_prims/ascend/aclop.py (+82, -0)
  21. mindnlp/core/_prims/ascend/ascend.py (+0, -0)
  22. mindnlp/core/_prims/ascend/pyboost.py (+26, -0)
  23. mindnlp/core/_prims/ascend_310b/__init__.py (+0, -0)
  24. mindnlp/core/_prims/ascend_310b/ascend_310b.py (+0, -0)
  25. mindnlp/core/_prims/cpu.py (+0, -211)
  26. mindnlp/core/_prims/cpu/__init__.py (+2, -0)
  27. mindnlp/core/_prims/cpu/ms.py (+155, -0)
  28. mindnlp/core/_prims/cpu/numpy.py (+0, -0)
  29. mindnlp/core/_tensor.py (+38, -18)
  30. mindnlp/core/configs.py (+13, -3)
  31. mindnlp/core/cpu/__init__.py (+0, -0)
  32. mindnlp/core/cuda/__init__.py (+10, -1)
  33. mindnlp/core/dispatcher.py (+11, -123)
  34. mindnlp/core/executor.py (+0, -3)
  35. mindnlp/core/nn/functional.py (+79, -96)
  36. mindnlp/core/nn/init.py (+0, -1)
  37. mindnlp/core/nn/modules/adaptive.py (+5, -5)
  38. mindnlp/core/nn/modules/rnn.py (+5, -4)
  39. mindnlp/core/nn/utils/parametrize.py (+1, -1)
  40. mindnlp/core/npu/__init__.py (+3, -1)
  41. mindnlp/core/ops/array.py (+123, -100)
  42. mindnlp/core/ops/blas.py (+2, -2)
  43. mindnlp/core/ops/comparison.py (+14, -35)
  44. mindnlp/core/ops/creation.py (+16, -26)
  45. mindnlp/core/ops/inplace.py (+19, -12)
  46. mindnlp/core/ops/other.py (+169, -77)
  47. mindnlp/core/ops/pointwise.py (+21, -20)
  48. mindnlp/core/ops/random.py (+33, -58)
  49. mindnlp/core/ops/reduction.py (+19, -17)
  50. mindnlp/core/random.py (+1, -1)
  51. mindnlp/transformers/__init__.py (+3, -0)
  52. mindnlp/transformers/masking_utils.py (+0, -1)
  53. mindnlp/utils/safetensors_patch.py (+4, -2)
  54. tools/__init__.py (+0, -0)
  55. tools/op_auto_gen.py (+92, -0)

README.md (+1, -1)

@@ -70,7 +70,7 @@
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world!")
inputs = tokenizer("Hello world!", return_tensors='ms')
outputs = model(**inputs)
```



mindnlp/__init__.py (+3, -2)

@@ -18,7 +18,6 @@ MindNLP library.
"""
import os
import platform
from packaging import version

# huggingface env
if os.environ.get('HF_ENDPOINT', None) is None:
@@ -29,13 +28,15 @@ if 'RANK_TABLE_FILE' in os.environ:
del os.environ['RANK_TABLE_FILE']

import mindspore
from mindspore import context
from mindspore._c_expression import MSContext # pylint: disable=no-name-in-module, import-error
try:
from mindspore._c_expression import disable_multi_thread
except:
disable_multi_thread = None

if os.environ.get('DEVICE_TARGET', None) is not None:
mindspore.set_device(os.environ.get('DEVICE_TARGET'))
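# Example (a sketch): exporting DEVICE_TARGET=CPU before importing mindnlp makes this
# call mindspore.set_device('CPU') at import time.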

# for different ascend devices
if platform.system().lower() == 'linux':
SOC = MSContext.get_instance().get_ascend_soc_version()


mindnlp/core/_C/__init__.py (+4, -1)

@@ -198,7 +198,10 @@ class Generator:
Returns:
Current seed and offset.
"""
return self._generator(STEP, (self._seed, self._offset, step,))[:2]
outs = self._generator(STEP, (self._seed, self._offset, step,))[:2]
for o in outs:
o._device = self.device
return outs
default_generator = Generator()


mindnlp/core/_apis/__init__.py (+0, -0)


mindnlp/core/_apis/cpu.py (+1223, -0)

@@ -0,0 +1,1223 @@
import ctypes
import numbers
import math
import numpy as np
import mindspore
from mindspore._c_expression import _empty_instance
from mindnlp import core
from .._op_prim.cpu import legacy

def empty(*args, **kwargs):
return _empty_instance(*args, **kwargs, device='CPU')

def inplace_normal(input, mean, std, generator_):
out = np.random.normal(mean, std, input.shape).astype(core.dtype2np[input.dtype])
numpy_to_tensor_overwrite(out, input)

return input

def select_ext_view(input, dim, index):
return legacy.select_view(input, index, dim)

def inplace_copy(input, value):
if value.shape != input.shape:
value = legacy.fill_v2(input.shape, value)
# inplace_copy(input, value)
# t2t_overwrite(input, value)
# legacy.assign(input, value)
if hasattr(input, '_base'):
input._base.assign_value(value)
input.assign_value(value)
return input

def fill_scalar(size, fill_value, dtype):
if dtype is None:
return legacy.fill_v2(size, mindspore.Tensor(fill_value))
return legacy.cast(legacy.fill_v2(size, mindspore.Tensor(fill_value)), dtype)

def fill_tensor(size, fill_value, dtype):
return legacy.cast(legacy.fill_v2(size, fill_value), dtype)


def inplace_fill_scalar(input, value):
out = np.full_like(input.numpy(), value)
numpy_to_tensor_overwrite(out, input)
return input

def zeros_like(input, dtype):
if dtype is None:
return legacy.zeros_like(input)
return legacy.cast(legacy.zeros_like(input), dtype)

def tensor_shape(input):
return legacy.tensor_shape(input)

def arange(start, end, step, dtype):
return core.Tensor.from_numpy(np.arange(start, end, step, core.dtype2np[dtype]))

def broadcast_to(input, shape):
return legacy.broadcast_to(input, shape)

def zeros(shape, dtype):
return legacy.zeros(shape, dtype)

def inplace_uniform(input, from_, to_, generator_):
seed, _ = generator_._step(12)
np.random.seed(seed.item())
out = np.random.uniform(from_, to_, input.shape).astype(core.dtype2np[input.dtype])
numpy_to_tensor_overwrite(out, input)
return input

def sub(input, other, alpha):
return legacy.sub(input, legacy.mul(other, alpha))

def contiguous(input):
return input

def inplace_zero(input):
inplace_copy(input, legacy.zeros_like(input))
return input

def abs(input):
return legacy.abs(input)

def identity(input):
return legacy.identity(input)

def clone(input):
return cast(legacy.mul(input, 1), input.dtype)

def max(input):
return legacy.reduce_max(input, (), False)

def ones(shape, dtype):
return legacy.ones(shape, dtype)

def mean(input, dim, keepdim, dtype):
if dtype is not None:
input = legacy.cast(input, dtype)
if dim is None:
dim = ()
return legacy.reduce_mean(input, dim, keepdim)

def transpose_view(input, dim0, dim1):
"""
Transposes the input tensor along the specified dimensions.

Args:
input (Tensor): The input tensor.
dim0 (int): The first dimension to transpose.
dim1 (int): The second dimension to transpose.

Returns:
Tensor: The transposed tensor.
"""
ranks = list(range(input.ndim))
rank0 = ranks[dim0]
rank1 = ranks[dim1]
ranks[dim0] = rank1
ranks[dim1] = rank0
return legacy.transpose(input, tuple(ranks))
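# Example (a sketch): for a (2, 3, 4) tensor, transpose_view(x, 0, 2) builds the
# permutation (2, 1, 0), so legacy.transpose returns a tensor of shape (4, 3, 2).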

def matmul(self, other):
if self.ndim > 2:
if self.ndim == other.ndim:
return legacy.batch_mat_mul(self, other, False, False)
else:
self_shape = self.shape
other_shape = other.shape
if other.ndim == 2:
self = reshape(self, (-1, self_shape[-1]))
out = legacy.mat_mul(self, other, False, False)
return reshape(out, (*self_shape[:-1], out.shape[-1]))
if self.ndim == 2:
other = reshape(other, (-1, other_shape[-1]))
out = legacy.mat_mul(self, other, False, False)
return reshape(out, (*other_shape[:-1], out.shape[-1]))
return legacy.mat_mul(self, other, False, False)
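# Example (a sketch): matmul on shapes (2, 3, 4) x (4, 5) reshapes the left operand to
# (6, 4), runs legacy.mat_mul, and reshapes the result back to (2, 3, 5).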

def div(input, other):
return legacy.div(input, other)

def mul(input, other):
return legacy.mul(input, other)

def reduce_all(input, axis, keepdims):
return legacy.reduce_all(input, axis, keepdims)

def isclose(input, other, rtol, atol, equal_nan):
return legacy.is_close(input, other, rtol, atol, equal_nan)

def equal(input, other):
return legacy.reduce_all(legacy.equal(input, other), (), False)

def eq(input, other):
return legacy.equal(input, other)


def expand_dims(input, dim):
return legacy.expand_dims(input, dim)

def tile(input, dims):
return legacy.tile(input, dims)

py_slice = slice
def slice(self, dim, start, end, step):
ndim = self.ndim
begins = [0] * ndim
ends = [i for i in self.shape]
strides = [1] * ndim
begins[dim] = start
ends[dim] = end
strides[dim] = step
return legacy.strided_slice(self, tuple(begins), tuple(ends), tuple(strides), 0, 0, 0, 0, 0)
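# Example (a sketch): slice(x, dim=1, start=0, end=4, step=2) on a (3, 8) tensor calls
# strided_slice with begins=(0, 0), ends=(3, 4), strides=(1, 2), giving shape (3, 2).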

def pad_v3(input, new_pad, mode, value=None, contiguous=True):
return legacy.pad_v3(input, new_pad, value, mode, contiguous)

def cumsum(self, dim, dtype):
if self.shape[dim] == 0:
return core.tensor([], dtype=self.dtype, device=self.device)
return legacy.cum_sum(self, dim, False, False)

def reduce_any(input, axis, keepdims):
return legacy.reduce_any(input, axis, keepdims)

def concat(tensors, axis):
return legacy.concat(tensors, axis)

def numpy_to_tensor_overwrite(np_array, tensor):
if not np_array.flags.c_contiguous:
np_array = np.ascontiguousarray(np_array)

tensor_ptr = tensor.data_ptr()
ctypes.memmove(tensor_ptr, np_array.ctypes.data, tensor.nbytes)
return tensor

def t2t_overwrite(input, other):
other._device = input.device
ctypes.memmove(input.data_ptr(), other.data_ptr(), input.nbytes)
return input


def inplace_random(input, from_val=0, to_val=None, generator=None):
# pick the random number generator
rng = np.random
arr = input.numpy()
if np.issubdtype(arr.dtype, np.floating):
# floating-point dtypes
if to_val is None:
# default uniform distribution over [0, 1)
rnd = rng.random(size=arr.shape).astype(arr.dtype)
else:
rnd = (from_val + (to_val - from_val) * rng.random(size=arr.shape)).astype(arr.dtype)
elif np.issubdtype(arr.dtype, np.integer):
# integer dtypes
from_int = int(from_val)
if to_val is None:
# default range [0, dtype.max]
max_val = np.iinfo(arr.dtype).max
rnd = rng.randint(0, max_val + 1, size=arr.shape).astype(arr.dtype)
else:
# explicit range [from_int, to_int)
to_int = int(to_val)
# validate the arguments
if from_int >= to_int:
raise ValueError(f"Empty range for integers: from={from_int} >= to={to_int}")
# clamp to the dtype's integer bounds
dtype_min = np.iinfo(arr.dtype).min
dtype_max = np.iinfo(arr.dtype).max
from_int = np.clip(from_int, dtype_min, dtype_max)
to_int = np.clip(to_int, dtype_min + 1, dtype_max + 1)
rnd = rng.randint(from_int, to_int, size=arr.shape).astype(arr.dtype)
elif arr.dtype == bool:
# boolean dtypes (from_val/to_val are ignored)
rnd = rng.random(size=arr.shape) > 0.5
else:
raise TypeError(f"Unsupported data type: {arr.dtype}")
numpy_to_tensor_overwrite(rnd, input)

return input

def gather_d(input, dim, index):
return legacy.gather_d(input, dim, index)

def reshape(input, shape):
return legacy.reshape(input, shape)

def flatten(input, start_dim, end_dim):
if start_dim < 0:
start_dim = start_dim + input.ndim
if end_dim < 0:
end_dim = end_dim + input.ndim
input_shape = list(input.shape)
input_shape[start_dim:end_dim+1] = [-1]
return legacy.reshape(input, tuple(input_shape))
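# Example (a sketch): flatten(x, 1, 2) on a (2, 3, 4, 5) tensor collapses dims 1..2 into
# -1, i.e. a reshape to (2, -1, 5), giving shape (2, 12, 5).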

def sort(input, dim, descending, stable):
return legacy.sort(input, dim, descending)

def gather(input_params, input_indices, axis, batch_dim):
return legacy.gather(input_params, input_indices, axis, batch_dim)

def randint(low, high, shape, generator, dtype):
value = legacy.uniform_int(shape,
mindspore.tensor(low, dtype=mindspore.int32),
mindspore.tensor(high, dtype=mindspore.int32), 0, 0)
return value

def add(input, other, alpha=1):
if alpha == 1.0:
return legacy.add(input, other)
return legacy.add(input, legacy.mul(other, alpha))

def non_zero(input):
return legacy.non_zero(input)

def stop_gradient(input):
return legacy.stop_gradient(input)

def squeeze(input, axis):
return legacy.squeeze(input, axis)

def softmax(input, axis):
if axis is None:
axis = -1
return legacy.softmax(input, axis)

def topk(input, k, dim, largest, sorted):
if not largest:
input = -input
if dim is None or dim == input.ndim - 1:
if not largest:
res = legacy.top_k(input, k, sorted)
values, indices = -res[0], res[1]
return values, indices
return legacy.top_k(input, k, sorted)
input = transpose_view(input, dim, input.ndim - 1)
output = legacy.top_k(input, k, sorted)
values = transpose_view(output[0], dim, input.ndim - 1)
indices = transpose_view(output[1], dim, input.ndim - 1)
if not largest:
res = (-values, indices)
else:
res = (values, indices)
return res
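# Example (a sketch): with largest=False the input is negated first, legacy.top_k then
# picks the largest of the negated values, and the values are negated back so the
# smallest k elements are returned with their original sign.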

def strided_slice(input, begin, end, strides, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0):
return legacy.strided_slice(input, tuple(begin), tuple(end), tuple(strides), begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask)

def strided_slice_grad(input, begin, end, strides, update, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask):
return legacy.strided_slice_grad(update, input.shape, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask)

def masked_select(input, mask):
return legacy.masked_select(input, mask)

def stack(values, axis=0):
return legacy.stack(values, axis)

def cast(input, dtype):
return legacy.cast(input, dtype)

def less(input, other):
return legacy.less(input, other)

def select(condition, x, y):
return legacy.select(condition, x, y)

def round(input, decimals):
return legacy.round(input, decimals)

def erfinv(input):
return legacy.erfinv(input)

def erf(input):
return legacy.erf(input)

def pow_scalar_tensor(input, exponent):
return legacy.pow(input, exponent)

def inplace_add(input, other, alpha):
if alpha != 1:
return inplace_copy(input, legacy.add(input, legacy.mul(other, alpha)))
return inplace_copy(input, legacy.add(input, other))

def clamp_scalar(value, min_value, max_value):
if min_value is not None:
value = legacy.maximum(value, min_value)
if max_value is not None:
value = legacy.minimum(value, max_value)
return value

def constant_pad_nd(input, pad, value):
return legacy.pad_v3(input, pad, value, 'constant', True)

def randn(size, generator, dtype):
return cast(legacy.standard_normal(tuple(size), 0, 0), dtype)

def rand(size, generator, dtype):
return cast(legacy.uniform_real(tuple(size), 0, 0), dtype)

def tril(input, diagonal):
return legacy.tril(input, diagonal)

def dense(input, weight, bias=None):
return legacy.dense(input, weight, bias)

def relu(input):
return legacy.re_lu(input)

def assign(input, value):
return inplace_copy(input, value)

def square(input):
return legacy.square(input)

def log(input):
if not input.dtype.is_floating_point:
input = cast(input, mindspore.float32)
return legacy.log(input)

def permute(input, dims):
return legacy.transpose(input, dims)

def ones_like(input, dtype):
if dtype is not None:
return cast(legacy.ones_like(input), dtype)
return legacy.ones_like(input)

def embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq):
return cast(legacy.gather(weight, input, 0, 0), weight.dtype)

def linspace(start, end, steps, dtype):
start = float(start)
end = float(end)
return legacy.lin_space(mindspore.Tensor(start), mindspore.Tensor(end), steps)

def masked_fill(input, mask, value):
if input.dtype.is_floating_point and isinstance(value, numbers.Number):
value = float(value)
return legacy.masked_fill(input, mask, value)

def sum(input, dim, keepdim, dtype):
if dim is None:
dim = ()
if input.dtype == mindspore.bool_:
input = cast(input, mindspore.int64)
if dtype is None:
return legacy.reduce_sum(input, dim, keepdim, False)
return legacy.reduce_sum(input.astype(dtype), dim, keepdim, False)

def conv2d(input, weight, bias=None, stride=1, padding='valid', dilation=1, groups=1):
pad_mode = 'pad'
pad = padding
if isinstance(padding, (tuple, list)):
pad = (padding[0], padding[0], padding[1], padding[1])
elif isinstance(padding, int):
pad = (padding,) * 4
if not isinstance(padding, (int, tuple, list)):
pad_mode = padding
pad = (0,) * 4
if isinstance(stride, int):
stride = (stride,) * 4

out_channels = weight.shape[0]
kernel_size = weight.shape[2:]

output = legacy.conv2_d(
input, weight,
out_channels,
kernel_size,
1,#mode=1,
pad_mode, #pad_mode=pad_mode,
pad, #pad=pad,
tuple(stride), #stride=tuple(stride),
dilation, #dilation=dilation,
groups, #group=groups,
"NCHW", #data_format="NCHW"
)
if bias is not None:
output = legacy.bias_add(output, bias, "NCHW")
return output

def conv2d_padding(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
return conv2d(input, weight, bias, stride, padding, dilation, groups)

def pow_tensor_scalar(input, scalar):
return legacy.pow(input, scalar)

def rsqrt(input):
return legacy.rsqrt(input)

def layer_norm(input, normalized_shape, weight, bias, eps=1e-5):
if weight is not None:
begin_axis = input.ndim - weight.ndim
else:
begin_axis = -1
return legacy.layer_norm(input, weight, bias, begin_axis, begin_axis, eps)

def argmin_with_value(input, axis, keep_dims):
return legacy.arg_min_with_value(input, axis, keep_dims)

def argmax_with_value(input, axis, keep_dims):
return legacy.arg_max_with_value(input, axis, keep_dims)

def silu(input):
return legacy.mul(input, legacy.sigmoid(input))

def less_equal(input_x, input_y):
return legacy.less_equal(input_x, input_y)

def not_equal(input_x, input_y):
return legacy.not_equal(input_x, input_y)


def logical_not(input):
return legacy.logical_not(input)

def tensor_scatter_update(input, indices, updates):
return legacy.tensor_scatter_update(input, indices, updates)

def isinf(input):
return legacy.is_inf(input)

def gelu(input, approximate):
return legacy.ge_lu(input)

def greater(input_x, input_y):
return legacy.greater(input_x, input_y)

def greater_equal(input_x, input_y):
return legacy.greater_equal(input_x, input_y)

def eye(n, m, dtype):
return legacy.eye(n, m, dtype)

def argmax(input, axis, keep_dims):
return legacy.arg_max_with_value(input, axis, keep_dims)[0]

def argmin(input, axis, keep_dims):
return legacy.arg_min_with_value(input, axis, keep_dims)[0]

def exp(input):
return legacy.exp(input)

def split_with_size(tensor, split_sizes, dim=0):
chunks = []
start = 0
for chunk_size in split_sizes:
end = start + chunk_size
slice_obj = [py_slice(None)] * tensor.dim()
slice_obj[dim] = py_slice(start, end)
chunks.append(tensor[tuple(slice_obj)])
start = end

return tuple(chunks)
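# Example (a sketch): split_with_size(x, (2, 3), dim=0) on a (5, 4) tensor returns views
# of shape (2, 4) and (3, 4), built by indexing with Python slice objects along dim 0.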


def cos(input):
return legacy.cos(input)

def sigmoid(input):
return legacy.sigmoid(input)

def sqrt(input):
return legacy.sqrt(input)

def chunk(input, chunks, dim=0):
return legacy.split(input, dim, chunks)

def sin(input):
return legacy.sin(input)

def neg(input):
return legacy.neg(input)

def bitwise_or_tensor(input_x, input_y):
return legacy.bitwise_or(input_x, input_y)

def bitwise_and_tensor(input_x, input_y):
return legacy.bitwise_and(input_x, input_y)

def non_zero_ext(input):
out = legacy.non_zero(input)
return unbind(out, 1, out.shape[1])

def unbind(input, dim, num):
return legacy.unstack(input, dim, num)

def log1p(input):
return legacy.log1p(input)

def log_softmax(input, axis, dtype):
if dtype is not None:
input = input.astype(dtype)
return legacy.log_softmax(input, axis)

def scatter(input, dim, index, src):
return legacy.tensor_scatter_elements(input, index, src, dim, "none")

def batch_norm(input, weight, bias, running_mean=None, runnning_var=None, training=False, momentum=0.1, epsilon=1e-5):
input_ndim = input.ndim
if input_ndim == 2:
return legacy.batch_norm(input, weight, bias, running_mean, runnning_var, training, epsilon, momentum, 'NCHW')
else:
input = transpose_view(input, 1, -1)
input_shape = input.shape
input = reshape(input, (-1, input.shape[-1]))
outs = legacy.batch_norm(input, weight, bias, running_mean, runnning_var, training, epsilon, momentum, 'NCHW')
out = reshape(outs[0], (*input_shape[:-1], -1))
out = transpose_view(out, 1, -1)

return out, outs[1], outs[2]

def tanh(input):
return legacy.tanh(input)

def dropout(input, p, seed, offset):
return legacy.dropout(input, 1-p, 0, 0)

def split_tensor(input, split_size_or_sections, dim):
if isinstance(split_size_or_sections, int):
num = input.shape[dim] // split_size_or_sections
return legacy.split(input, dim, num)

def bmm(input_x, input_y):
return legacy.batch_mat_mul(input_x, input_y, False, False)

def nllloss(input, target, weight, reduction, ingore_index):
return legacy.nll_loss(input, target, weight, reduction, ingore_index)

def nllloss_2d(input, target, weight, reduction, ingore_index):
input = reshape(transpose_view(input, 1, -1), (-1, input.shape[1]))
target = reshape(target, (-1,))
out = legacy.nll_loss(input, target, weight, reduction, ingore_index)
return out


def binary_cross_entropy_with_logits(input, target, weight, posWeight, reduction):
return legacy.bce_with_logits_loss(input, target, weight, posWeight, reduction)

def std(input, dim, correction, keepdim):
if dim is None:
dim = ()
return legacy.reduce_std(input, dim, bool(correction), keepdim)[0]

def linalg_vector_norm(x, ord=2, dim=None, keepdim=False, dtype=None):
return legacy.lp_norm(x, dim, int(ord), keepdim, 1e-12)

def rfft(input, n=None, dim=-1, norm=None):
if n is None:
n = input.shape[dim]
if input.shape[dim] < n:
pad_inf = (0, n - input.shape[dim])
pad_dims = (0, 0) * (input.ndim - (dim + 1)) + pad_inf
input = constant_pad_nd(input, pad_dims, 0.)
else:
input = narrow(input, dim, 0, n)
return legacy.fft_with_size(input, input.ndim, False, True, norm, True, ())

def narrow(input, dim, start, length):
begin = [0] * input.ndim
size = [i for i in input.shape]
begin[dim] = start
size[dim] = length
return legacy.slice(input, begin, size)

def conj(input):
return legacy.conj(input)

def irfft(input, n, dim, norm):
if input.shape[dim] < n:
pad_inf = (0, n - input.shape[dim])
pad_dims = (0, 0) * (input.ndim - (dim + 1)) + pad_inf
input = constant_pad_nd(input, pad_dims, 0.)
else:
input = narrow(input, dim, 0, n)
return legacy.fft_with_size(input, input.ndim, True, True, norm, True, ())

def avg_pool1d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
if isinstance(padding, int):
padding = (0, 0, 0, 0, padding, padding)
elif isinstance(padding, tuple):
if len(padding) != 1:
raise ValueError("For avg_pool1d, padding should be int or tuple of length 1.")
padding = (0, 0, 0, 0, padding[0], padding[1])
else:
raise TypeError("For avg_pool1d, padding should be int or tuple of length 1.")

if isinstance(stride, tuple):
if len(stride) != 1:
raise ValueError("For avg_pool1d, stride should be int or tuple of length 1.")
stride = stride[0]

input = expand_dims(input, 2)
input = expand_dims(input, 2)
input = legacy.avg_pool3_d(input, (1, 1, kernel_size), (1, 1, stride), 'pad', padding, ceil_mode, count_include_pad, 0, 'NCDHW')
input = squeeze(input, (2, 3))
return input

def fmod_scalar(input, other):
return legacy.floor_mod(input, other)

def conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
pad_mode = 'pad'
pad = padding
if isinstance(padding, tuple):
pad = (0, 0, padding[0], padding[0])
elif isinstance(padding, int):
pad = (0, 0) + (padding,) * 2
if not isinstance(padding, (int, tuple)):
pad_mode = padding
pad = (0,) * 4

input = expand_dims(input, 2)
weight = expand_dims(weight, 2)

output = legacy.conv2_d(
input, weight,
weight.shape[0],
(1, weight.shape[-1]),
1,#mode=1,
pad_mode, #pad_mode=pad_mode,
pad, #pad=pad,
(1, stride) if isinstance(stride, int) else (1, *stride), #stride=tuple(stride),
(1, dilation) if isinstance(dilation, int) else (1, *dilation), #dilation=dilation,
groups, #group=groups,
"NCHW", #data_format="NCHW"
)


if bias is not None:
output = legacy.bias_add(output, bias, "NCHW")

output = squeeze(output, 2)
return output

def maximum(input, other):
return legacy.maximum(input, other)

def prod(input, axis, keepdims, dtype):
if axis is None:
axis = ()
return legacy.reduce_prod(input, axis, keepdims)

def mse_loss(input, target, reduction):
x = square(input - target)
average_flag = True
reduce_flag = True
if reduction == 'sum':
average_flag = False
if reduction == 'none':
reduce_flag = False

if reduce_flag and average_flag:
x = mean(x, tuple(range(x.ndim)), False, None)

if reduce_flag and not average_flag:
x = sum(x, tuple(range(x.ndim)), False, None)

return x

def adaptive_avg_pool2d(input, output_size):
return legacy.adaptive_avg_pool2_d(input, output_size)

def avg_pool2d(input, kernel_size, stride, padding=0, ceil_mode=False, count_include_pad=True, divisor_override=None):
if isinstance(padding, int):
padding = (0, 0, padding, padding, padding, padding)
elif isinstance(padding, tuple):
if len(padding) != 1:
raise ValueError("For avg_pool1d, padding should be int or tuple of length 1.")
padding = (0, 0, padding[0], padding[1], padding[2], padding[3])
else:
raise TypeError("For avg_pool1d, padding should be int or tuple of length 1.")

if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
if isinstance(stride, int):
stride = (stride, stride)

input = expand_dims(input, 2)
input = legacy.avg_pool3_d(input, (1, *kernel_size), (1, *stride), 'pad', padding, ceil_mode, count_include_pad, 0, 'NCDHW')
input = squeeze(input, 2)
return input

def bitwise_or_scalar(input, value):
return legacy.bitwise_or(input, value)

def floor_div(input, other):
return legacy.floor_div(input, other)

def minimum(input, other):
return legacy.minimum(input, other)

def reverse_v2(input, axis):
if isinstance(axis, int):
axis = (axis,)
return legacy.reverse_v2(input, axis)

def divmod(input, other, rounding_mode):
if rounding_mode == 'floor':
return legacy.floor_div(input, other)
elif rounding_mode == 'trunc':
if isinstance(input, numbers.Number):
input = mindspore.Tensor(input)
return legacy.truncate_div(input, other)
else:
raise ValueError(f'Invalid rounding mode: {rounding_mode}')

def pow(input, exponent):
return legacy.pow(input, exponent)


def bitwise_and_scalar(input, value):
return legacy.bitwise_and(input, value)

def rand_like(input, generator, dtype):
return rand(input.shape, generator, dtype)

def bincount(input, weights=None, minlength=0):
if weights is None:
weights = mindspore.Tensor(1, dtype=mindspore.int32)
return legacy.bincount(cast(input, mindspore.int32),
mindspore.Tensor(minlength, dtype=mindspore.int32),
weights)

def lgamma(input):
return legacy.lgamma(input)

def _deconv_output_length(pad_mode, filter_size, stride_size, dilation_size, padding):
"""Calculate the width and height of output."""
length = 0
filter_size = filter_size + (filter_size - 1) * (dilation_size - 1)
if pad_mode == 'valid':
if filter_size - stride_size > 0:
length = filter_size - stride_size
elif pad_mode == 'pad':
length = - padding + filter_size - stride_size

return length
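# Example (a sketch): with pad_mode='valid', filter_size=4, stride_size=2 and
# dilation_size=1, the effective filter stays 4, so the extra output length is 4 - 2 = 2.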


def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
pad_mode = 'pad'
pad = padding
if isinstance(padding, tuple):
pad = (0, 0, padding[0], padding[0])
elif isinstance(padding, int):
pad = (0, 0) + (padding,) * 2
if not isinstance(padding, (int, tuple)):
pad_mode = padding
pad = (0,) * 4

if isinstance(dilation, int):
dilation = (dilation, dilation)

in_channel, out_channels = weight.shape[0], weight.shape[1] * groups
kernel_size = weight.shape[2:]

n, _, h, w = input.shape
h_add = _deconv_output_length(pad_mode, kernel_size[0], stride[0], dilation[0], pad[0] + pad[1])
w_add = _deconv_output_length(pad_mode, kernel_size[1], stride[1], dilation[1], pad[2] + pad[3])

out = legacy.conv2_d_transpose(
input, weight,
(n, out_channels, h * stride[0] + h_add, w * stride[1] + w_add),
out_channels,
kernel_size,
pad_mode,
pad,
None,
1,
stride,
dilation,
groups,
'NCHW'
)
if bias is not None:
out = legacy.bias_add(out, bias, 'NCHW')
return out

def expm1(x):
return legacy.expm1(x)

py_min = min
def min(input):
return legacy.reduce_min(input, (), False)

def acos(x):
return legacy.a_cos(x)

def upsample_bilinear2d(input, size=None, scale_factor=None, align_corners=False):
return legacy.resize_bilinear_v2(input, size, align_corners, not align_corners)

def unstack_view(input, dim):
return legacy.unstack(input, dim, input.shape[dim])

def triu(input, diagonal=0):
return legacy.triu(input, diagonal)

def masked_scatter(input, mask, value):
return legacy.masked_scatter(input, mask, value)

def max_pool2d(input, kernel_size, stride=1, padding=0, dilation=1, ceil_mode=False, return_indices=False):
out, indices = legacy.max_pool_with_argmax_v2(input, kernel_size, stride, padding, dilation, ceil_mode, mindspore.int64)

if return_indices:
return out, indices
return out

def baddbmm(input, batch1, batch2, alpha=1, beta=1):
return add(mul(beta, input), mul(alpha, bmm(batch1, batch2)))

def inplace_fill_tensor(input, value):
out = np.full_like(input.numpy(), value)
numpy_to_tensor_overwrite(out, input)
return input

def softplus(input, beta=1, threshold=20):
return legacy.softplus(input)

def gather_nd(input, indices):
return legacy.gather_nd(input, indices)

def unique_consecutive(input, return_inverse, return_counts, dim):
return legacy.unique_consecutive(input, return_inverse, return_counts, dim)

def meshgrid(input, lambd):
return legacy.meshgrid(input, lambd)

def addcmul(input, tensor1, tensor2, value=1.0):
return legacy.addcmul(input, tensor1, tensor2, mindspore.Tensor(value))

def addmm(input, mat1, mat2, alpha=1.0, beta=1.0):
return add(mul(beta, input), mul(alpha, bmm(mat1, mat2)))

def im2col(input, kernel_size, dilation=1, padding=0, stride=1):
out = legacy.im2_col(input, kernel_size, stride, dilation, padding)
out_shape = out.shape[:1] + (-1,) + out.shape[-1:]
out = reshape(out, out_shape)
return out

def floor(input):
return legacy.floor(input)

def upsample_nearest2d(input, output_size, scale_factors):
if output_size is None:
tuple_len = py_min(len(input.shape) - 2, len(scale_factors))
output_size = tuple([math.floor(input.shape[i + 2] * scale_factors[i])
for i in range(tuple_len)])

return legacy.resize_nearest_neighbor(input, output_size, False, False)
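# Example (a sketch): for an input of shape (1, 3, 10, 10) and scale_factors=(2, 2),
# output_size is computed as (20, 20) before calling resize_nearest_neighbor.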

def upsample_bicubic2d(input, size=None, scale_factor=None, align_corners=False):
return legacy.resize_bicubic(input, size, align_corners, not align_corners)

def conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
pad_mode = 'pad'
pad = padding
if isinstance(padding, (tuple, list)):
pad = (padding[0], padding[0], padding[1], padding[1], padding[2], padding[2])
elif isinstance(padding, int):
pad = (padding,) * 6
if not isinstance(padding, (int, tuple, list)):
pad_mode = padding
pad = (0,) * 6

out_channels = weight.shape[0]
kernel_size = weight.shape[2:]

output = legacy.conv3_d(input, weight,
out_channels,
kernel_size,
1,
pad_mode,
pad,
tuple(stride),
dilation,
groups,
"NCDHW")
if bias is not None:
output = legacy.bias_add(output, bias, 'NCHW')
return output



def normal_float_float(mean, std, size, dtype, generator):
out = np.random.normal(mean, std, size).astype(core.dtype2np[dtype])
out = mindspore.Tensor(out)
return out

def normal_tensor_tensor(mean, std, size, dtype, generator):
out = np.random.normal(mean.item(), std.item(), size).astype(core.dtype2np[dtype])
out = mindspore.Tensor(out)
return out

def inplace_relu(input):
return legacy.assign(input, legacy.re_lu(input))

def adaptive_avg_pool1d(input, output_size):
x_in_shape = input.shape
width = x_in_shape[2]
stride = width // output_size
kernel_size = width - (output_size - 1) * stride
stride = (1, width // output_size)
kernel_size = (1, kernel_size)
input = expand_dims(input, 2)
input = legacy.avg_pool(input, kernel_size, stride, "VALID", "NCHW")
input = squeeze(input, 2)
return input

def remainder_tensor_scalar(input, other):
out = sub(input, mul(floor_div(input, other), other), 1)
return out
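# Example (a sketch): remainder_tensor_scalar(7, 3) evaluates 7 - floor(7 / 3) * 3 = 1,
# matching the floor-division definition of the remainder.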

def outer(input, other):
input = reshape(input, (-1, 1))
y = mul(input, other)
return y

def view_as_complex(input):
real_part, imag_part = chunk(input, 2, -1)
return legacy.complex(squeeze(real_part, -1), squeeze(imag_part, -1))

def cdist(x1, x2, p):
return legacy.cdist(x1, x2, float(p))

def prelu(input, weight):
return legacy.p_re_lu(input, weight)

def reciprocal(input):
return legacy.reciprocal(input)

def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, reduction, zero_infinity):
loss, log_alpha = legacy.ctc_loss_v2(log_probs, targets, input_lengths, target_lengths, blank, 'none', zero_infinity)
if reduction == 'sum':
loss = sum(loss, (), False, None)
if reduction == 'mean':
# input_type = loss.dtype
# target_length_t = target_lengths.clip(1., None)
# loss = loss.astype("float32")
loss = div(loss, target_lengths)
loss = mean(loss, (), False, None)
# loss = loss.astype(input_type)
return (loss, log_alpha)

def glu(input, dim=-1):
return legacy.glu(input, dim)

def one_hot(tensor, num_classes):
on_value = mindspore.Tensor(1, dtype=tensor.dtype)
off_value = mindspore.Tensor(0, dtype=tensor.dtype)
return legacy.one_hot(tensor, num_classes, on_value, off_value, -1)

def polar(abs, angle):
return legacy.polar(abs, angle)

def scatter_value(input, dim, index, src, reduce='none'):
if isinstance(src, numbers.Number):
src = fill_scalar(index.shape, src, dtype=input.dtype)
return legacy.tensor_scatter_elements(input, index, src, dim, reduce)

def pixel_shuffle(input, upscale_factor):
idx = input.shape
length = input.ndim
pre = idx[:-3]
c, h, w = idx[-3:]
c = c // upscale_factor ** 2
input_perm = pre + (c, upscale_factor, upscale_factor, h, w)
input = reshape(input, input_perm)
input_perm = [i for i in range(length - 2)]
input_perm = input_perm + [length, length - 2, length + 1, length - 1]
input_perm = tuple(input_perm)
input = permute(input, input_perm)
input = reshape(input, (pre + (c, upscale_factor * h, upscale_factor * w)))
return input
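# Example (a sketch): pixel_shuffle on a (1, 8, 4, 4) input with upscale_factor=2
# reshapes to (1, 2, 2, 2, 4, 4), permutes with (0, 1, 4, 2, 5, 3) and returns (1, 2, 8, 8).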

def rms_norm(input, weight, eps=1e-5):
input_dtype = input.dtype
input = cast(input, mindspore.float32)
variance = mean(pow(input, 2), -1, True, None)
input = mul(input, rsqrt(add(variance, eps, 1)))
return mul(weight, cast(input, input_dtype))

def count_nonzero(input, dims):
return legacy.count_non_zero(input, dims)

def index_add_ext(input, dim, index, source, alpha):
if alpha != 1:
source = mul(alpha, source)
return legacy.index_add(input, cast(index, mindspore.int32), source, dim, True, True)

def real(input):
return legacy.real(input)

def upsample_linear1d(input, output_size, scale_factor, align_corners=False):
coordinate_transformation_mode = "align_corners" if align_corners else "half_pixel"
return legacy.resize_linear1_d(input, output_size, coordinate_transformation_mode)

def imag(input):
return legacy.imag(input)

def bitwise_xor_tensor(input, other):
return legacy.bitwise_xor(input, other)

def grid_sampler_2d(input, grid, mode='bilinear', padding_mode='zeros', align_corners=False):
return legacy.grid_sampler2_d(input, grid, mode, padding_mode, align_corners)

def l1_loss(input, target, reduction='mean'):
loss = abs(sub(input, target))
if reduction == 'mean':
return mean(loss, (), False, False)
elif reduction == 'sum':
return sum(loss, (), False, False)
return loss

def leaky_relu(input, negative_slope):
select_op = maximum
if negative_slope > 1:
select_op = minimum
return select_op(mul(negative_slope, input), input)

def ceil(input):
return legacy.ceil(input)

def reduce_max(input, axis, keepdims):
return legacy.reduce_max(input, axis, keepdims)

def nan_to_num(input, nan=0.0, posinf=None, neginf=None):
return legacy.nan_to_num(input, nan, posinf, neginf)

def elu(input, alpha):
return legacy.elu(input, alpha)

def sign(input):
return legacy.sign(input)

def inplace_fill_diagonal(input, fill_value, wrap):
inplace_copy(input, legacy.fill_diagonal(input, float(fill_value), wrap))
return input

def clamp_tensor(value, min_value, max_value):
if min_value is not None:
value = legacy.maximum(value, min_value)
if max_value is not None:
value = legacy.minimum(value, max_value)
return value

def lstm(input, h, c, w, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout, proj_size):
return legacy.lstm(input, h, c, w, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout, proj_size)

def var(input, dim=None, correction=1, keepdim=False):
if dim is None:
input_mean = mean(input, (), False, None)
else:
input_mean = mean(input, dim=dim, keepdim=True, dtype=None)
# squared deviation from the mean
squared_diff = pow(sub(input, input_mean, 1), 2)
# compute the variance
if dim is None:
variance = mean(squared_diff, (), False, None)
n = input.numel() # total number of elements
else:
variance = mean(squared_diff, dim=dim, keepdim=keepdim, dtype=None)
n = input.size(dim) # number of elements along the given dim
# Bessel's correction for the unbiased estimate
if correction and n > 1:
variance = mul(variance, (n / (n - 1)))
return variance
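# Example (a sketch): for x = [1., 2., 3., 4.] the mean is 2.5, the mean squared
# deviation is 1.25, and with correction=1 (n=4) it is scaled by 4/3 to ~1.6667.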

def log2(input):
return div(log(input), math.log(2))

def bucketize(input, boundaries, right=False):
epsilon_ = 0. if right else 1.e-6
boundaries = [boundary + epsilon_ for boundary in boundaries]
return legacy.bucketize(input, boundaries)

def col2im(input, output_size, kernel_size, dilation=1, padding=0, stride=1):
return legacy.col2_im(input, output_size, kernel_size, dilation, padding, stride)

def randperm(n, generator, dtype):
seed, offset = generator._step(12) # pylint: disable=protected-access
return legacy.randperm_v2(n, seed, offset, dtype)

def gamma(shape, alpha, beta):
out = np.random.gamma(alpha, 1/beta, shape)
return core.Tensor.from_numpy(out)

def logical_or(input_x, input_y):
return legacy.logical_or(input_x, input_y)

def hswish(input):
return legacy.h_swish(input)

def logical_and(input_x, input_y):
return legacy.logical_and(input_x, input_y)

def logsigmoid(input):
output = sigmoid(input)
ret = log(output)
return ret

def as_strided(input, size, stride, storage_offset):
if len(size) != len(stride):
raise RuntimeError("mismatch in length of strides and shape.")
index = np.arange(0, size[0]*stride[0], stride[0])
for i in np.arange(1, len(size)):
tmp = np.arange(0, size[i]*stride[i], stride[i])
index = np.expand_dims(index, -1)
index = index + tmp
if storage_offset is not None:
index = index + storage_offset

if index.size == 0:
input_indices = mindspore.numpy.empty(index.shape, dtype=mindspore.int32)
else:
input_indices = mindspore.tensor(index.astype(np.int32))
out = gather(reshape(input, (-1,)), input_indices, 0, 0)
return out
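# Example (a sketch): as_strided(x, size=(2, 2), stride=(3, 1), storage_offset=0) on a
# flattened length-6 tensor gathers the flat indices [[0, 1], [3, 4]].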

def dropout2d(input_x, p):
return legacy.dropout2_d(input_x, p)

def linalg_qr(input_x, mode):
full_matrices = mode == 'complete'
return legacy.qr(input_x, full_matrices)

def diag(input, diagonal):
out = np.diag(input.numpy(), diagonal)
return core.Tensor.from_numpy(out)

def logit(input, eps=1e-5):
return legacy.logit(input, eps)

def relu6(input):
return legacy.re_lu6(input)

def logsumexp(input, dim, keepdim=False):
input_max = legacy.reduce_max(input, dim, keepdim)
input_exp = exp(sub(input, input_max))
input_sumexp = sum(input_exp, dim, keepdim, None)
input_logsumexp = log(input_sumexp)
if not keepdim:
input_max = squeeze(input_max, dim)
return add(input_logsumexp, input_max)
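# Example (a sketch): the max is subtracted before exp() so the sum stays finite, using
# logsumexp(x) = max(x) + log(sum(exp(x - max(x)))).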

def bernoulli(input, generator):
seed, offset = generator._step(12) # pylint: disable=protected-access
return legacy.bernoulli(input, seed, offset)

mindnlp/core/_apis/gpu.py (+1123, -0)

@@ -0,0 +1,1123 @@
import ctypes
import numbers
import math
import mindspore
from mindspore._c_expression import _empty_instance
from mindnlp import core
from .._op_prim.cpu import legacy

try:
from mindspore._c_expression import TensorPy as Tensor_
except:
from mindspore._c_expression import Tensor as Tensor_


def empty(size, dtype):
return mindspore.Tensor(Tensor_(shape=size, dtype=dtype))
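# Example (a sketch): empty((2, 3), mindspore.float32) constructs an uninitialized
# Tensor_ of that shape and dtype and wraps it in a mindspore.Tensor.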

def select_ext_view(input, dim, index):
return legacy.select_view(input, index, dim)

def inplace_copy(input, value):
if value.shape != input.shape:
value = legacy.fill_v2(input.shape, value)
# inplace_copy(input, value)
# legacy.assign(input, value)
if hasattr(input, '_base'):
input._base.assign_value(value)
input.assign_value(value)
return input

def fill_scalar(size, fill_value, dtype):
if dtype is None:
return legacy.fill_v2(size, mindspore.Tensor(fill_value))
return legacy.cast(legacy.fill_v2(size, mindspore.Tensor(fill_value)), dtype)

def fill_tensor(size, fill_value, dtype):
return legacy.cast(legacy.fill_v2(size, fill_value), dtype)

def zeros_like(input, dtype):
if dtype is None:
return legacy.zeros_like(input)
return legacy.cast(legacy.zeros_like(input), dtype)

def tensor_shape(input):
return legacy.tensor_shape(input)

def broadcast_to(input, shape):
return legacy.broadcast_to(input, shape)

def zeros(shape, dtype):
return legacy.zeros(shape, dtype)

def sub(input, other, alpha=1):
return legacy.sub(input, legacy.mul(other, alpha))

def contiguous(input):
return input

def inplace_zero(input):
inplace_copy(input, legacy.zeros_like(input))
return input

def abs(input):
return legacy.abs(input)

def identity(input):
return legacy.identity(input)

def clone(input):
return cast(legacy.mul(input, 1), input.dtype)

def max(input):
return legacy.reduce_max(input, (), False)

def ones(shape, dtype):
return legacy.ones(shape, dtype)

def mean(input, dim, keepdim, dtype):
if dtype is not None:
input = legacy.cast(input, dtype)
if dim is None:
dim = ()
return legacy.reduce_mean(input, dim, keepdim)

def transpose_view(input, dim0, dim1):
"""
Transposes the input tensor along the specified dimensions.

Args:
input (Tensor): The input tensor.
dim0 (int): The first dimension to transpose.
dim1 (int): The second dimension to transpose.

Returns:
Tensor: The transposed tensor.
"""
ranks = list(range(input.ndim))
rank0 = ranks[dim0]
rank1 = ranks[dim1]
ranks[dim0] = rank1
ranks[dim1] = rank0
return legacy.transpose(input, tuple(ranks))

def matmul(self, other):
if self.ndim > 2:
if self.ndim == other.ndim:
return legacy.batch_mat_mul(self, other, False, False)
else:
self_shape = self.shape
other_shape = other.shape
if other.ndim == 2:
self = reshape(self, (-1, self_shape[-1]))
out = legacy.mat_mul(self, other, False, False)
return reshape(out, (*self_shape[:-1], out.shape[-1]))
if self.ndim == 2:
other = reshape(other, (-1, other_shape[-1]))
out = legacy.mat_mul(self, other, False, False)
return reshape(out, (*other_shape[:-1], out.shape[-1]))
return legacy.mat_mul(self, other, False, False)

def div(input, other):
return legacy.div(input, other)

def mul(input, other):
return legacy.mul(input, other)

def reduce_all(input, axis, keepdims):
return legacy.reduce_all(input, axis, keepdims)

def isclose(input, other, rtol, atol, equal_nan):
return legacy.is_close(input, other, rtol, atol, equal_nan)

def equal(input, other):
return legacy.reduce_all(legacy.equal(input, other), (), False)

def eq(input, other):
return legacy.equal(input, other)


def expand_dims(input, dim):
return legacy.expand_dims(input, dim)

def tile(input, dims):
return legacy.tile(input, dims)

py_slice = slice
def slice(self, dim, start, end, step):
ndim = self.ndim
begins = [0] * ndim
ends = [i for i in self.shape]
strides = [1] * ndim
begins[dim] = start
ends[dim] = end
strides[dim] = step
return legacy.strided_slice(self, tuple(begins), tuple(ends), tuple(strides), 0, 0, 0, 0, 0)

def pad_v3(input, new_pad, mode, value=None, contiguous=True):
return legacy.pad_v3(input, new_pad, value, mode, contiguous)

def cumsum(self, dim, dtype):
if self.shape[dim] == 0:
return core.tensor([], dtype=self.dtype, device=self.device)
return legacy.cum_sum(self, dim, False, False)

def reduce_any(input, axis, keepdims):
return legacy.reduce_any(input, axis, keepdims)

def concat(tensors, axis):
return legacy.concat(tensors, axis)

def gather_d(input, dim, index):
return legacy.gather_d(input, dim, index)

def reshape(input, shape):
return legacy.reshape(input, shape)

def flatten(input, start_dim, end_dim):
if start_dim < 0:
start_dim = start_dim + input.ndim
if end_dim < 0:
end_dim = end_dim + input.ndim
input_shape = list(input.shape)
input_shape[start_dim:end_dim+1] = [-1]
return legacy.reshape(input, tuple(input_shape))

def sort(input, dim, descending, stable):
return legacy.sort(input, dim, descending)

def gather(input_params, input_indices, axis, batch_dim):
return legacy.gather(input_params, input_indices, axis, batch_dim)

def randint(low, high, shape, generator, dtype):
value = legacy.uniform_int(shape,
mindspore.tensor(low, dtype=mindspore.int32),
mindspore.tensor(high, dtype=mindspore.int32), 0, 0)
return value

def add(input, other, alpha=1):
if alpha == 1.0:
return legacy.add(input, other)
return legacy.add(input, legacy.mul(other, alpha))

def non_zero(input):
return legacy.non_zero(input)

def stop_gradient(input):
return legacy.stop_gradient(input)

def squeeze(input, axis):
return legacy.squeeze(input, axis)

def softmax(input, axis):
if axis is None:
axis = -1
return legacy.softmax(input, axis)

def topk(input, k, dim, largest, sorted):
if not largest:
input = -input
if dim is None or dim == input.ndim - 1:
if not largest:
res = legacy.top_k(input, k, sorted)
values, indices = -res[0], res[1]
return values, indices
return legacy.top_k(input, k, sorted)
input = transpose_view(input, dim, input.ndim - 1)
output = legacy.top_k(input, k, sorted)
values = transpose_view(output[0], dim, input.ndim - 1)
indices = transpose_view(output[1], dim, input.ndim - 1)
if not largest:
res = (-values, indices)
else:
res = (values, indices)
return res

def strided_slice(input, begin, end, strides, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0):
return legacy.strided_slice(input, tuple(begin), tuple(end), tuple(strides), begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask)

def strided_slice_grad(input, begin, end, strides, update, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask):
return legacy.strided_slice_grad(update, input.shape, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask)

def masked_select(input, mask):
return legacy.masked_select(input, mask)

def stack(values, axis=0):
return legacy.stack(values, axis)

def cast(input, dtype):
return legacy.cast(input, dtype)

def less(input, other):
return legacy.less(input, other)

def select(condition, x, y):
return legacy.select(condition, x, y)

def round(input, decimals):
return legacy.round(input, decimals)

def erfinv(input):
return legacy.erfinv(input)

def erf(input):
return legacy.erf(input)

def pow_scalar_tensor(input, exponent):
return legacy.pow(input, exponent)

def inplace_add(input, other, alpha):
if alpha != 1:
return inplace_copy(input, legacy.add(input, legacy.mul(other, alpha)))
return inplace_copy(input, legacy.add(input, other))

def clamp_scalar(value, min_value, max_value):
if min_value is not None:
value = legacy.maximum(value, min_value)
if max_value is not None:
value = legacy.minimum(value, max_value)
return value

def constant_pad_nd(input, pad, value):
return legacy.pad_v3(input, pad, value, 'constant', True)

def randn(size, generator, dtype):
return cast(legacy.standard_normal(tuple(size), 0, 0), dtype)

def rand(size, generator, dtype):
return cast(legacy.uniform_real(tuple(size), 0, 0), dtype)

def tril(input, diagonal):
return legacy.tril(input, diagonal)

def dense(input, weight, bias=None):
return legacy.dense(input, weight, bias)

def relu(input):
return legacy.re_lu(input)

def assign(input, value):
return inplace_copy(input, value)

def square(input):
return legacy.square(input)

def log(input):
if not input.dtype.is_floating_point:
input = cast(input, mindspore.float32)
return legacy.log(input)

def permute(input, dims):
return legacy.transpose(input, dims)

def ones_like(input, dtype):
if dtype is not None:
return cast(legacy.ones_like(input), dtype)
return legacy.ones_like(input)

def embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq):
return cast(legacy.gather(weight, input, 0, 0), weight.dtype)

def linspace(start, end, steps, dtype):
start = float(start)
end = float(end)
return legacy.lin_space(mindspore.Tensor(start), mindspore.Tensor(end), steps)

def masked_fill(input, mask, value):
if input.dtype.is_floating_point and isinstance(value, numbers.Number):
value = float(value)
return legacy.masked_fill(input, mask, value)

def sum(input, dim, keepdim, dtype):
if dim is None:
dim = ()
if input.dtype == mindspore.bool_:
input = cast(input, mindspore.int64)
if dtype is None:
return legacy.reduce_sum(input, dim, keepdim, False)
return legacy.reduce_sum(input.astype(dtype), dim, keepdim, False)

def conv2d(input, weight, bias=None, stride=1, padding='valid', dilation=1, groups=1):
pad_mode = 'pad'
pad = padding
if isinstance(padding, (tuple, list)):
pad = (padding[0], padding[0], padding[1], padding[1])
elif isinstance(padding, int):
pad = (padding,) * 4
if not isinstance(padding, (int, tuple, list)):
pad_mode = padding
pad = (0,) * 4
if isinstance(stride, int):
stride = (stride,) * 4

out_channels = weight.shape[0]
kernel_size = weight.shape[2:]

output = legacy.conv2_d(
input, weight,
out_channels,
kernel_size,
1,#mode=1,
pad_mode, #pad_mode=pad_mode,
pad, #pad=pad,
tuple(stride), #stride=tuple(stride),
dilation, #dilation=dilation,
groups, #group=groups,
"NCHW", #data_format="NCHW"
)
if bias is not None:
output = legacy.bias_add(output, bias, "NCHW")
return output

def conv2d_padding(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
return conv2d(input, weight, bias, stride, padding, dilation, groups)

def pow_tensor_scalar(input, scalar):
return legacy.pow(input, scalar)

def rsqrt(input):
return legacy.rsqrt(input)

def layer_norm(input, normalized_shape, weight, bias, eps=1e-5):
if weight is not None:
begin_axis = input.ndim - weight.ndim
else:
begin_axis = -1
return legacy.layer_norm(input, weight, bias, begin_axis, begin_axis, eps)

def argmin_with_value(input, axis, keep_dims):
return legacy.arg_min_with_value(input, axis, keep_dims)

def argmax_with_value(input, axis, keep_dims):
return legacy.arg_max_with_value(input, axis, keep_dims)

def silu(input):
return legacy.mul(input, legacy.sigmoid(input))

def less_equal(input_x, input_y):
return legacy.less_equal(input_x, input_y)

def not_equal(input_x, input_y):
return legacy.not_equal(input_x, input_y)


def logical_not(input):
return legacy.logical_not(input)

def tensor_scatter_update(input, indices, updates):
return legacy.tensor_scatter_update(input, indices, updates)

def isinf(input):
return legacy.is_inf(input)

def gelu(input, approximate):
return legacy.ge_lu(input)

def greater(input_x, input_y):
return legacy.greater(input_x, input_y)

def greater_equal(input_x, input_y):
return legacy.greater_equal(input_x, input_y)

def eye(n, m, dtype):
return legacy.eye(n, m, dtype)

def argmax(input, axis, keep_dims):
return legacy.arg_max_with_value(input, axis, keep_dims)[0]

def argmin(input, axis, keep_dims):
return legacy.arg_min_with_value(input, axis, keep_dims)[0]

def exp(input):
return legacy.exp(input)

def split_with_size(tensor, split_sizes, dim=0):
chunks = []
start = 0
for chunk_size in split_sizes:
end = start + chunk_size
slice_obj = [py_slice(None)] * tensor.dim()
slice_obj[dim] = py_slice(start, end)
chunks.append(tensor[tuple(slice_obj)])
start = end

return tuple(chunks)


def cos(input):
return legacy.cos(input)

def sigmoid(input):
return legacy.sigmoid(input)

def sqrt(input):
return legacy.sqrt(input)

def chunk(input, chunks, dim=0):
return legacy.split(input, dim, chunks)

def sin(input):
return legacy.sin(input)

def neg(input):
return legacy.neg(input)

def bitwise_or_tensor(input_x, input_y):
return legacy.bitwise_or(input_x, input_y)

def bitwise_and_tensor(input_x, input_y):
return legacy.bitwise_and(input_x, input_y)

def non_zero_ext(input):
out = legacy.non_zero(input)
return unbind(out, 1, out.shape[1])

def unbind(input, dim, num):
return legacy.unstack(input, dim, num)

def log1p(input):
return legacy.log1p(input)

def log_softmax(input, axis, dtype):
if dtype is not None:
input = input.astype(dtype)
return legacy.log_softmax(input, axis)

def scatter(input, dim, index, src):
return legacy.tensor_scatter_elements(input, index, src, dim, "none")

def batch_norm(input, weight, bias, running_mean=None, runnning_var=None, training=False, momentum=0.1, epsilon=1e-5):
input_ndim = input.ndim
if input_ndim == 2:
return legacy.batch_norm(input, weight, bias, running_mean, runnning_var, training, epsilon, momentum, 'NCHW')
else:
input = transpose_view(input, 1, -1)
input_shape = input.shape
input = reshape(input, (-1, input.shape[-1]))
outs = legacy.batch_norm(input, weight, bias, running_mean, runnning_var, training, epsilon, momentum, 'NCHW')
out = reshape(outs[0], (*input_shape[:-1], -1))
out = transpose_view(out, 1, -1)

return out, outs[1], outs[2]

def tanh(input):
return legacy.tanh(input)

def dropout(input, p, seed, offset):
return legacy.dropout(input, 1-p, 0, 0)

def split_tensor(input, split_size_or_sections, dim):
if isinstance(split_size_or_sections, int):
num = input.shape[dim] // split_size_or_sections
return legacy.split(input, dim, num)

def bmm(input_x, input_y):
return legacy.batch_mat_mul(input_x, input_y, False, False)

def nllloss(input, target, weight, reduction, ingore_index):
return legacy.nll_loss(input, target, weight, reduction, ingore_index)

def nllloss_2d(input, target, weight, reduction, ingore_index):
input = reshape(transpose_view(input, 1, -1), (-1, input.shape[1]))
target = reshape(target, (-1,))
out = legacy.nll_loss(input, target, weight, reduction, ingore_index)
return out


def binary_cross_entropy_with_logits(input, target, weight, posWeight, reduction):
return legacy.bce_with_logits_loss(input, target, weight, posWeight, reduction)

def std(input, dim, correction, keepdim):
if dim is None:
dim = ()
return legacy.reduce_std(input, dim, bool(correction), keepdim)[0]

def linalg_vector_norm(x, ord=2, dim=None, keepdim=False, dtype=None):
return legacy.lp_norm(x, dim, int(ord), keepdim, 1e-12)

def rfft(input, n=None, dim=-1, norm=None):
if n is None:
n = input.shape[dim]
if input.shape[dim] < n:
pad_inf = (0, n - input.shape[dim])
pad_dims = (0, 0) * (input.ndim - (dim + 1)) + pad_inf
input = constant_pad_nd(input, pad_dims, 0.)
else:
input = narrow(input, dim, 0, n)
return legacy.fft_with_size(input, input.ndim, False, True, norm, True, ())

def narrow(input, dim, start, length):
begin = [0] * input.ndim
size = [i for i in input.shape]
begin[dim] = start
size[dim] = length
return legacy.slice(input, begin, size)

def conj(input):
return legacy.conj(input)

def irfft(input, n, dim, norm):
if input.shape[dim] < n:
pad_inf = (0, n - input.shape[dim])
pad_dims = (0, 0) * (input.ndim - (dim + 1)) + pad_inf
input = constant_pad_nd(input, pad_dims, 0.)
else:
input = narrow(input, dim, 0, n)
return legacy.fft_with_size(input, input.ndim, True, True, norm, True, ())

def avg_pool1d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
if isinstance(padding, int):
padding = (0, 0, 0, 0, padding, padding)
elif isinstance(padding, tuple):
if len(padding) != 1:
raise ValueError("For avg_pool1d, padding should be int or tuple of length 1.")
padding = (0, 0, 0, 0, padding[0], padding[1])
else:
raise TypeError("For avg_pool1d, padding should be int or tuple of length 1.")

if isinstance(stride, tuple):
if len(stride) != 1:
raise ValueError("For avg_pool1d, stride should be int or tuple of length 1.")
stride = stride[0]

input = expand_dims(input, 2)
input = expand_dims(input, 2)
input = legacy.avg_pool3_d(input, (1, 1, kernel_size), (1, 1, stride), 'pad', padding, ceil_mode, count_include_pad, 0, 'NCDHW')
input = squeeze(input, (2, 3))
return input

def fmod_scalar(input, other):
return legacy.floor_mod(input, other)

def conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
pad_mode = 'pad'
pad = padding
if isinstance(padding, tuple):
pad = (0, 0, padding[0], padding[0])
elif isinstance(padding, int):
pad = (0, 0) + (padding,) * 2
if not isinstance(padding, (int, tuple)):
pad_mode = padding
pad = (0,) * 4

input = expand_dims(input, 2)
weight = expand_dims(weight, 2)

output = legacy.conv2_d(
input, weight,
weight.shape[0],
(1, weight.shape[-1]),
1,#mode=1,
pad_mode, #pad_mode=pad_mode,
pad, #pad=pad,
(1, stride) if isinstance(stride, int) else (1, *stride), #stride=tuple(stride),
(1, dilation) if isinstance(dilation, int) else (1, *dilation), #dilation=dilation,
groups, #group=groups,
"NCHW", #data_format="NCHW"
)


if bias is not None:
output = legacy.bias_add(output, bias, "NCHW")

output = squeeze(output, 2)
return output

def maximum(input, other):
return legacy.maximum(input, other)

def prod(input, axis, keepdims, dtype):
if axis is None:
axis = ()
return legacy.reduce_prod(input, axis, keepdims)

def mse_loss(input, target, reduction):
x = square(input - target)
average_flag = True
reduce_flag = True
if reduction == 'sum':
average_flag = False
if reduction == 'none':
reduce_flag = False

if reduce_flag and average_flag:
x = mean(x, tuple(range(x.ndim)), False, None)

if reduce_flag and not average_flag:
x = sum(x, tuple(range(x.ndim)), False, None)

return x

def adaptive_avg_pool2d(input, output_size):
return legacy.adaptive_avg_pool2_d(input, output_size)

def avg_pool2d(input, kernel_size, stride, padding=0, ceil_mode=False, count_include_pad=True, divisor_override=None):
    if isinstance(padding, int):
        padding = (0, 0, padding, padding, padding, padding)
    elif isinstance(padding, tuple):
        if len(padding) != 2:
            raise ValueError("For avg_pool2d, padding should be int or tuple of length 2.")
        padding = (0, 0, padding[0], padding[0], padding[1], padding[1])
    else:
        raise TypeError("For avg_pool2d, padding should be int or tuple of length 2.")

if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
if isinstance(stride, int):
stride = (stride, stride)

input = expand_dims(input, 2)
input = legacy.avg_pool3_d(input, (1, *kernel_size), (1, *stride), 'pad', padding, ceil_mode, count_include_pad, 0, 'NCDHW')
input = squeeze(input, 2)
return input

def bitwise_or_scalar(input, value):
return legacy.bitwise_or(input, value)

def floor_div(input, other):
return legacy.floor_div(input, other)

def minimum(input, other):
return legacy.minimum(input, other)

def reverse_v2(input, axis):
if isinstance(axis, int):
axis = (axis,)
return legacy.reverse_v2(input, axis)

def divmod(input, other, rounding_mode):
if rounding_mode == 'floor':
return legacy.floor_div(input, other)
elif rounding_mode == 'trunc':
if isinstance(input, numbers.Number):
input = mindspore.Tensor(input)
return legacy.truncate_div(input, other)
else:
raise ValueError(f'Invalid rounding mode: {rounding_mode}')
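
# Rounding-mode sketch (illustrative): for input=-7 and other=2, the exact quotient is -3.5;
# 'floor' rounds it to -4, while 'trunc' rounds toward zero, giving -3.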

def pow(input, exponent):
return legacy.pow(input, exponent)


def bitwise_and_scalar(input, value):
return legacy.bitwise_and(input, value)

def rand_like(input, generator, dtype):
return rand(input.shape, generator, dtype)

def bincount(input, weights=None, minlength=0):
if weights is None:
weights = mindspore.Tensor(1, dtype=mindspore.int32)
return legacy.bincount(cast(input, mindspore.int32),
mindspore.Tensor(minlength, dtype=mindspore.int32),
weights)

def lgamma(input):
return legacy.lgamma(input)

def _deconv_output_length(pad_mode, filter_size, stride_size, dilation_size, padding):
"""Calculate the width and height of output."""
length = 0
filter_size = filter_size + (filter_size - 1) * (dilation_size - 1)
if pad_mode == 'valid':
if filter_size - stride_size > 0:
length = filter_size - stride_size
elif pad_mode == 'pad':
length = - padding + filter_size - stride_size

return length
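
# Worked example (illustrative): with pad_mode='pad', filter_size=4, stride_size=2,
# dilation_size=1 and padding=2 (total over both sides), length = -2 + 4 - 2 = 0,
# so conv_transpose2d below produces H_out = H_in * stride + 0, which matches the usual
# (H_in - 1) * stride - 2 * pad + k transposed-convolution output size (output_padding=0).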


def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
pad_mode = 'pad'
pad = padding
if isinstance(padding, tuple):
pad = (0, 0, padding[0], padding[0])
elif isinstance(padding, int):
pad = (0, 0) + (padding,) * 2
if not isinstance(padding, (int, tuple)):
pad_mode = padding
pad = (0,) * 4

    if isinstance(stride, int):
        stride = (stride, stride)
    if isinstance(dilation, int):
        dilation = (dilation, dilation)

in_channel, out_channels = weight.shape[0], weight.shape[1] * groups
kernel_size = weight.shape[2:]

n, _, h, w = input.shape
h_add = _deconv_output_length(pad_mode, kernel_size[0], stride[0], dilation[0], pad[0] + pad[1])
w_add = _deconv_output_length(pad_mode, kernel_size[1], stride[1], dilation[1], pad[2] + pad[3])

out = legacy.conv2_d_transpose(
input, weight,
(n, out_channels, h * stride[0] + h_add, w * stride[1] + w_add),
out_channels,
kernel_size,
pad_mode,
pad,
None,
1,
stride,
dilation,
groups,
'NCHW'
)
if bias is not None:
out = legacy.bias_add(out, bias, 'NCHW')
return out

def expm1(x):
return legacy.expm1(x)

py_min = min
def min(input):
return legacy.reduce_min(input, (), False)

def acos(x):
return legacy.a_cos(x)

def upsample_bilinear2d(input, size=None, scale_factor=None, align_corners=False):
return legacy.resize_bilinear_v2(input, size, align_corners, not align_corners)

def unstack_view(input, dim):
return legacy.unstack(input, dim, input.shape[dim])

def triu(input, diagonal=0):
return legacy.triu(input, diagonal)

def masked_scatter(input, mask, value):
return legacy.masked_scatter(input, mask, value)

def max_pool2d(input, kernel_size, stride=1, padding=0, dilation=1, ceil_mode=False, return_indices=False):
out, indices = legacy.max_pool_with_argmax_v2(input, kernel_size, stride, padding, dilation, ceil_mode, mindspore.int64)

if return_indices:
return out, indices
return out

def baddbmm(input, batch1, batch2, alpha=1, beta=1):
return add(mul(beta, input), mul(alpha, bmm(batch1, batch2)))
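
# baddbmm computes beta * input + alpha * (batch1 @ batch2): with batch1 of shape (B, n, m)
# and batch2 of shape (B, m, p), the batched product is (B, n, p) and input broadcasts against it.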

def softplus(input, beta=1, threshold=20):
return legacy.softplus(input)

def gather_nd(input, indices):
return legacy.gather_nd(input, indices)

def unique_consecutive(input, return_inverse, return_counts, dim):
return legacy.unique_consecutive(input, return_inverse, return_counts, dim)

def meshgrid(input, lambd):
return legacy.meshgrid(input, lambd)

def addcmul(input, tensor1, tensor2, value=1.0):
return legacy.addcmul(input, tensor1, tensor2, mindspore.Tensor(value))

def addmm(input, mat1, mat2, alpha=1.0, beta=1.0):
return add(mul(beta, input), mul(alpha, bmm(mat1, mat2)))

def im2col(input, kernel_size, dilation=1, padding=0, stride=1):
out = legacy.im2_col(input, kernel_size, stride, dilation, padding)
out_shape = out.shape[:1] + (-1,) + out.shape[-1:]
out = reshape(out, out_shape)
return out

def floor(input):
return legacy.floor(input)

def upsample_nearest2d(input, output_size, scale_factors):
if output_size is None:
tuple_len = py_min(len(input.shape) - 2, len(scale_factors))
output_size = tuple([math.floor(input.shape[i + 2] * scale_factors[i])
for i in range(tuple_len)])

return legacy.resize_nearest_neighbor(input, output_size, False, False)

def upsample_bicubic2d(input, size=None, scale_factor=None, align_corners=False):
return legacy.resize_bicubic(input, size, align_corners, not align_corners)

def conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
pad_mode = 'pad'
pad = padding
if isinstance(padding, (tuple, list)):
pad = (padding[0], padding[0], padding[1], padding[1], padding[2], padding[2])
elif isinstance(padding, int):
pad = (padding,) * 6
if not isinstance(padding, (int, tuple, list)):
pad_mode = padding
pad = (0,) * 6

    if isinstance(stride, int):
        stride = (stride, stride, stride)

    out_channels = weight.shape[0]
    kernel_size = weight.shape[2:]

    output = legacy.conv3_d(input, weight,
                           out_channels,
                           kernel_size,
                           1,
                           pad_mode,
                           pad,
                           tuple(stride),
                           dilation,
                           groups,
                           "NCDHW")
    if bias is not None:
        output = legacy.bias_add(output, bias, 'NCDHW')
    return output


def inplace_relu(input):
return legacy.assign(input, legacy.re_lu(input))

def adaptive_avg_pool1d(input, output_size):
x_in_shape = input.shape
width = x_in_shape[2]
stride = width // output_size
kernel_size = width - (output_size - 1) * stride
stride = (1, width // output_size)
kernel_size = (1, kernel_size)
input = expand_dims(input, 2)
input = legacy.avg_pool(input, kernel_size, stride, "VALID", "NCHW")
input = squeeze(input, 2)
return input

def remainder_tensor_scalar(input, other):
out = sub(input, mul(floor_div(input, other), other), 1)
return out

def outer(input, other):
input = reshape(input, (-1, 1))
y = mul(input, other)
return y

def view_as_complex(input):
real_part, imag_part = chunk(input, 2, -1)
return legacy.complex(squeeze(real_part, -1), squeeze(imag_part, -1))

def cdist(x1, x2, p):
return legacy.cdist(x1, x2, float(p))

def prelu(input, weight):
return legacy.p_re_lu(input, weight)

def reciprocal(input):
return legacy.reciprocal(input)

def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, reduction, zero_infinity):
loss, log_alpha = legacy.ctc_loss_v2(log_probs, targets, input_lengths, target_lengths, blank, 'none', zero_infinity)
if reduction == 'sum':
loss = sum(loss, (), False, None)
if reduction == 'mean':
# input_type = loss.dtype
# target_length_t = target_lengths.clip(1., None)
# loss = loss.astype("float32")
loss = div(loss, target_lengths)
loss = mean(loss, (), False, None)
# loss = loss.astype(input_type)
return (loss, log_alpha)

def glu(input, dim=-1):
return legacy.glu(input, dim)

def one_hot(tensor, num_classes):
on_value = mindspore.Tensor(1, dtype=tensor.dtype)
off_value = mindspore.Tensor(0, dtype=tensor.dtype)
return legacy.one_hot(tensor, num_classes, on_value, off_value, -1)

def polar(abs, angle):
return legacy.polar(abs, angle)

def scatter_value(input, dim, index, src, reduce='none'):
if isinstance(src, numbers.Number):
src = fill_scalar(index.shape, src, dtype=input.dtype)
return legacy.tensor_scatter_elements(input, index, src, dim, reduce)

def pixel_shuffle(input, upscale_factor):
idx = input.shape
length = input.ndim
pre = idx[:-3]
c, h, w = idx[-3:]
c = c // upscale_factor ** 2
input_perm = pre + (c, upscale_factor, upscale_factor, h, w)
input = reshape(input, input_perm)
input_perm = [i for i in range(length - 2)]
input_perm = input_perm + [length, length - 2, length + 1, length - 1]
input_perm = tuple(input_perm)
input = permute(input, input_perm)
input = reshape(input, (pre + (c, upscale_factor * h, upscale_factor * w)))
return input
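
# Shape sketch for pixel_shuffle (illustrative): with upscale_factor r, an input
# (..., C * r * r, H, W) is reshaped to (..., C, r, r, H, W), the two r-blocks are
# interleaved with H and W via permute, and the result is reshaped to (..., C, r * H, r * W).
# E.g. a (1, 4, 3, 3) input with r=2 becomes (1, 1, 6, 6).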

def rms_norm(input, weight, eps=1e-5):
input_dtype = input.dtype
input = cast(input, mindspore.float32)
variance = mean(pow(input, 2), -1, True, None)
input = mul(input, rsqrt(add(variance, eps, 1)))
return mul(weight, cast(input, input_dtype))
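
# RMSNorm sketch: y = weight * x / sqrt(mean(x^2, last_dim) + eps). The statistics above
# are computed in float32, and the normalized value is cast back to the input dtype
# before it is scaled by weight.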

def count_nonzero(input, dims):
return legacy.count_non_zero(input, dims)

def index_add_ext(input, dim, index, source, alpha):
if alpha != 1:
source = mul(alpha, source)
return legacy.index_add(input, cast(index, mindspore.int32), source, dim, True, True)

def real(input):
return legacy.real(input)

def upsample_linear1d(input, output_size, scale_factor, align_corners=False):
coordinate_transformation_mode = "align_corners" if align_corners else "half_pixel"
return legacy.resize_linear1_d(input, output_size, coordinate_transformation_mode)

def imag(input):
return legacy.imag(input)

def bitwise_xor_tensor(input, other):
return legacy.bitwise_xor(input, other)

def grid_sampler_2d(input, grid, mode='bilinear', padding_mode='zeros', align_corners=False):
return legacy.grid_sampler2_d(input, grid, mode, padding_mode, align_corners)

def l1_loss(input, target, reduction='mean'):
loss = abs(sub(input, target))
if reduction == 'mean':
return mean(loss, (), False, False)
elif reduction == 'sum':
return sum(loss, (), False, False)
return loss

def leaky_relu(input, negative_slope):
select_op = maximum
if negative_slope > 1:
select_op = minimum
return select_op(mul(negative_slope, input), input)

def ceil(input):
return legacy.ceil(input)

def reduce_max(input, axis, keepdims):
return legacy.reduce_max(input, axis, keepdims)

def nan_to_num(input, nan=0.0, posinf=None, neginf=None):
return legacy.nan_to_num(input, nan, posinf, neginf)

def elu(input, alpha):
return legacy.elu(input, alpha)

def sign(input):
return legacy.sign(input)

def inplace_fill_diagonal(input, fill_value, wrap):
inplace_copy(input, legacy.fill_diagonal(input, float(fill_value), wrap))
return input

def clamp_tensor(value, min_value, max_value):
if min_value is not None:
value = legacy.maximum(value, min_value)
if max_value is not None:
value = legacy.minimum(value, max_value)
return value

def lstm(input, h, c, w, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout, proj_size):
return legacy.lstm(input, h, c, w, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout, proj_size)

def var(input, dim=None, correction=1, keepdim=False):
if dim is None:
input_mean = mean(input, (), False, None)
else:
input_mean = mean(input, dim=dim, keepdim=True, dtype=None)
    # squared difference from the mean
    squared_diff = pow(sub(input, input_mean, 1), 2)
    # variance
    if dim is None:
        variance = mean(squared_diff, (), False, None)
        n = input.numel()  # total number of elements
    else:
        variance = mean(squared_diff, dim=dim, keepdim=keepdim, dtype=None)
        n = input.size(dim)  # number of elements along the reduced dimension
    # Bessel's correction for the unbiased estimate
    if correction and n > 1:
        variance = mul(variance, (n / (n - 1)))
return variance
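
# Worked example for the correction above (illustrative): for [1., 2., 3., 4.] the
# population variance is 1.25; with correction (Bessel) it is scaled by n / (n - 1) = 4 / 3,
# giving about 1.6667, the unbiased estimate.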

def log2(input):
return div(log(input), math.log(2))

def bucketize(input, boundaries, right=False):
epsilon_ = 0. if right else 1.e-6
boundaries = [boundary + epsilon_ for boundary in boundaries]
return legacy.bucketize(input, boundaries)

def col2im(input, output_size, kernel_size, dilation=1, padding=0, stride=1):
return legacy.col2_im(input, output_size, kernel_size, dilation, padding, stride)

def randperm(n, generator, dtype):
seed, offset = generator._step(12) # pylint: disable=protected-access
return legacy.randperm_v2(n, seed, offset, dtype)

def logical_or(input_x, input_y):
return legacy.logical_or(input_x, input_y)

def hswish(input):
return legacy.h_swish(input)

def logical_and(input_x, input_y):
return legacy.logical_and(input_x, input_y)

def logsigmoid(input):
output = sigmoid(input)
ret = log(output)
return ret

def dropout2d(input_x, p):
return legacy.dropout2_d(input_x, p)

def linalg_qr(input_x, mode):
    full_matrices = mode == 'complete'
return legacy.qr(input_x, full_matrices)

def logit(input, eps=1e-5):
return legacy.logit(input, eps)

def relu6(input):
return legacy.re_lu6(input)

def logsumexp(input, dim, keepdim=False):
input_max = legacy.reduce_max(input, dim, keepdim)
input_exp = exp(sub(input, input_max))
input_sumexp = sum(input_exp, dim, keepdim, None)
input_logsumexp = log(input_sumexp)
if not keepdim:
input_max = squeeze(input_max, dim)
return add(input_logsumexp, input_max)
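
# Numerically stable identity used above:
# logsumexp(x, dim) = max(x, dim) + log(sum(exp(x - max(x, dim)), dim)),
# which keeps exp() from overflowing for large inputs.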

def bernoulli(input, generator):
    seed, offset = generator._step(12)  # pylint: disable=protected-access
    return legacy.bernoulli(input, seed, offset)

def arange(start, end, step, dtype):
return legacy.range(start, end, step, 100000)

def inplace_fill_scalar(input, value):
input.assign_value(fill_scalar(input.shape, value, input.dtype))
return input

def inplace_normal(input, mean, std, generator):
out = legacy.standard_normal(input.shape, 0, 0)
value = add(mul(out, std), mean)
return input.assign_value(value)

def inplace_uniform(input, from_, to_, generator_):
if input.dtype.is_floating_point:
uniform_real = legacy.uniform_real(tuple(input.shape), 0, 0)
value = add(mul(uniform_real, sub(to_, from_)), from_)
else:
value = legacy.uniform_int(input.shape,
mindspore.tensor(from_, dtype=mindspore.int32),
mindspore.tensor(to_, dtype=mindspore.int32), 0, 0)
return input.assign_value(value)

+ 379
- 0
mindnlp/core/_apis/meta.py View File

@@ -0,0 +1,379 @@
try:
from mindspore._c_expression import TensorPy as Tensor_
except:
from mindspore._c_expression import Tensor as Tensor_

import math
import numpy as np
from mindnlp import core

__all__ = []
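
# Note: these are shape-and-dtype-only "meta" kernels. Each one allocates an uninitialized
# Tensor_ with the inferred output shape and dtype and never computes real values, so code
# can be executed for shape/dtype inference without touching data. A rough sketch of the
# pattern, assuming a (2, 3) float32 meta tensor named x:
#     y = reshape(x, (3, 2))   # y.shape is (3, 2); no data is moved or initialized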

def arange(start, end, step, dtype):
out = Tensor_(shape=(math.ceil((end - start) / step), ), dtype=dtype)
return core.Tensor(out)

__all__.append('arange')

def broadcast_to(input, shape):
out_shape = ()
input_shape = input.shape
    if len(input_shape) != len(shape):
        input_shape = (1,) * (len(shape) - len(input_shape)) + input_shape
for idx, s in enumerate(shape):
if s == -1:
s = input_shape[idx]
out_shape += (s,)

out = Tensor_(shape=out_shape, dtype=input.dtype)
return core.Tensor(out)

__all__.append('broadcast_to')

def zeros(size, dtype):
out = Tensor_(shape=size, dtype=dtype)
return core.Tensor(out)

__all__.append('zeros')

def ones(size, dtype):
out = Tensor_(shape=size, dtype=dtype)
return core.Tensor(out)

__all__.append('ones')

def inplace_uniform(input, *args):
return input

__all__.append('inplace_uniform')

def inplace_fill_scalar(input, value):
return input

__all__.append('inplace_fill_scalar')

def inplace_normal(input, *args):
return input

__all__.append('inplace_normal')

def getitem(input, slice):
out = input.asnumpy()[slice]
out = Tensor_(shape=out.shape, dtype=input.dtype)
return core.Tensor(out)

__all__.append('getitem')

def sub(input, other, alpha):
if isinstance(input, core.Tensor):
return input
return other

__all__.append('sub')

def pad_v3(input, pad, mode, value):
out = np.pad(input.asnumpy(), pad, mode, constant_values=value)
out = Tensor_(shape=out.shape, dtype=input.dtype)
return core.Tensor(out)

__all__.append('pad_v3')

def abs(input):
return input

__all__.append('abs')

def cast(input, dtype):
out = Tensor_(shape=input.shape, dtype=dtype)
return core.Tensor(out)

__all__.append('cast')

def index_select(input, dim, index):
out = np.take(input.asnumpy(), index.asnumpy(), dim)
out = Tensor_(shape=out.shape, dtype=input.dtype)
return core.Tensor(out)

__all__.append('index_select')

def identity(input):
out = Tensor_(shape=input.shape, dtype=input.dtype)
return core.Tensor(out)

__all__.append('identity')

def contiguous(input):
return input

__all__.append('contiguous')

def inplace_copy(input, other):
return input

__all__.append('inplace_copy')

def div(input, other):
if isinstance(input, core.Tensor):
shape = input.shape
dtype = input.dtype
else:
shape = other.shape
dtype = other.dtype
out = Tensor_(shape=shape, dtype=dtype)
return core.Tensor(out)

__all__.append('div')

def pow_scalar_tensor(input, other):
out = Tensor_(shape=other.shape, dtype=other.dtype)
return core.Tensor(out)

__all__.append('pow_scalar_tensor')

def concat(tensors, dim):
shape = list(tensors[0].shape)
shape[dim] = sum([t.shape[dim] for t in tensors])
out = Tensor_(shape=tuple(shape), dtype=tensors[0].dtype)
return core.Tensor(out)

__all__.append('concat')

def tril(input, k):
return input

__all__.append('tril')

def reshape(input, shape):
out = Tensor_(shape=tuple(shape), dtype=input.dtype)
return core.Tensor(out)

__all__.append('reshape')

def linalg_vector_norm(input, p, dim, keepdim, dtype):
    input_shape = list(input.shape)
    if dim is None:
        dim = tuple(range(input.ndim))
    elif isinstance(dim, int):
        dim = (dim,)
    for d in dim:
input_shape[d] = 1 if keepdim else 0
new_shape = []
for s in input_shape:
if s != 0:
new_shape.append(s)
if dtype is None:
dtype = input.dtype
out = Tensor_(shape=tuple(new_shape), dtype=dtype)
return core.Tensor(out)

__all__.append('linalg_vector_norm')

def erfinv(input):
return input
__all__.append('erfinv')


def stop_gradient(input):
out = Tensor_(shape=input.shape, dtype=input.dtype)
return core.Tensor(out)

__all__.append('stop_gradient')

def log(input):
return input
__all__.append('log')

def mul(input, other):
out = Tensor_(shape=input.shape, dtype=input.dtype)
return core.Tensor(out)
__all__.append('mul')

def randn(size, generator, dtype):
out = Tensor_(shape=size, dtype=dtype)
return core.Tensor(out)

__all__.append('randn')

def zeros_like(input, *args, **kwargs):
out = Tensor_(shape=input.shape, dtype=input.dtype)
return core.Tensor(out)
__all__.append('zeros_like')

def inplace_add(input, other, alpha):
return input
__all__.append('inplace_add')

def clamp_scalar(input, *args):
return input
__all__.append('clamp_scalar')

def expand_dims(input, dim):
input_shape = list(input.shape)
input_shape.insert(dim, 1)

out = Tensor_(shape=tuple(input_shape), dtype=input.dtype)
    return core.Tensor(out)

__all__.append('expand_dims')

def floor_div(input, other):
return input
__all__.append('floor_div')

def sin(input):
return input

__all__.append('sin')

def cos(input):
return input

__all__.append('cos')

def triu(input, diagonal):
return input

__all__.append('triu')

def fill_scalar(size, fill_value, dtype):
if dtype is None:
dtype = core.get_default_dtype()
out = Tensor_(shape=size, dtype=dtype)
return core.Tensor(out)

__all__.append('fill_scalar')

def sqrt(input):
return input

__all__.append('sqrt')

def normal_float_float(mean, std, size, generator):
out = Tensor_(shape=size, dtype=core.float32)
return core.Tensor(out)


__all__.append('normal_float_float')

def stack(tensors, dim):
x_shape = list(tensors[0].shape)
x_shape.insert(dim, len(tensors))
out = Tensor_(shape=tuple(x_shape), dtype=tensors[0].dtype)
return core.Tensor(out)

__all__.append('stack')

def argmax_with_value(input, dim, keepdim):
out_shape = list(input.shape)
if keepdim:
out_shape[dim] = 1
else:
out_shape.pop(dim)

indices = Tensor_(shape=out_shape, dtype=core.int64)
values = Tensor_(shape=out_shape, dtype=input.dtype)

return core.Tensor(indices), core.Tensor(values)

__all__.append('argmax_with_value')

def tile(input, dims):
input_shape = input.shape
out_shape = [input_shape[i] * dims[i] for i in range(input.ndim)]
out = Tensor_(shape=tuple(out_shape), dtype=input.dtype)
return core.Tensor(out)

__all__.append('tile')

def flatten(input, start_dim, end_dim):
input_shape = list(input.shape)
if start_dim < 0:
start_dim = start_dim + input.ndim
if end_dim < 0:
end_dim = end_dim + input.ndim

    flatten_shape = input_shape[:start_dim] + [math.prod(input_shape[start_dim:end_dim + 1])] + input_shape[end_dim + 1:]
out = Tensor_(shape=tuple(flatten_shape), dtype=input.dtype)
return core.Tensor(out)

__all__.append('flatten')
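
# Shape sketch for the meta flatten above (illustrative): a (2, 3, 4) input with
# start_dim=1, end_dim=2 collapses dims 1..2 into 3 * 4 = 12, giving shape (2, 12).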

def cumsum(input, dim, dtype):
return input

__all__.append('cumsum')

def squeeze(input, dim):
    input_shape = list(input.shape)
    if isinstance(dim, int):
        dim = (dim,)
    new_shape = ()
    for idx, s in enumerate(input_shape):
        # drop a dimension only when it has size 1 and is either unspecified (dim is None) or listed in dim
        if s == 1 and (dim is None or idx in dim):
            continue
        new_shape += (s,)

out = Tensor_(shape=tuple(new_shape), dtype=input.dtype)
return core.Tensor(out)

__all__.append('squeeze')

def exp(input):
return input

__all__.append('exp')

def rand(size, generator, dtype):
out = Tensor_(shape=size, dtype=dtype)
return core.Tensor(out)

__all__.append('rand')

def add(input, other, alpha):
return input

__all__.append('add')

def neg(input):
return input

__all__.append('neg')

def expm1(input):
return input

__all__.append('expm1')

def reverse_v2(input, dims):
return input

__all__.append('reverse_v2')

def rsqrt(input):
return input

__all__.append('rsqrt')

def bitwise_xor_tensor(input, other):
return input

__all__.append('bitwise_xor_tensor')

def divmod(input, other, rounding_mode):
if isinstance(input, core.Tensor):
return input
return other

__all__.append('divmod')

def greater_equal(input, other):
if isinstance(input, core.Tensor):
return input
return other

__all__.append('greater_equal')

def inplace_zero(input):
    return input

__all__.append('inplace_zero')

def clone(input):
    return input

__all__.append('clone')

+ 1596
- 0
mindnlp/core/_apis/npu.py View File

@@ -0,0 +1,1596 @@
import mindspore
from mindspore._c_expression import _empty_instance
from ..configs import use_pyboost, ON_A1, ON_ORANGE_PI
from .._op_prim.ascend import legacy, pyboost
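
# Dispatch pattern used throughout this backend: each wrapper first tries the pyboost
# kernel when use_pyboost() is enabled and otherwise falls back to the corresponding
# legacy primitive, roughly (some_op is illustrative):
#
#     def some_op(x):
#         if use_pyboost():
#             return pyboost.some_op(x)   # eager kernel path
#         return legacy.some_op(x)        # legacy primitive fallback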


def empty(*args, **kwargs):
return _empty_instance(*args, **kwargs, device='Ascend')

def reshape(x, shape):
"""
Reshape the input tensor to the given shape.

Args:
x (Tensor): The input tensor.
shape (tuple): The target shape.

Returns:
Tensor: The reshaped tensor.
"""
if use_pyboost():
return pyboost.reshape_op(x, shape)
else:
return legacy.reshape(x, shape)

def contiguous(x):
"""
Returns a contiguous tensor containing the same data as the input tensor.

Args:
x (Tensor): The input tensor.

Returns:
Tensor: The contiguous tensor.
"""
    if use_pyboost():
return pyboost.contiguous_op(x)
else:
return x

def select_ext_view(input, dim, index):
"""
Selects a slice from the input tensor along the specified dimension.

Args:
input (Tensor): The input tensor.
dim (int): The dimension along which to select the slice.
index (int): The index of the slice to select.

Returns:
Tensor: The selected slice.
"""
if use_pyboost():
return pyboost.select_ext_view_op(input, dim, index)
else:
return legacy.select_view(input, index, dim)

def inplace_copy(self, value):
"""
Copies the data from the given tensor to the current tensor.

Args:
value (Tensor): The tensor from which to copy the data.
"""
    if use_pyboost():
return pyboost.inplace_copy_op(self, value)
else:
self.assign_value(value)
return self

def slice(input, dim, start, end, step):
"""
Slices the input tensor along the specified dimension.

Args:
input (Tensor): The input tensor.
dim (int): The dimension along which to slice.
start (int): The starting index of the slice.
end (int): The ending index of the slice.
step (int): The step size of the slice.

Returns:
Tensor: The sliced tensor.
"""
if use_pyboost():
return pyboost.slice_ext_op(input, dim, start, end, step)
else:
return legacy.slice(input, dim, start, end, step)

def embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq):
"""
Applies embedding to the input tensor.

Args:
input (Tensor): The input tensor.
weight (Tensor): The embedding weight tensor.
padding_idx (int): The index of the padding element.
max_norm (float): The maximum norm of the embedding vectors.
norm_type (float): The p-norm to use for normalization.
scale_grad_by_freq (bool): Whether to scale the gradient by frequency.

Returns:
Tensor: The embedded tensor.
"""
return pyboost.embedding_op(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq)

def add(input, other, alpha): # pylint: disable=unused-argument
"""
Adds two tensors element-wise.

Args:
input (Tensor): The input tensor.
other (Tensor): The other tensor.
alpha (float): The scaling factor for the other tensor.

Returns:
Tensor: The result of the addition.
"""
if use_pyboost():
return pyboost.add_ext_op(input, other, alpha)
if alpha == 1.0:
return legacy.add(input, other)
return legacy.add(input, legacy.mul(other, alpha))

def layer_norm(input, normalized_shape, weight, bias, eps=1e-5):
"""
Applies layer normalization to the input tensor.

Args:
input (Tensor): The input tensor.
normalized_shape (tuple): The shape of the input tensor to be normalized.
weight (Tensor): The weight tensor.
bias (Tensor): The bias tensor.
eps (float): The epsilon value for numerical stability.

Returns:
Tensor: The normalized tensor.
"""
if use_pyboost():
return pyboost.layer_norm_ext_op(input, normalized_shape, weight, bias, eps)
if weight is not None:
begin_axis = input.ndim - weight.ndim
else:
begin_axis = -1
return legacy.layer_norm(input, weight, bias, begin_axis, begin_axis, eps)

def expand_dims(input, axis):
"""
Adds an extra dimension to the input tensor.

Args:
input (Tensor): The input tensor.
axis (int): The axis along which to add the dimension.

Returns:
Tensor: The expanded tensor.
"""
if use_pyboost():
return pyboost.expand_dims_op(input, axis)
return legacy.expand_dims(input, axis)

def cast(input, dtype):
"""
Casts the input tensor to the specified data type.

Args:
input (Tensor): The input tensor.
dtype (str): The target data type.

Returns:
Tensor: The casted tensor.
"""
return legacy.cast(input, dtype)

def sub(input, other, alpha):
"""
Subtracts the other tensor from the input tensor.

Args:
input (Tensor): The input tensor.
other (Tensor): The tensor to subtract.
alpha (float): The scale factor for the other tensor.

Returns:
Tensor: The result of the subtraction.
"""
if use_pyboost():
return pyboost.sub_ext_op(input, other, alpha)
return legacy.sub(input, legacy.mul(other, alpha))

def mul(input, other):
"""
Multiplies the input tensor with the other tensor.

Args:
input (Tensor): The input tensor.
other (Tensor): The tensor to multiply.

Returns:
Tensor: The result of the multiplication.
"""
if use_pyboost():
return pyboost.mul_op(input, other)
return legacy.mul(input, other)

def dense(input, weight, bias=None):
"""
Performs a dense (fully connected) operation.

Args:
input (Tensor): The input tensor.
weight (Tensor): The weight tensor.
bias (Tensor, optional): The bias tensor. Defaults to None.

Returns:
Tensor: The result of the dense operation.
"""
if use_pyboost():
return pyboost.dense_op(input, weight, bias)
return legacy.dense(input, weight, bias)

def transpose_view(input, dim0, dim1):
"""
Transposes the input tensor along the specified dimensions.

Args:
input (Tensor): The input tensor.
dim0 (int): The first dimension to transpose.
dim1 (int): The second dimension to transpose.

Returns:
Tensor: The transposed tensor.
"""
if use_pyboost():
return pyboost.transpose_ext_view_op(input, dim0, dim1)
ranks = list(range(input.ndim))
rank0 = ranks[dim0]
rank1 = ranks[dim1]
ranks[dim0] = rank1
ranks[dim1] = rank0
return legacy.transpose(input, ranks)

def matmul(input, other):
"""
Performs a matrix multiplication of the input tensor with another tensor.

Args:
input (Tensor): The input tensor.
other (Tensor): The other tensor.

Returns:
Tensor: The result of the matrix multiplication.
"""
if use_pyboost():
return pyboost.matmul_ext_op(input, other)
return legacy.mat_mul(input, other)

def div(input, other):
"""
Divides the input tensor by another tensor.

Args:
input (Tensor): The input tensor.
other (Tensor): The other tensor.

Returns:
Tensor: The result of the division.
"""
if use_pyboost():
return pyboost.div_op(input, other)
return legacy.div(input, other)

def divmod(input, other, rounding_mode):
"""
Divides the input tensor by another tensor and returns both the quotient and the remainder.

Args:
input (Tensor): The input tensor.
other (Tensor): The other tensor.
rounding_mode (str): The rounding mode to use.

Returns:
Tuple[Tensor, Tensor]: The quotient and the remainder.
"""
if use_pyboost():
return pyboost.divmod_op(input, other, rounding_mode)
if rounding_mode == 'floor':
return legacy.floor_div(input, other)
elif rounding_mode == 'trunc':
return legacy.truncate_div(input, other)
else:
raise ValueError(f'Invalid rounding mode: {rounding_mode}')

def softmax(input, axis=-1):
"""
Computes the softmax of the input tensor along the specified axis.

Args:
input (Tensor): The input tensor.
axis (int): The axis along which to compute the softmax.

Returns:
Tensor: The softmax of the input tensor.
"""
if use_pyboost():
return pyboost.softmax_impl(input, axis)
return legacy.softmax(input, axis)

def permute(input, axes=None):
"""
Transposes the dimensions of the input tensor according to the specified axes.

Args:
input (Tensor): The input tensor.
axes (Tuple[int]): The axes to transpose.

Returns:
Tensor: The transposed tensor.
"""
if use_pyboost():
return pyboost.transpose_view_op(input, axes)
return legacy.transpose(input, axes)

def gelu(input, approximate):
"""
Computes the Gaussian Error Linear Unit (GELU) activation function.

Args:
input (Tensor): The input tensor.

Returns:
Tensor: The GELU activation of the input tensor.
"""
if use_pyboost():
return pyboost.gelu_ext_op(input, approximate)
return legacy.ge_lu(input)

def tanh(input):
"""
Computes the hyperbolic tangent of the input tensor.

Args:
input (Tensor): The input tensor.

Returns:
Tensor: The hyperbolic tangent of the input tensor.
"""
if use_pyboost():
return pyboost.tanh_op(input)
return legacy.tanh(input)

def broadcast_to(input, shape):
"""
Broadcasts the input tensor to the specified shape.

Args:
input (Tensor): The input tensor.
shape (Tuple[int]): The shape to broadcast to.

Returns:
Tensor: The broadcasted tensor.
"""
if use_pyboost():
return pyboost.broadcast_to_view_op(input, shape)
return legacy.broadcast_to(input, shape)

def split_tensor(tensor, split_size_or_sections, dim):
"""
Splits a tensor into multiple sub-tensors.

Args:
tensor (Tensor): The input tensor.
split_size_or_sections (Union[int, Tuple[int]]): The size or number of sections to split the tensor into.
dim (int): The dimension along which to split the tensor.

Returns:
List[Tensor]: The list of split sub-tensors.
"""
if use_pyboost():
return pyboost.split_tensor_op(tensor, split_size_or_sections, dim)
return legacy.split(tensor, split_size_or_sections, dim)

def squeeze(input, dim):
"""
Removes dimensions of size 1 from the shape of the input tensor.

Args:
input (Tensor): The input tensor.
dim (Union[int, Tuple[int]]): The dimensions to squeeze.

Returns:
Tensor: The squeezed tensor.
"""
if use_pyboost():
return pyboost.squeeze_impl(input, dim)
return legacy.squeeze(input, dim)

def zeros(shape, dtype):
"""
Returns a tensor filled with zeros.

Args:
shape (Union[int, Tuple[int]]): The shape of the tensor.
dtype (str): The data type of the tensor.

Returns:
Tensor: The tensor filled with zeros.
"""
return legacy.zeros(shape, dtype)

def equal(input, other):
"""
Returns a tensor with boolean values, indicating element-wise equality.

Args:
input (Tensor): The input tensor.
other (Tensor): The tensor to compare with.

Returns:
Tensor: The tensor with boolean values.
"""
if use_pyboost():
return pyboost.equal_ext_op(input, other)
return legacy.equal(input, other).all()

def eq(input, other):
"""
Returns a tensor with boolean values, indicating element-wise equality.

Args:
input (Tensor): The input tensor.
other (Tensor): The tensor to compare with.

Returns:
Tensor: The tensor with boolean values.
"""
if use_pyboost():
return pyboost.equal_op(input, other)
return legacy.equal(input, other)


def sum(input, dim, keepdim, dtype):
"""
Returns the sum of elements over a specified dimension.

Args:
input (Tensor): The input tensor.
dim (Union[int, Tuple[int]]): The dimensions to sum over.
keepdim (bool): Whether to keep the dimensions of size one.

Returns:
Tensor: The tensor with summed elements.
"""
if use_pyboost():
return pyboost.sum_ext_op(input, dim, keepdim, dtype)
    if dtype is not None:
        input = input.astype(dtype)
    return legacy.reduce_sum(input, dim, keepdim)

def dropout(input, p, seed, offset):
"""
Returns a tensor with dropout applied element-wise.

Args:
input (Tensor): The input tensor.
p (float): The dropout probability.
seed (int): The random seed.

Returns:
Tensor: The tensor with dropout applied.
"""
if use_pyboost():
return pyboost.dropout_ext_op(input, p, seed, offset)
return legacy.dropout(input, 1-p, 0, 0)

def clone(input):
"""
Returns a copy of the input tensor.

Args:
input (Tensor): The input tensor.

Returns:
Tensor: The copied tensor.
"""
if use_pyboost():
return pyboost.clone_op(input)
return legacy.identity(input)

def inplace_normal(input, mean, std, generator):
"""
Returns a tensor with normal distribution applied element-wise.

Args:
input (Tensor): The input tensor.
mean (float): The mean of the normal distribution.
std (float): The standard deviation of the normal distribution.
seed (int): The random seed.

Returns:
Tensor: The tensor with normal distribution applied.
"""
seed, offset = generator._step(12)
if use_pyboost():
return pyboost.inplace_normal_op(input, mean, std, seed, offset)
return legacy.normal(input, mean, std, 0, 0)

def reduce_all(input, dim, keepdim):
"""
Returns the sum of all elements in the tensor.

Args:
input (Tensor): The input tensor.
dim (int): The dimension to reduce.
keepdim (bool): Whether to keep the reduced dimension.

Returns:
Tensor: The tensor with the sum of all elements.
"""
if use_pyboost():
return pyboost.reduce_all_impl(input, dim, keepdim)
return legacy.reduce_all(input, dim, keepdim)

def masked_fill(input, mask, value):
"""
Fills elements of the input tensor with the specified value where the mask is True.

Args:
input (Tensor): The input tensor.
mask (Tensor): The mask tensor.
value (float): The value to fill.

Returns:
Tensor: The tensor with elements filled.
"""
if use_pyboost():
return pyboost.masked_fill_op(input, mask, value)
return legacy.masked_fill(input, mask, value)

def isin(input, test_elements, assume_unique=False, invert=False):
"""
Checks if elements of input tensor are in test_elements.

Args:
input (Tensor): The input tensor.
test_elements (Tensor): The tensor to test against.
assume_unique (bool): If True, assumes that test_elements contains unique elements.
invert (bool): If True, inverts the result.

Returns:
Tensor: The tensor with boolean values indicating whether elements are in test_elements.
"""
if use_pyboost():
return pyboost.isin(input, test_elements, assume_unique, invert)
return legacy.isin(input, test_elements, assume_unique, invert)

def pad_v3(input, new_pad, mode, value=None, contiguous=True):
if input.dtype == mindspore.bool_:
input = cast(input, mindspore.int8)
out = legacy.pad_v3(input, new_pad, int(value), mode, contiguous)
return cast(out, mindspore.bool_)
return legacy.pad_v3(input, new_pad, value, mode, contiguous)

def log_softmax(input, axis=-1, dtype=None):
"""
Computes the log softmax of the input tensor along the specified axis.

Args:
input (Tensor): The input tensor.
axis (int): The axis along which to compute the log softmax.
dtype (dtype): The data type of the output tensor.

Returns:
Tensor: The tensor with log softmax values.
"""
if use_pyboost():
return pyboost.log_softmax_impl(input, axis)
return legacy.log_softmax(input, axis)

def not_equal(input, other):
"""
Computes the element-wise comparison of two tensors for inequality.

Args:
input (Tensor): The input tensor.
other (Tensor): The other tensor.

Returns:
Tensor: The tensor with boolean values indicating whether elements are not equal.
"""
if use_pyboost():
return pyboost.not_equal_op(input, other)
return legacy.not_equal(input, other)

def chunk(input, chunks, dim=0):
"""
Splits a tensor into a specified number of chunks.

Args:
input (Tensor): The input tensor.
chunks (int): The number of chunks to split the tensor into.
dim (int): The dimension along which to split the tensor.

Returns:
Tensor: The tensor split into chunks.
"""
if use_pyboost():
return pyboost.chunk_op(input, chunks, dim)
return legacy.split(input, dim, chunks)

def ones(shape, dtype):
"""
Returns a tensor filled with ones.

Args:
shape (tuple): The shape of the tensor.
dtype (dtype): The data type of the tensor.

Returns:
Tensor: The tensor filled with ones.
"""
return legacy.ones(shape, dtype)

def greater(input, other):
"""
Returns a tensor with boolean values indicating whether elements in the input tensor are greater than those in the other tensor.

Args:
input (Tensor): The input tensor.
other (Tensor): The other tensor.

Returns:
Tensor: The tensor with boolean values indicating whether elements are greater.
"""
if use_pyboost():
return pyboost.greater_op(input, other)
return legacy.greater(input, other)

def randint(low, high, shape, generator, dtype):
"""
Returns a tensor filled with random integers from low (inclusive) to high (exclusive).

Args:
low (int): The lower bound of the range.
high (int): The upper bound of the range.
shape (tuple): The shape of the tensor.
dtype (dtype): The data type of the tensor.

    Returns:
        Tensor: The tensor filled with random integers.
"""
seed, offset = generator._step(12)

if use_pyboost():
return pyboost.randint_op(low, high, shape, seed, offset, dtype)
value = legacy.uniform_int(shape,
mindspore.tensor(low, dtype=mindspore.int32),
mindspore.tensor(high, dtype=mindspore.int32), 0, 0)
    return value

def nllloss(input, target, weight, reduction, ingore_index):
if use_pyboost():
return pyboost.nllloss_impl(input, target, weight, reduction, ingore_index)
return legacy.nll_loss(input, target, weight, reduction, ingore_index)

def clamp_scalar(value, min_value, max_value):
if use_pyboost():
return pyboost.clamp_scalar_op(value, min_value, max_value)
if min_value is not None:
value = legacy.maximum(value, min_value)
if max_value is not None:
value = legacy.minimum(value, max_value)
return value

def cumsum(self, dim, dtype):
if use_pyboost():
return pyboost.cumsum_ext_op(self, dim, dtype)
return legacy.cum_sum(self, dim, False, False)

def reduce_any(input, axis, keepdims):
if use_pyboost():
return pyboost.reduce_any_impl(input, axis, keepdims)
return legacy.reduce_any(input, axis, keepdims)

def concat(tensors, axis):
if use_pyboost():
return pyboost.concat_impl(tensors, axis)
return legacy.concat(tensors, axis)

def gather_d(input, dim, index):
if use_pyboost():
return pyboost.gather_d_op(input, dim, index)
return legacy.gather_d(input, dim, index)

def greater_equal(input, other):
if use_pyboost():
return pyboost.greater_equal_op(input, other)
return legacy.greater_equal(input, other)

def less(input, other):
if use_pyboost():
return pyboost.less_op(input, other)
return legacy.less(input, other)

def less_equal(input, other):
if use_pyboost():
return pyboost.less_equal_op(input, other)
return legacy.less_equal(input, other)

def select(condition, input, other):
if use_pyboost():
return pyboost.select_op(condition, input, other)
return legacy.select(condition, input, other)

def mean(input, axis, keepdims, dtype):
if use_pyboost():
return pyboost.mean_ext_op(input, axis, keepdims, dtype)
return legacy.reduce_mean(input, axis, keepdims)

def index(input, index):
if use_pyboost():
return pyboost.index_op(input, index)
return legacy.index(input, index)

def scatter(input, dim, index, src):
if use_pyboost():
return pyboost.scatter_op(input, dim, index, src)
return legacy.tensor_scatter_elements(input, index, src, dim)

def tril(input, diagonal=0):
if use_pyboost():
return pyboost.tril_ext_op(input, diagonal)
return legacy.tril(input, diagonal)

def triu(input, diagonal=0):
if use_pyboost():
return pyboost.triu_impl(input, diagonal)
return legacy.triu(input, diagonal)

def inplace_index_put(input, indices, values, accumulate):
if use_pyboost():
return pyboost.inplace_index_put_op(input, indices, values, accumulate)
return legacy.tensor_scatter_elements(input, indices, values, accumulate)

def zeros_like(input, dtype):
if use_pyboost():
return pyboost.zeros_like_ext_op(input, dtype)
return legacy.zeros_like(input)

def ones_like(input, dtype):
if use_pyboost():
return pyboost.ones_like_ext_op(input, dtype)
return legacy.ones_like(input)

def tile(input, multiples):
return legacy.tile(input, multiples)

def arange(start, end, step, dtype):
if use_pyboost():
return pyboost.arange_op(start, end, step, dtype)
return legacy.range(start, end, step, 100000)

def fill_scalar(input, value, dtype):
if use_pyboost():
return pyboost.fill_scalar_op(input, value, dtype)
return legacy.fill(input, value)

def stop_gradient(input):
return legacy.stop_gradient(input)

def isinf(input):
if use_pyboost():
return pyboost.isinf_op(input)
return legacy.is_inf(input)

def sort(input, dim, descending, stable):
if use_pyboost():
return pyboost.sort_ext_op(input, dim, descending, stable)
return legacy.sort(input, dim, descending)

def prod(input, axis, keepdims, dtype):
if use_pyboost():
return pyboost.prod_ext_op(input, axis, keepdims, dtype)
return legacy.reduce_prod(input, axis, keepdims)

def isclose(input, other, rtol, atol, equal_nan):
if use_pyboost():
return pyboost.isclose_impl(input, other, rtol, atol, equal_nan)
return legacy.is_close(input, other, rtol, atol, equal_nan)

def argmax(input, axis, keepdims):
if use_pyboost():
return pyboost.argmax_ext_op(input, axis, keepdims)
return legacy.argmax(input, axis, keepdims)

def argmin(input, axis, keepdims):
if use_pyboost():
return pyboost.argmin_ext_op(input, axis, keepdims)
return legacy.argmin(input, axis, keepdims)


def bmm(input, other):
if use_pyboost():
return pyboost.bmm_ext_op(input, other)
return legacy.batch_mat_mul(input, other)

def topk(input, k, dim, largest, sorted):
if use_pyboost():
return pyboost.topk_ext_op(input, k, dim, largest, sorted)

if not largest:
input = -input
if dim is None or dim == input.ndim - 1:
if not largest:
res = legacy.top_k(input, k, sorted)
values, indices = -res[0], res[1]
return values, indices
return legacy.top_k(input, k, sorted)
input = transpose_view(input, dim, input.ndim - 1)
output = legacy.top_k(input, k, sorted)
values = transpose_view(output[0], dim, input.ndim - 1)
indices = transpose_view(output[1], dim, input.ndim - 1)
if not largest:
res = (-values, indices)
else:
res = (values, indices)
return res
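
# Fallback notes (illustrative): legacy.top_k only works on the last axis and only returns
# the largest values, so the code above (a) negates the input to obtain the smallest-k and
# negates the values back, and (b) transposes `dim` to the last axis and transposes the
# outputs back afterwards.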

def logical_not(input):
if use_pyboost():
return pyboost.logical_not_op(input)
return legacy.logical_not(input)

def rand(size, generator, dtype):
seed, offset = generator._step(12)
if use_pyboost():
return pyboost.rand_ext_op(size, seed, offset, dtype)
return legacy.uniform_real(size, 0, 0)

def inplace_uniform(input, from_, to, generator):
seed, offset = generator._step(12)
if use_pyboost():
return pyboost.uniform_ext_op(input, from_, to, seed, offset)

if input.dtype.is_floating_point:
out = legacy.uniform_real(input.shape, 0, 0)
value = legacy.add(legacy.mul(out, (legacy.sub(to, from_))), from_)
else:
value = legacy.uniform_int(input.shape,
mindspore.tensor(from_, dtype=mindspore.int32),
mindspore.tensor(to, dtype=mindspore.int32), 0, 0)
input.assign_value(legacy.cast(value, input.dtype))

def bitwise_or_tensor(input, other):
if use_pyboost():
return pyboost.bitwise_or_tensor_op(input, other)
return legacy.bitwise_or(input, other)

def bitwise_and_tensor(input, other):
if use_pyboost():
return pyboost.bitwise_and_tensor_op(input, other)
return legacy.bitwise_and(input, other)

def bitwise_or_scalar(input, other):
if use_pyboost():
return pyboost.bitwise_or_scalar_op(input, other)
return legacy.bitwise_or(input, other)


def max(input):
if use_pyboost():
return pyboost.max_op(input)
return legacy.reduce_max(input, (), False)

def stack(tensors, axis=0):
if use_pyboost():
return pyboost.stack_ext_impl(tensors, axis)
return legacy.stack(tensors, axis)

def narrow(input, dim, start, length):
if use_pyboost():
return pyboost.narrow_op(input, dim, start, length)
begin = [0] * input.ndim
size = [i for i in input.shape]
begin[dim] = start
size[dim] = length
return legacy.slice(input, begin, size)

def std(input, dim, correction, keepdim):
if use_pyboost():
return pyboost.std_op(input, dim, correction, keepdim)
    return legacy.reduce_std(input, dim, bool(correction), keepdim)[0]


def log(input):
if use_pyboost():
return pyboost.log_op(input)
return legacy.log(input)

def gather(input_params, input_indices, axis, batch_dim):
return legacy.gather(input_params, input_indices, axis, batch_dim)

def non_zero_ext(input):
if use_pyboost():
return pyboost.non_zero_ext_op(input)
return legacy.non_zero(input)

def binary_cross_entropy_with_logits(input, target, weight, posWeight, reduction):
if use_pyboost():
return pyboost.binary_cross_entropy_with_logits_impl(input, target, weight, posWeight, reduction)
return legacy.bce_with_logits_loss(input, target, weight, posWeight, reduction)

def rand_like(input, generator, dtype):
seed, offset = generator._step(123)
if use_pyboost():
return pyboost.rand_like_ext_op(input, seed, offset, dtype)
    return rand(input.shape, generator, dtype)

def floor_div(input, other):
if use_pyboost():
return pyboost.floor_div_op(input, other)
return legacy.floor_div(input, other)

def inplace_fill_scalar(input, value):
if use_pyboost():
return pyboost.inplace_fill_scalar_op(input, value)
input.assign_value(fill_scalar(input.shape, value, input.dtype))
return input

def linalg_vector_norm(x, ord=2, dim=None, keepdim=False, dtype=None):
    if use_pyboost():
        return pyboost.linalg_vector_norm_op(x, ord, dim, keepdim, dtype)
    return legacy.lp_norm(x, dim, int(ord), keepdim, 1e-12)

def non_zero(input):
if use_pyboost():
return pyboost.non_zero_op(input)
return legacy.non_zero(input)

def fmod_scalar(input, other):
if use_pyboost():
return pyboost.fmod_scalar_op(input, other)
return legacy.floor_mod(input, other)

def inplace_zero(input):
if use_pyboost():
return pyboost.inplace_zero_op(input)
input.assign_value(zeros(input.shape, input.dtype))
return input

def mse_loss(input, target, reduction):
if use_pyboost():
return pyboost.mse_loss_ext_op(input, target, reduction)

def abs(input):
if use_pyboost():
return pyboost.abs_op(input)
return legacy.abs(input)

def bincount(input, weights=None, minlength=0):
if use_pyboost():
return pyboost.bincount_ext_op(input, weights, minlength)
return legacy.bincount(input, minlength, weights)

def bitwise_and_scalar(input, other):
if use_pyboost():
return pyboost.bitwise_and_scalar_op(input, other)
return legacy.bitwise_and(input, other)

def argmax_with_value(input, axis, keep_dims):
if use_pyboost():
return pyboost.argmax_with_value_impl(input, axis, keep_dims)
return legacy.argmax(input, axis, keep_dims)

def index_select(input, dim, index):
if use_pyboost():
return pyboost.index_select_op(input, dim, index)
return legacy.gather(input, index, dim, 0)

def min(input):
if use_pyboost():
return pyboost.min_op(input)
return legacy.reduce_min(input, (), False)

def minimum(input, other):
if use_pyboost():
return pyboost.minimum_op(input, other)
return legacy.minimum(input, other)

def argmin_with_value(input, axis, keep_dims):
if use_pyboost():
return pyboost.argmin_with_value_impl(input, axis, keep_dims)
return legacy.argmin(input, axis, keep_dims)

def flatten(input, start_dim, end_dim):
if use_pyboost():
return pyboost.flatten_ext_op(input, start_dim, end_dim)
if start_dim < 0:
start_dim = start_dim + input.ndim
if end_dim < 0:
end_dim = end_dim + input.ndim
input_shape = list(input.shape)
    input_shape[start_dim:end_dim + 1] = [-1]
return legacy.reshape(input, tuple(input_shape))

def conv2d_padding(input, weight, bias=None, stride=1, padding='valid', dilation=1, groups=1):
if use_pyboost():
return pyboost.conv2d_padding_op(input, weight, bias, stride, padding, dilation, groups)
return legacy.conv2d(input, weight, bias, stride, padding, dilation, groups)

def conv2d(input, weight, bias=None, stride=1, padding='valid', dilation=1, groups=1):
if use_pyboost():
return pyboost.conv2d_ext_op(input, weight, bias, stride, padding, dilation, groups)
return legacy.conv2d(input, weight, bias, stride, padding, dilation, groups)

def cos(input):
if use_pyboost():
return pyboost.cos_op(input)
return legacy.cos(input)

def pow_tensor_scalar(input, exponent):
if use_pyboost():
return pyboost.pow_tensor_scalar_op(input, exponent)
return legacy.pow(input, exponent)

def sin(input):
if use_pyboost():
return pyboost.sin_op(input)
return legacy.sin(input)

def batch_norm(input, weight, bias, running_mean=None, running_var=None, training=False, momentum=0.1, epsilon=1e-5):
    if use_pyboost():
        return pyboost.batch_norm_ext_op(input, weight, bias, running_mean, running_var, training, momentum, epsilon)
    return legacy.batch_norm(input, weight, bias, running_mean, running_var, training, momentum, epsilon, 'NHWC')

def silu(input):
if use_pyboost():
return pyboost.silu_op(input)
return legacy.silu(input)

def rsqrt(input):
if use_pyboost():
return pyboost.rsqrt_op(input)
return legacy.rsqrt(input)

def sqrt(input):
if use_pyboost():
return pyboost.sqrt_op(input)
return legacy.sqrt(input)

def masked_scatter(input, mask, value):
return legacy.masked_scatter(input, mask, value)

def neg(input):
if use_pyboost():
return pyboost.neg_op(input)
return legacy.neg(input)

def log1p(input):
if use_pyboost():
return pyboost.log1p_op(input)
return legacy.log1p(input)

def pow_scalar_tensor(input, scalar):
if use_pyboost():
return pyboost.pow_scalar_tensor_op(input, scalar)
return legacy.pow(input, scalar)

def adaptive_avg_pool2d(input, output_size):
if use_pyboost():
return pyboost.adaptive_avg_pool2d_ext_op(input, output_size)
return legacy.adaptive_avg_pool2_d(input, output_size)


def exp(input):
if use_pyboost():
return pyboost.exp_op(input)
return legacy.exp(input)

def sigmoid(input):
if use_pyboost():
return pyboost.sigmoid_op(input)
return legacy.sigmoid(input)

def constant_pad_nd(input, pad, value=0.0):
if use_pyboost():
return pyboost.constant_pad_nd_op(input, pad, value)

def rfft(input, n=None, dim=-1, norm=None):
    if use_pyboost():
        return pyboost.rfft_op(input, n, dim, norm)
    if n is None:
        n = input.shape[dim]
    if input.shape[dim] < n:
pad_inf = (0, n - input.shape[dim])
pad_dims = (0, 0) * (input.ndim - (dim + 1)) + pad_inf
input = constant_pad_nd(input, pad_dims)
else:
input = narrow(input, dim, 0, n)
return legacy.fft_with_size(input, input.ndim, False, True, norm, True, ())

def avg_pool2d(input, kernel_size, stride, padding=0, ceil_mode=False, count_include_pad=True, divisor_override=None):
if use_pyboost():
return pyboost.avg_pool2d_op(input, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)

def conj(input):
if use_pyboost():
return pyboost.conj_op(input)
return legacy.conj(input)

def fill_tensor(size, value, dtype):
if use_pyboost():
return pyboost.fill_tensor_op(size, value, dtype)
return legacy.fill_v2(size, value)

def maximum(input, other):
if use_pyboost():
return pyboost.maximum_op(input, other)
return legacy.maximum(input, other)

def irfft(input, n, dim, norm):
if use_pyboost():
return pyboost.irfft_op(input, n, dim, norm)
    return legacy.fft_with_size(input, input.ndim, True, True, norm, True, ())

def randn(size, generator, dtype):
if use_pyboost():
seed, offset = generator._step(12)
return pyboost.randn_op(size, seed, offset, dtype)
return cast(legacy.standard_normal(size, 0, 0), dtype)

def avg_pool1d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
if use_pyboost():
return pyboost.avg_pool1d_op(input, kernel_size, stride, padding, ceil_mode, count_include_pad)
return legacy.avg_pool1d(input, kernel_size, stride, padding, ceil_mode, count_include_pad)

def pow(input, exponent):
if use_pyboost():
return pyboost.pow_op(input, exponent)
return legacy.pow(input, exponent)

def roll(input, shifts, axis):
if use_pyboost():
return pyboost.roll_impl(input, shifts, axis)
return legacy.roll(input, shifts, axis)

def conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
if use_pyboost():
return pyboost.conv1d_ext_op(input, weight, bias, stride, padding, dilation, groups)
    return legacy.conv1d(input, weight, bias, padding, stride, dilation)

def conv1d_padding(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
if use_pyboost():
return pyboost.conv1d_padding_op(input, weight, bias, stride, padding, dilation, groups)
    return legacy.conv1d(input, weight, bias, padding, stride, dilation)

def square(input):
if use_pyboost():
return pyboost.square_op(input)
return legacy.square(input)

def lgamma(input):
return legacy.lgamma(input)

def reverse_v2(input, axis):
if isinstance(axis, int):
axis = (axis,)
if use_pyboost():
return pyboost.reverse_v2_impl(input, axis)
return legacy.reverse_v2(input, axis)

def unique_consecutive(input, return_inverse, return_counts, dim):
if use_pyboost():
return pyboost.unique_consecutive_impl(input, return_inverse, return_counts, dim)
return legacy.unique_consecutive(input, return_inverse, return_counts, dim)

def split_with_size(input, size, dim=0):
if use_pyboost():
return pyboost.split_with_size_op(input, size, dim)
return legacy.split_with_size(input, size, dim)

def softplus(input, beta=1, threshold=20):
if use_pyboost():
return pyboost.softplus_ext_op(input, beta, threshold)
return legacy.softplus(input, beta, threshold)

def remainder_tensor_scalar(input, other):
if use_pyboost():
return pyboost.remainder_tensor_scalar_op(input, other)
out = input - floor_div(input, other) * other
return out

def baddbmm(input, batch1, batch2, alpha=1, beta=1):
if use_pyboost():
return pyboost.baddbmm_op(input, batch1, batch2, alpha, beta)
return legacy.baddbmm(input, batch1, batch2, alpha, beta)

def floor(input):
if use_pyboost():
return pyboost.floor_op(input)
return legacy.floor(input)

def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
if use_pyboost():
return pyboost.conv_transpose2d_op(input, weight, bias, stride, padding, output_padding, groups, dilation)
return legacy.conv_transpose2d(input, weight, bias, stride, padding, output_padding, groups, dilation)

def relu(input):
if use_pyboost():
return pyboost.relu_op(input)
return legacy.re_lu(input)

def max_pool2d(input, kernel_size, stride=1, padding=0, dilation=1, ceil_mode=False, return_indices=False):
# out, indices = legacy.max_pool_with_argmax_v2(input, kernel_size, stride, padding, dilation, ceil_mode)

out, indices = legacy.max_pool_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode)
if return_indices:
return out, indices
return out

def upsample_bilinear2d(input, size=None, scale_factor=None, align_corners=False):
if use_pyboost():
return pyboost.upsample_bilinear2d_op(input, size, scale_factor, align_corners)
    return legacy.resize_bilinear_v2(input, size, align_corners, not align_corners)

def group_norm(input, num_groups, weight=None, bias=None, eps=1e-5):
if use_pyboost():
return pyboost.group_norm_op(input, num_groups, weight, bias, eps)
return legacy.group_norm(input, num_groups, eps, affine)

def nllloss_2d(input, target, weight, reduction='mean', ignore_index=-100):
if use_pyboost():
return pyboost.nllloss_2d_op(input, target, weight, reduction, ignore_index)
    return legacy.nll_loss(input, target, weight, reduction, ignore_index)

def inplace_relu(input):
if use_pyboost():
return pyboost.inplace_relu_op(input)
return legacy.assign(input, legacy.re_lu(input))

def expm1(input):
if use_pyboost():
return pyboost.expm1_op(input)
return legacy.expm1(input)

def upsample_bicubic2d(input, size=None, scale_factor=None, align_corners=False):
if use_pyboost():
return pyboost.upsample_bicubic2d_op(input, size, scale_factor, align_corners)
    return legacy.resize_bicubic(input, size, align_corners, not align_corners)

def acos(input):
if use_pyboost():
return pyboost.acos_op(input)
return legacy.acos(input)

def cdist(x1, x2, p):
return legacy.cdist(x1, x2, float(p))

def unstack_view(input, dim):
if use_pyboost():
return pyboost.unstack_ext_view_op(input, dim)
return legacy.unstack(input, dim, input.shape[dim])

def l1_loss(input, target, reduction='mean'):
if use_pyboost():
return pyboost.l1_loss_ext_op(input, target, reduction)
return legacy.l1(input, target, reduction)

def diag(input, diagonal):
if use_pyboost():
return pyboost.diag_ext_op(input, diagonal)
return legacy.diag(input, diagonal)

def logsigmoid(input):
if use_pyboost():
return pyboost.logsigmoid_op(input)
return legacy.logsigmoid(input)

def one_hot(tensor, num_classes):
    on_value = mindspore.Tensor(1, dtype=tensor.dtype)
    off_value = mindspore.Tensor(0, dtype=tensor.dtype)
    if use_pyboost():
        return pyboost.one_hot_ext_impl(tensor, num_classes, on_value, off_value, -1)
    return legacy.one_hot(tensor, num_classes, on_value, off_value, -1)

def var(input, dim=None, correction=1, keepdim=False):
if use_pyboost():
return pyboost.var_op(input, dim, correction, keepdim)
return legacy.var(input, dim, correction, keepdim)

def linspace(start, end, steps, dtype=None):
if use_pyboost():
return pyboost.lin_space_ext_op(start, end, steps, dtype)
return legacy.lin_space(start, end, steps)

def masked_select(input, mask):
if use_pyboost():
return pyboost.masked_select_op(input, mask)
return legacy.masked_select(input, mask)

def glu(input, dim=-1):
if use_pyboost():
return pyboost.glu_impl(input, dim)
return legacy.glu(input, dim)

def scatter_value(input, dim, index, src, reduce='none'):
if use_pyboost():
return pyboost.scatter_value_op(input, dim, index, src, reduce)
return legacy.scatter(input, dim, index, src, reduce)

def unique_dim(input, sorted, return_inverse, dim):
if use_pyboost():
return pyboost.unique_dim_op(input, sorted, return_inverse, dim)
return legacy.unique_dim(input, sorted, return_inverse, dim)

def inplace_add(input, other, alpha):
if use_pyboost():
return pyboost.inplace_add_ext_op(input, other, alpha)
return legacy.inplace_add(input, other)

def logsumexp(input, dim, keepdim):
if use_pyboost():
return pyboost.logsumexp_op(input, dim, keepdim)
return legacy.logsumexp(input, dim, keepdim)

def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, reduction, zero_infinity):
loss, log_alpha = legacy.ctc_loss_v2(log_probs, targets, input_lengths, target_lengths, blank, 'none', zero_infinity)
if reduction == 'sum':
loss = sum(loss, (), False, None)
if reduction == 'mean':
# input_type = loss.dtype
# target_length_t = target_lengths.clip(1., None)
# loss = loss.astype("float32")
loss = div(loss, target_lengths)
loss = mean(loss, (), False, None)
# loss = loss.astype(input_type)
return (loss, log_alpha)

def inplace_exponential(self, lambd, generator):
seed, offset = generator._step(12)
if use_pyboost():
return pyboost.inplace_exponential_op(self, lambd, seed, offset)
return legacy.expo(self, lambd, generator)

def im2col(input, kernel_size, dilation=1, padding=0, stride=1):
if use_pyboost() and not ON_A1:
return pyboost.im2col_ext_op(input, kernel_size, dilation, padding, stride)
out = legacy.im2_col(input, kernel_size, stride, dilation, padding)
out_shape = out.shape[:1] + (-1,) + out.shape[-1:]
out = reshape(out, out_shape)
return out

def upsample_nearest2d(input, output_size, scale_factors):
if use_pyboost():
return pyboost.upsample_nearest2d_op(input, output_size, scale_factors)
    return legacy.upsample_nearest2d(input, output_size, scale_factors)

def addmm(input, mat1, mat2, alpha=1.0, beta=1.0):
if use_pyboost():
return pyboost.addmm_op(input, mat1, mat2, alpha, beta)
return legacy.addmm(input, mat1, mat2, alpha, beta)

def meshgrid(input, lambd):
if use_pyboost():
return pyboost.meshgrid_impl(input, lambd)
return legacy.meshgrid(input, lambd)

def adaptive_avg_pool1d(input, output_size):
if use_pyboost():
return pyboost.adaptive_avg_pool1d_op(input, output_size)
return legacy.adaptive_avg_pool1d(input, output_size)

def conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
if use_pyboost():
return pyboost.conv3d_ext_op(input, weight, bias, stride, padding, dilation, groups)
return legacy.conv3d(input, weight, bias, stride, padding, dilation, groups)

def outer(input, other):
if use_pyboost():
return pyboost.outer_op(input, other)
return legacy.outer(input, other)

def addcmul(input, tensor1, tensor2, value=1.0):
if use_pyboost():
return pyboost.addcmul_op(input, tensor1, tensor2, value)
return legacy.addcmul(input, tensor1, tensor2, value)

def prelu(input, weight):
if use_pyboost():
return pyboost.prelu_op(input, weight)
return legacy.p_re_lu(input, weight)

def reciprocal(input):
if use_pyboost():
return pyboost.reciprocal_op(input)
return legacy.reciprocal(input)

def index_add_ext(input, dim, index, source, alpha):
if use_pyboost():
return pyboost.index_add_ext_op(input, dim, index, source, alpha)
return legacy.index_add(input, dim, index, source, alpha)

def polar(abs, angle):
if use_pyboost():
return pyboost.polar_op(abs, angle)
return legacy.polar(abs, angle)

def upsample_linear1d(input, output_size, scale_factor, align_corners=False):
if use_pyboost():
return pyboost.upsample_linear1d_op(input, output_size, scale_factor, align_corners)
return legacy.upsample_linear1d(input, output_size, scale_factor, align_corners)

def grid_sampler_2d(input, grid, mode='bilinear', padding_mode='zeros', align_corners=False):
if use_pyboost():
return pyboost.grid_sampler_2d_impl(input, grid, mode, padding_mode, align_corners)
return legacy.grid_sampler_2d(input, grid, mode, padding_mode, align_corners)

def pixel_shuffle(input, upscale_factor):
if use_pyboost():
return pyboost.pixel_shuffle_op(input, upscale_factor)
return legacy.pixel_shuffle(input, upscale_factor)

def view_as_complex(input):
real_part, imag_part = chunk(input, 2, -1)
return legacy.complex(squeeze(real_part, -1), squeeze(imag_part, -1))

def rms_norm(input, weight, eps=1e-5):
if use_pyboost():
return pyboost.rms_norm_impl(input, weight, eps)[0]
input_dtype = input.dtype
input = cast(input, mindspore.float32)
variance = mean(pow(input, 2), -1, True, None)
input = mul(input, rsqrt(add(variance, eps, 1)))
return mul(weight, cast(input, input_dtype))

def normal_float_float(mean, std, size, dtype, generator):
seed, offset = generator._step(12)
if use_pyboost():
return pyboost.normal_float_float_op(mean, std, size, seed, offset)

def real(input):
if use_pyboost():
return pyboost.real_op(input)
return legacy.real(input)

def imag(input):
return legacy.imag(input)

def leaky_relu(input, negative_slope):
if use_pyboost():
return pyboost.leaky_relu_ext_op(input, negative_slope)
return legacy.leaky_relu(input, negative_slope)

def ceil(input):
if use_pyboost():
return pyboost.ceil_op(input)
return legacy.ceil(input)

def erf(input):
if use_pyboost():
return pyboost.erf_op(input)
return legacy.erf(input)

def cross(input, other, dim):
if use_pyboost():
return pyboost.cross_impl(input, other, dim)
return legacy.cross(input, other, dim)

def elu(input, alpha):
if use_pyboost():
return pyboost.elu_ext_impl(input, alpha)
return legacy.elu(input, alpha)

def reduce_max(input, axis, keepdims):
if use_pyboost():
return pyboost.reduce_max_impl(input, axis, keepdims)
return legacy.reduce_max(input, axis, keepdims)

def dynamic_rnn(x, w, b, seq_length, init_h, init_c):
return legacy.dynamic_rnn(x, w, b, seq_length, init_h, init_c,
'LSTM', 'UNIDIRECTIONAL', 1, False, 1.0, -1.0, 0, True, 'tanh', 0.0, True)

def nan_to_num(input, nan=0.0, posinf=None, neginf=None):
return legacy.nan_to_num(input, nan, posinf, neginf)

def round(input, decimals):
if use_pyboost():
return pyboost.round_op(input, decimals)
return legacy.round(input, decimals)

def fftn(input, s=None, dim=None, norm=None):
if use_pyboost():
return pyboost.fftn_op(input, s, dim, norm)

def eye(n, m=None, dtype=None):
if use_pyboost():
return pyboost.eye_op(n, m, dtype)
return legacy.eye(n, m, dtype)

def erfinv(input):
if use_pyboost():
return pyboost.erfinv_op(input)
return legacy.erfinv(input)

def logit(input, eps=1e-5):
return legacy.logit(input, eps)

def bitwise_xor_tensor(input, other):
if use_pyboost():
return pyboost.bitwise_xor_tensor_op(input, other)
return legacy.bitwise_xor(input, other)

def unique2(input, sorted, return_inverse, return_counts):
if use_pyboost():
return pyboost.unique2_op(input, sorted, return_inverse, return_counts)
return legacy.unique(input, sorted, return_inverse, return_counts)

def sign(input):
if use_pyboost():
return pyboost.sign_op(input)
return legacy.sign(input)

def log2(input):
if use_pyboost():
return pyboost.log2_op(input)
return legacy.log2(input)

def bucketize(input, boundaries, right=False):
epsilon_ = 0. if right else 1.e-6
boundaries = [boundary + epsilon_ for boundary in boundaries]
return legacy.bucketize(input, boundaries)

def inplace_fill_diagonal(input, fill_value, wrap):
if use_pyboost():
return pyboost.inplace_fill_diagonal_op(input, fill_value, wrap)
return legacy.fill_diagonal(input, fill_value, wrap)

def clamp_tensor(input, min, max):
if use_pyboost():
return pyboost.clamp_tensor_op(input, min, max)

def hswish(input):
if use_pyboost():
return pyboost.hswish_op(input)
return legacy.h_swish(input)

def logical_and(input, other):
if use_pyboost():
return pyboost.logical_and_op(input, other)
return legacy.logical_and(input, other)

def as_strided(input, size, stride, storage_offset):
if use_pyboost():
return pyboost.as_strided_op(input, size, stride, storage_offset)
return legacy.as_strided(input, size, stride, storage_offset)

def relu6(input):
if use_pyboost():
return pyboost.relu6_op(input)
return legacy.re_lu6(input)

def col2im(input, output_size, kernel_size, dilation=1, padding=0, stride=1):
if use_pyboost():
return pyboost.col2im_ext_op(input, output_size, kernel_size, dilation, padding, stride)
return legacy.col2im(input, output_size, kernel_size, dilation, padding, stride)

def flash_attention_score(query, key, value, real_shift, drop_mask, padding_mask, attn_mask, prefix, actual_seq_qlen, actual_seq_kvlen, head_num, keep_prob, scale_value, pre_tokens, next_tokens, inner_precise, input_layout, sparse_mode):
if use_pyboost():
return pyboost.flash_attention_score_impl(query, key, value, real_shift, drop_mask, padding_mask, attn_mask, prefix, actual_seq_qlen, actual_seq_kvlen, head_num, keep_prob, scale_value, pre_tokens, next_tokens, inner_precise, input_layout, sparse_mode)
return legacy.flash_attention_score(query, key, value, real_shift, drop_mask, padding_mask, attn_mask, prefix, actual_seq_qlen, actual_seq_kvlen, head_num, keep_prob, scale_value, pre_tokens, next_tokens, inner_precise, input_layout, sparse_mode)

def randperm(n, generator, dtype):
    seed, offset = generator._step(12)  # pylint: disable=protected-access
    if use_pyboost():
        return pyboost.randperm_ext_op(n, seed, offset, dtype)
    return legacy.randperm(n, seed)

def logical_or(input_x, input_y):
if use_pyboost():
return pyboost.logical_or_op(input_x, input_y)
return legacy.logical_or(input_x, input_y)

def dropout2d(input_x, p):
return legacy.dropout2_d(input_x, p)

def linalg_qr(input_x, mode):
if use_pyboost():
return pyboost.linalg_qr_op(input_x, mode)
    full_matrices = mode == 'complete'
return legacy.qr(input_x, full_matrices)

def bernoulli(input, generator):
seed, offset = generator._step(12)
if use_pyboost():
return pyboost.bernoulli_ext_op(input, seed, offset)
return legacy.bernoulli(input, seed, offset)

def multinomial(input, num_samples, replacement, generator):
seed, offset = generator._step(12) # pylint: disable=protected-access
return pyboost.multinomial_ext_op(input, num_samples, replacement, seed, offset)
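
All of the wrappers in this file follow the same two-path dispatch: try the pyboost kernel when use_pyboost() is enabled, otherwise fall back to the legacy primitive. A minimal sketch of that pattern, using a hypothetical operator name my_op that is not part of this diff:

def my_op(input, other):
    # Prefer the pyboost kernel when the global switch is on,
    # otherwise fall back to the legacy (graph-mode) primitive.
    if use_pyboost():
        return pyboost.my_op_impl(input, other)
    return legacy.my_op(input, other)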

+ 10
- 2
mindnlp/core/_dtype.py View File

@@ -4,7 +4,7 @@ from mindspore.common.dtype import *
from mindspore._c_expression import typing
from mindspore._c_expression.typing import Type

from .configs import ON_A1, SUPPORT_BF16
from .configs import ON_A1, SUPPORT_BF16, DEVICE_TARGET

if SUPPORT_BF16:
from mindspore.common.np_dtype import bfloat16 as np_bfloat16# pylint: disable=import-error
@@ -15,7 +15,7 @@ bool_alias = bool
float_alias = float
int_alias = int

if ON_A1:
if ON_A1 or DEVICE_TARGET == 'GPU':
    warnings.warn('MindSpore on GPU/910A does not support bfloat16, use float16 instead.')
bfloat16 = float16

@@ -124,3 +124,11 @@ py2dtype = {
float_alias: float,
int_alias: int64
}

mantissa_bits_map = {
int64: 63,
    float32: 23,   # FP32, single precision
    float64: 52,   # FP64, double precision
    float16: 10,   # FP16, half precision
    bfloat16: 7,   # BF16, bfloat16 (brain floating point)
}
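
As a quick sanity check of the table above (a sketch, not code from this diff): the unit roundoff of a floating-point format is 2 raised to the negative mantissa width, so the map reproduces the familiar machine-epsilon values.

def rounding_eps(dtype):
    # Hypothetical helper: derive the unit roundoff from the mantissa widths above.
    return 2.0 ** -mantissa_bits_map[dtype]

# rounding_eps(float32)  -> 2**-23 ≈ 1.19e-7
# rounding_eps(float64)  -> 2**-52 ≈ 2.22e-16
# rounding_eps(bfloat16) -> 2**-7  ≈ 7.8e-3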

+ 3
- 0
mindnlp/core/_jit_internal.py View File

@@ -95,3 +95,6 @@ class BroadcastingListCls:
BroadcastingList1 = BroadcastingListCls()
for i in range(2, 7):
globals()[f"BroadcastingList{i}"] = BroadcastingList1

def is_scripting():
    return False

+ 0
- 0
mindnlp/core/_op_prim/__init__.py View File


+ 0
- 0
mindnlp/core/_op_prim/ascend/__init__.py View File


+ 3511
- 0
mindnlp/core/_op_prim/ascend/legacy.py
File diff suppressed because it is too large
View File


+ 877
- 0
mindnlp/core/_op_prim/ascend/pyboost.py View File

@@ -0,0 +1,877 @@
from mindspore.ops.auto_generate.gen_ops_prim import *
from mindspore.ops.auto_generate.pyboost_inner_prim import *

abs_op = Abs().set_device('Ascend')

acos_ext_op = AcosExt().set_device('Ascend')

acosh_ext_op = AcoshExt().set_device('Ascend')

adamw_op = AdamW().set_device('Ascend')

adaptive_avg_pool1d_op = AdaptiveAvgPool1D().set_device('Ascend')

adaptive_avg_pool2d_ext_op = AdaptiveAvgPool2DExt().set_device('Ascend')

adaptive_avg_pool2d_grad_ext_op = AdaptiveAvgPool2DGradExt().set_device('Ascend')

adaptive_avg_pool3d_ext_op = AdaptiveAvgPool3DExt().set_device('Ascend')

adaptive_avg_pool3d_grad_ext_op = AdaptiveAvgPool3DGradExt().set_device('Ascend')

adaptive_max_pool1d_op = AdaptiveMaxPool1D().set_device('Ascend')

add_op = Add().set_device('Ascend')

add_ext_op = AddExt().set_device('Ascend')

add_layer_norm_grad_op = AddLayerNormGrad().set_device('Ascend')

add_layernorm_v2_op = AddLayerNormV2().set_device('Ascend')

add_rms_norm_op = AddRmsNorm().set_device('Ascend')

add_scalar_op = AddScalar().set_device('Ascend')

addbmm_op = Addbmm().set_device('Ascend')

addcdiv_ext_op = AddcdivExt().set_device('Ascend')

addcmul_ext_op = AddcmulExt().set_device('Ascend')

addmm_op = Addmm().set_device('Ascend')

addmv_op = Addmv().set_device('Ascend')

all_gather_matmul_op = AllGatherMatmul().set_device('Ascend')

arange_op = Arange().set_device('Ascend')

argmax_ext_op = ArgMaxExt().set_device('Ascend')

argmin_ext_op = ArgMinExt().set_device('Ascend')

argsort_op = ArgSort().set_device('Ascend')

as_strided_op = AsStrided().set_device('Ascend')

asin_ext_op = AsinExt().set_device('Ascend')

asinh_ext_op = AsinhExt().set_device('Ascend')

atan2_ext_op = Atan2Ext().set_device('Ascend')

atan_ext_op = AtanExt().set_device('Ascend')

atanh_op = Atanh().set_device('Ascend')

avg_pool1d_op = AvgPool1D().set_device('Ascend')

avg_pool2d_op = AvgPool2D().set_device('Ascend')

avg_pool2d_grad_op = AvgPool2DGrad().set_device('Ascend')

avg_pool3d_ext_op = AvgPool3DExt().set_device('Ascend')

avg_pool3d_grad_ext_op = AvgPool3DGradExt().set_device('Ascend')

baddbmm_op = Baddbmm().set_device('Ascend')

batch_norm_elemt_op = BatchNormElemt().set_device('Ascend')

batch_norm_elemt_grad_op = BatchNormElemtGrad().set_device('Ascend')

batch_norm_ext_op = BatchNormExt().set_device('Ascend')

batch_norm_gather_stats_with_counts_op = BatchNormGatherStatsWithCounts().set_device('Ascend')

batch_norm_reduce_grad_op = BatchNormReduceGrad().set_device('Ascend')

batch_norm_stats_op = BatchNormStats().set_device('Ascend')

bernoulli_ext_op = BernoulliExt().set_device('Ascend')

binary_cross_entropy_with_logits_backward_op = BinaryCrossEntropyWithLogitsBackward().set_device('Ascend')

bincount_ext_op = BincountExt().set_device('Ascend')

bitwise_and_scalar_op = BitwiseAndScalar().set_device('Ascend')

bitwise_and_tensor_op = BitwiseAndTensor().set_device('Ascend')

bitwise_not_op = BitwiseNot().set_device('Ascend')

bitwise_or_scalar_op = BitwiseOrScalar().set_device('Ascend')

bitwise_or_tensor_op = BitwiseOrTensor().set_device('Ascend')

bitwise_xor_scalar_op = BitwiseXorScalar().set_device('Ascend')

bitwise_xor_tensor_op = BitwiseXorTensor().set_device('Ascend')

bmm_ext_op = BatchMatMulExt().set_device('Ascend')

broadcast_to_view_op = BroadcastToView().set_device('Ascend')

ceil_op = Ceil().set_device('Ascend')

chunk_op = Chunk().set_device('Ascend')

chunk_view_op = ChunkView().set_device('Ascend')

clamp_scalar_op = ClampScalar().set_device('Ascend')

clamp_tensor_op = ClampTensor().set_device('Ascend')

clone_op = Clone().set_device('Ascend')

col2im_ext_op = Col2ImExt().set_device('Ascend')

col2im_grad_op = Col2ImGrad().set_device('Ascend')

constant_pad_nd_op = ConstantPadND().set_device('Ascend')

contiguous_op = Contiguous().set_device('Ascend')

conv1d_ext_op = Conv1DExt().set_device('Ascend')

conv1d_padding_op = Conv1DPadding().set_device('Ascend')

conv2d_ext_op = Conv2DExt().set_device('Ascend')

conv2d_padding_op = Conv2DPadding().set_device('Ascend')

conv3d_ext_op = Conv3DExt().set_device('Ascend')

conv3d_padding_op = Conv3DPadding().set_device('Ascend')

conv_transpose2d_op = ConvTranspose2D().set_device('Ascend')

convolution_op = Convolution().set_device('Ascend')

convolution_grad_op = ConvolutionGrad().set_device('Ascend')

convolution_str_op = ConvolutionStr().set_device('Ascend')

convolution_str_grad_op = ConvolutionStrGrad().set_device('Ascend')

copy_op = Copy().set_device('Ascend')

cos_op = Cos().set_device('Ascend')

cosh_op = Cosh().set_device('Ascend')

count_nonzero_op = CountNonZero().set_device('Ascend')

cummin_ext_op = CumminExt().set_device('Ascend')

cumsum_ext_op = CumsumExt().set_device('Ascend')

dense_op = Dense().set_device('Ascend')

diag_ext_op = DiagExt().set_device('Ascend')

dist_comm_all_gather_op = DistCommAllGather().set_device('Ascend')

dist_comm_all_gather_into_tensor_op = DistCommAllGatherIntoTensor().set_device('Ascend')

dist_comm_all_reduce_op = DistCommAllReduce().set_device('Ascend')

dist_comm_all_to_all_v_op = DistCommAllToAllV().set_device('Ascend')

dist_comm_all_to_all_v_single_op = DistCommAllToAllVSingle().set_device('Ascend')

dist_comm_barrier_op = DistCommBarrier().set_device('Ascend')

dist_comm_batch_isend_irecv_op = DistCommBatchIsendIrecv().set_device('Ascend')

dist_comm_broadcast_op = DistCommBroadcast().set_device('Ascend')

dist_comm_gather_op = DistCommGather().set_device('Ascend')

dist_comm_gather_into_tensor_op = DistCommGatherIntoTensor().set_device('Ascend')

dist_comm_irecv_op = DistCommIrecv().set_device('Ascend')

dist_comm_isend_op = DistCommIsend().set_device('Ascend')

dist_comm_reduce_op = DistCommReduce().set_device('Ascend')

dist_comm_reduce_scatter_op = DistCommReduceScatter().set_device('Ascend')

dist_comm_reduce_scatter_tensor_op = DistCommReduceScatterTensor().set_device('Ascend')

dist_comm_scatter_op = DistCommScatter().set_device('Ascend')

dist_comm_scatter_tensor_op = DistCommScatterTensor().set_device('Ascend')

div_op = Div().set_device('Ascend')

divmod_op = DivMod().set_device('Ascend')

divmods_op = DivMods().set_device('Ascend')

divs_op = Divs().set_device('Ascend')

dot_op = Dot().set_device('Ascend')

dropout_do_mask_ext_op = DropoutDoMaskExt().set_device('Ascend')

dropout_ext_op = DropoutExt().set_device('Ascend')

dropout_gen_mask_ext_op = DropoutGenMaskExt().set_device('Ascend')

dropout_grad_ext_op = DropoutGradExt().set_device('Ascend')

dynamic_quant_ext_op = DynamicQuantExt().set_device('Ascend')

elu_grad_ext_op = EluGradExt().set_device('Ascend')

embedding_op = Embedding().set_device('Ascend')

embedding_dense_backward_op = EmbeddingDenseBackward().set_device('Ascend')

equal_op = Equal().set_device('Ascend')

equal_ext_op = EqualExt().set_device('Ascend')

erf_op = Erf().set_device('Ascend')

erfc_op = Erfc().set_device('Ascend')

erfinv_op = Erfinv().set_device('Ascend')

exp_op = Exp().set_device('Ascend')

exp2_op = Exp2().set_device('Ascend')

expand_as_op = ExpandAs().set_device('Ascend')

expand_dims_op = ExpandDims().set_device('Ascend')

expand_dims_view_op = ExpandDimsView().set_device('Ascend')

expm1_op = Expm1().set_device('Ascend')

eye_op = Eye().set_device('Ascend')

fill_scalar_op = FillScalar().set_device('Ascend')

fill_tensor_op = FillTensor().set_device('Ascend')

flatten_ext_op = FlattenExt().set_device('Ascend')

floor_op = Floor().set_device('Ascend')

floor_div_op = FloorDiv().set_device('Ascend')

floor_div_scalar_op = FloorDivScalar().set_device('Ascend')

fmod_scalar_op = FmodScalar().set_device('Ascend')

fmod_tensor_op = FmodTensor().set_device('Ascend')

frac_op = Frac().set_device('Ascend')

full_like_op = FullLike().set_device('Ascend')

gather_d_op = GatherD().set_device('Ascend')

gather_d_grad_v2_op = GatherDGradV2().set_device('Ascend')

gcd_op = Gcd().set_device('Ascend')

gelu_op = GeLU().set_device('Ascend')

gelu_ext_op = GeluExt().set_device('Ascend')

gelu_grad_op = GeLUGrad().set_device('Ascend')

gelu_grad_ext_op = GeluGradExt().set_device('Ascend')

generator_op = Generator().set_device('Ascend')

gmm_op = Gmm().set_device('Ascend')

gmm_backward_op = GmmBackward().set_device('Ascend')

gmm_backward_fusion_op = GmmBackwardFusion().set_device('Ascend')

gmm_v2_op = GmmV2().set_device('Ascend')

gmm_v2_backward_op = GmmV2Backward().set_device('Ascend')

gmm_v2_backward_fusion_op = GmmV2BackwardFusion().set_device('Ascend')

greater_op = Greater().set_device('Ascend')

greater_equal_op = GreaterEqual().set_device('Ascend')

greater_equal_scalar_op = GreaterEqualScalar().set_device('Ascend')

group_norm_op = GroupNorm().set_device('Ascend')

group_norm_grad_op = GroupNormGrad().set_device('Ascend')

grouped_matmul_v2_op = GroupedMatmulV2().set_device('Ascend')

grouped_matmul_v4_op = GroupedMatmulV4().set_device('Ascend')

hardtanh_op = Hardtanh().set_device('Ascend')

hardtanh_grad_op = HardtanhGrad().set_device('Ascend')

histc_ext_op = HistcExt().set_device('Ascend')

hsigmoid_op = HSigmoid().set_device('Ascend')

hsigmoid_grad_op = HSigmoidGrad().set_device('Ascend')

hswish_op = HSwish().set_device('Ascend')

hswish_grad_op = HSwishGrad().set_device('Ascend')

im2col_ext_op = Im2ColExt().set_device('Ascend')

index_op = Index().set_device('Ascend')

index_add_ext_op = IndexAddExt().set_device('Ascend')

index_fill_scalar_op = IndexFillScalar().set_device('Ascend')

index_fill_tensor_op = IndexFillTensor().set_device('Ascend')

index_select_op = IndexSelect().set_device('Ascend')

inner_comm_all_gather_op = InnerCommAllGather().set_device('Ascend')

inner_comm_all_reduce_op = InnerCommAllReduce().set_device('Ascend')

inner_comm_all_to_all_v_op = InnerCommAllToAllV().set_device('Ascend')

inner_comm_irecv_op = InnerCommIrecv().set_device('Ascend')

inner_comm_isend_op = InnerCommIsend().set_device('Ascend')

inner_comm_reduce_scatter_op = InnerCommReduceScatter().set_device('Ascend')

inner_index_op = InnerIndex().set_device('Ascend')

inner_inplace_index_put_op = InnerInplaceIndexPut().set_device('Ascend')

inner_non_zero_op = InnerNonZero().set_device('Ascend')

inplace_add_ext_op = InplaceAddExt().set_device('Ascend')

inplace_addmm_op = InplaceAddmm().set_device('Ascend')

inplace_adds_ext_op = InplaceAddsExt().set_device('Ascend')

inplace_clamp_scalar_op = InplaceClampScalar().set_device('Ascend')

inplace_clamp_tensor_op = InplaceClampTensor().set_device('Ascend')

inplace_copy_op = InplaceCopy().set_device('Ascend')

inplace_div_op = InplaceDiv().set_device('Ascend')

inplace_divmod_op = InplaceDivMod().set_device('Ascend')

inplace_divmods_op = InplaceDivMods().set_device('Ascend')

inplace_divs_op = InplaceDivs().set_device('Ascend')

inplace_elu_op = InplaceElu().set_device('Ascend')

inplace_erfinv_op = InplaceErfinv().set_device('Ascend')

inplace_exp_op = InplaceExp().set_device('Ascend')

inplace_exponential_op = InplaceExponential().set_device('Ascend')

inplace_fill_diagonal_op = InplaceFillDiagonal().set_device('Ascend')

inplace_fill_scalar_op = InplaceFillScalar().set_device('Ascend')

inplace_fill_tensor_op = InplaceFillTensor().set_device('Ascend')

inplace_floor_op = InplaceFloor().set_device('Ascend')

inplace_floor_divide_op = InplaceFloorDivide().set_device('Ascend')

inplace_floor_divides_op = InplaceFloorDivides().set_device('Ascend')

inplace_grouped_matmul_add_op = InplaceGroupedMatmulAdd().set_device('Ascend')

inplace_hardtanh_op = InplaceHardtanh().set_device('Ascend')

inplace_index_add_op = InplaceIndexAddExt().set_device('Ascend')

inplace_index_put_op = InplaceIndexPut().set_device('Ascend')

inplace_log_op = InplaceLog().set_device('Ascend')

inplace_masked_fill_scalar_op = InplaceMaskedFillScalar().set_device('Ascend')

inplace_masked_fill_tensor_op = InplaceMaskedFillTensor().set_device('Ascend')

inplace_mul_op = InplaceMul().set_device('Ascend')

inplace_muls_op = InplaceMuls().set_device('Ascend')

inplace_normal_op = InplaceNormal().set_device('Ascend')

inplace_put_op = InplacePut().set_device('Ascend')

inplace_random_op = InplaceRandom().set_device('Ascend')

inplace_relu_op = InplaceReLU().set_device('Ascend')

inplace_scatter_add_op = InplaceScatterAdd().set_device('Ascend')

inplace_scatter_src_op = InplaceScatterSrc().set_device('Ascend')

inplace_scatter_src_reduce_op = InplaceScatterSrcReduce().set_device('Ascend')

inplace_scatter_value_op = InplaceScatterValue().set_device('Ascend')

inplace_scatter_value_reduce_op = InplaceScatterValueReduce().set_device('Ascend')

inplace_stop_gradient_op = InplaceStopGradient().set_device('Ascend')

inplace_sub_ext_op = InplaceSubExt().set_device('Ascend')

inplace_sub_scalar_op = InplaceSubScalar().set_device('Ascend')

inplace_tanh_op = InplaceTanh().set_device('Ascend')

inplace_threshold_op = InplaceThreshold().set_device('Ascend')

inplace_uniform_op = InplaceUniform().set_device('Ascend')

inplace_zero_op = InplaceZero().set_device('Ascend')

isfinite_op = IsFinite().set_device('Ascend')

isinf_op = IsInf().set_device('Ascend')

isneginf_op = IsNegInf().set_device('Ascend')

kl_div_op = KLDiv().set_device('Ascend')

kl_div_grad_op = KLDivGrad().set_device('Ascend')

kthvalue_op = Kthvalue().set_device('Ascend')

kv_cache_scatter_update_op = KVCacheScatterUpdate().set_device('Ascend')

l1_loss_backward_ext_op = L1LossBackwardExt().set_device('Ascend')

l1_loss_ext_op = L1LossExt().set_device('Ascend')

layer_norm_ext_op = LayerNormExt().set_device('Ascend')

layer_norm_grad_ext_op = LayerNormGradExt().set_device('Ascend')

leaky_relu_ext_op = LeakyReLUExt().set_device('Ascend')

leaky_relu_grad_ext_op = LeakyReLUGradExt().set_device('Ascend')

lerp_op = Lerp().set_device('Ascend')

lerp_scalar_op = LerpScalar().set_device('Ascend')

less_op = Less().set_device('Ascend')

less_equal_op = LessEqual().set_device('Ascend')

lin_space_ext_op = LinSpaceExt().set_device('Ascend')

linalg_qr_op = LinalgQr().set_device('Ascend')

linalg_vector_norm_op = LinalgVectorNorm().set_device('Ascend')

log_op = Log().set_device('Ascend')

log10_op = Log10().set_device('Ascend')

log1p_op = Log1p().set_device('Ascend')

log2_op = Log2().set_device('Ascend')

log_softmax_ext_op = LogSoftmaxExt().set_device('Ascend')

logaddexp_op = LogAddExp().set_device('Ascend')

logaddexp2_op = LogAddExp2().set_device('Ascend')

logical_and_op = LogicalAnd().set_device('Ascend')

logical_not_op = LogicalNot().set_device('Ascend')

logical_or_op = LogicalOr().set_device('Ascend')

logical_xor_op = LogicalXor().set_device('Ascend')

logsigmoid_op = LogSigmoid().set_device('Ascend')

logsigmoid_grad_op = LogSigmoidGrad().set_device('Ascend')

logsumexp_op = LogSumExp().set_device('Ascend')

masked_fill_op = MaskedFill().set_device('Ascend')

masked_select_op = MaskedSelect().set_device('Ascend')

masked_select_grad_op = MaskedSelectGrad().set_device('Ascend')

matmul_allreduce_add_rmsnorm_op = MatmulAllReduceAddRmsNorm().set_device('Ascend')

matmul_ext_op = MatMulExt().set_device('Ascend')

matmul_reduce_scatter_op = MatmulReduceScatter().set_device('Ascend')

matrix_inverse_ext_op = MatrixInverseExt().set_device('Ascend')

max_op = Max().set_device('Ascend')

max_dim_op = MaxDim().set_device('Ascend')

max_unpool2d_ext_op = MaxUnpool2DExt().set_device('Ascend')

maximum_op = Maximum().set_device('Ascend')

mean_ext_op = MeanExt().set_device('Ascend')

median_dim_op = MedianDim().set_device('Ascend')

median_ext_op = MedianExt().set_device('Ascend')

min_op = Min().set_device('Ascend')

min_dim_op = MinDim().set_device('Ascend')

minimum_op = Minimum().set_device('Ascend')

mish_ext_op = MishExt().set_device('Ascend')

mish_grad_ext_op = MishGradExt().set_device('Ascend')

mm_ext_op = Mm().set_device('Ascend')

moe_compute_expert_tokens_op = MoeComputeExpertTokens().set_device('Ascend')

moe_finalize_routing_op = MoeFinalizeRouting().set_device('Ascend')

moe_gating_top_k_softmax_op = MoeGatingTopKSoftmax().set_device('Ascend')

moe_init_routing_op = MoeInitRouting().set_device('Ascend')

moe_init_routing_v2_op = MoeInitRoutingV2().set_device('Ascend')

moe_token_permute_op = MoeTokenPermute().set_device('Ascend')

moe_token_permute_grad_op = MoeTokenPermuteGrad().set_device('Ascend')

moe_token_unpermute_op = MoeTokenUnpermute().set_device('Ascend')

moe_token_unpermute_grad_op = MoeTokenUnpermuteGrad().set_device('Ascend')

mse_loss_ext_op = MSELossExt().set_device('Ascend')

mse_loss_grad_ext_op = MSELossGradExt().set_device('Ascend')

mul_op = Mul().set_device('Ascend')

muls_op = Muls().set_device('Ascend')

multi_scale_deformable_attn_op = MultiScaleDeformableAttn().set_device('Ascend')

multi_scale_deformable_attn_grad_op = MultiScaleDeformableAttnGrad().set_device('Ascend')

multinomial_ext_op = MultinomialExt().set_device('Ascend')

mv_op = Mv().set_device('Ascend')

nansum_op = Nansum().set_device('Ascend')

narrow_op = Narrow().set_device('Ascend')

narrow_view_op = NarrowView().set_device('Ascend')

neg_op = Neg().set_device('Ascend')

new_ones_op = NewOnes().set_device('Ascend')

new_zeros_op = NewZeros().set_device('Ascend')

nllloss_2d_op = NLLLoss2d().set_device('Ascend')

nllloss_2d_grad_op = NLLLoss2dGrad().set_device('Ascend')

non_zero_op = NonZero().set_device('Ascend')

non_zero_ext_op = NonZeroExt().set_device('Ascend')

norm_op = Norm().set_device('Ascend')

normal_float_float_op = NormalFloatFloat().set_device('Ascend')

normal_float_tensor_op = NormalFloatTensor().set_device('Ascend')

normal_tensor_float_op = NormalTensorFloat().set_device('Ascend')

normal_tensor_tensor_op = NormalTensorTensor().set_device('Ascend')

not_equal_op = NotEqual().set_device('Ascend')

ones_like_ext_op = OnesLikeExt().set_device('Ascend')

outer_op = Outer().set_device('Ascend')

pixel_shuffle_op = PixelShuffle().set_device('Ascend')

polar_op = Polar().set_device('Ascend')

pow_op = Pow().set_device('Ascend')

pow_scalar_tensor_op = PowScalarTensor().set_device('Ascend')

pow_tensor_scalar_op = PowTensorScalar().set_device('Ascend')

prelu_op = PReLU().set_device('Ascend')

prelu_grad_op = PReLUGrad().set_device('Ascend')

prod_ext_op = ProdExt().set_device('Ascend')

quant_v2_op = QuantV2().set_device('Ascend')

rand_ext_op = RandExt().set_device('Ascend')

rand_like_ext_op = RandLikeExt().set_device('Ascend')

randint_op = RandInt().set_device('Ascend')

randint_like_op = RandIntLike().set_device('Ascend')

randn_op = Randn().set_device('Ascend')

randn_like_op = RandnLike().set_device('Ascend')

randperm_ext_op = RandpermExt().set_device('Ascend')

reciprocal_op = Reciprocal().set_device('Ascend')

reflection_pad_1d_op = ReflectionPad1D().set_device('Ascend')

reflection_pad_1d_grad_op = ReflectionPad1DGrad().set_device('Ascend')

reflection_pad_2d_op = ReflectionPad2D().set_device('Ascend')

reflection_pad_2d_grad_op = ReflectionPad2DGrad().set_device('Ascend')

reflection_pad_3d_op = ReflectionPad3D().set_device('Ascend')

reflection_pad_3d_grad_op = ReflectionPad3DGrad().set_device('Ascend')

relu_op = ReLU().set_device('Ascend')

relu_grad_op = ReluGrad().set_device('Ascend')

remainder_scalar_tensor_op = RemainderScalarTensor().set_device('Ascend')

remainder_tensor_scalar_op = RemainderTensorScalar().set_device('Ascend')

remainder_tensor_tensor_op = RemainderTensorTensor().set_device('Ascend')

repeat_op = Repeat().set_device('Ascend')

repeat_interleave_grad_op = RepeatInterleaveGrad().set_device('Ascend')

repeat_interleave_int_op = RepeatInterleaveInt().set_device('Ascend')

repeat_interleave_tensor_op = RepeatInterleaveTensor().set_device('Ascend')

replication_pad_1d_op = ReplicationPad1D().set_device('Ascend')

replication_pad_1d_grad_op = ReplicationPad1DGrad().set_device('Ascend')

replication_pad_2d_op = ReplicationPad2D().set_device('Ascend')

replication_pad_2d_grad_op = ReplicationPad2DGrad().set_device('Ascend')

replication_pad_3d_op = ReplicationPad3D().set_device('Ascend')

replication_pad_3d_grad_op = ReplicationPad3DGrad().set_device('Ascend')

reshape_op = Reshape().set_device('Ascend')

rms_norm_grad_op = RmsNormGrad().set_device('Ascend')

rotary_position_embedding_op = RotaryPositionEmbedding().set_device('Ascend')

rotary_position_embedding_grad_op = RotaryPositionEmbeddingGrad().set_device('Ascend')

round_op = Round().set_device('Ascend')

rsqrt_op = Rsqrt().set_device('Ascend')

scatter_op = Scatter().set_device('Ascend')

scatter_add_ext_op = ScatterAddExt().set_device('Ascend')

scatter_value_op = ScatterValue().set_device('Ascend')

select_op = Select().set_device('Ascend')

select_ext_view_op = SelectExtView().set_device('Ascend')

select_v2_op = SelectV2().set_device('Ascend')

selu_ext_op = SeLUExt().set_device('Ascend')

selu_grad_op = SeluGrad().set_device('Ascend')

sigmoid_op = Sigmoid().set_device('Ascend')

sigmoid_grad_op = SigmoidGrad().set_device('Ascend')

sign_op = Sign().set_device('Ascend')

silent_check_v2_op = SilentCheckV2().set_device('Ascend')

silent_check_v3_op = SilentCheckV3().set_device('Ascend')

silu_op = SiLU().set_device('Ascend')

silu_grad_op = SiLUGrad().set_device('Ascend')

sin_op = Sin().set_device('Ascend')

sinc_op = Sinc().set_device('Ascend')

sinh_op = Sinh().set_device('Ascend')

slice_op = Slice().set_device('Ascend')

slice_ext_op = SliceExt().set_device('Ascend')

slice_ext_view_op = SliceExtView().set_device('Ascend')

softmax_backward_op = SoftmaxBackward().set_device('Ascend')

softplus_ext_op = SoftplusExt().set_device('Ascend')

softplus_grad_ext_op = SoftplusGradExt().set_device('Ascend')

sort_ext_op = SortExt().set_device('Ascend')

speed_fusion_attention_op = SpeedFusionAttention().set_device('Ascend')

speed_fusion_attention_grad_op = SpeedFusionAttentionGrad().set_device('Ascend')

split_tensor_op = SplitTensor().set_device('Ascend')

split_tensor_view_op = SplitTensorView().set_device('Ascend')

split_with_size_op = SplitWithSize().set_device('Ascend')

split_with_size_view_op = SplitWithSizeView().set_device('Ascend')

sqrt_op = Sqrt().set_device('Ascend')

square_op = Square().set_device('Ascend')

std_op = Std().set_device('Ascend')

std_mean_op = StdMean().set_device('Ascend')

sub_op = Sub().set_device('Ascend')

sub_ext_op = SubExt().set_device('Ascend')

sub_scalar_op = SubScalar().set_device('Ascend')

sum_ext_op = SumExt().set_device('Ascend')

swiglu_op = Swiglu().set_device('Ascend')

swiglu_grad_op = SwigluGrad().set_device('Ascend')

t_ext_op = TExt().set_device('Ascend')

take_op = Take().set_device('Ascend')

tan_op = Tan().set_device('Ascend')

tanh_op = Tanh().set_device('Ascend')

tanh_grad_op = TanhGrad().set_device('Ascend')

threshold_op = Threshold().set_device('Ascend')

threshold_grad_op = ThresholdGrad().set_device('Ascend')

topk_ext_op = TopkExt().set_device('Ascend')

trace_ext_op = TraceExt().set_device('Ascend')

transpose_op = Transpose().set_device('Ascend')

transpose_ext_view_op = TransposeExtView().set_device('Ascend')

transpose_view_op = TransposeView().set_device('Ascend')

triangular_solve_op = TriangularSolve().set_device('Ascend')

tril_ext_op = TrilExt().set_device('Ascend')

trunc_op = Trunc().set_device('Ascend')

uniform_ext_op = UniformExt().set_device('Ascend')

unique2_op = Unique2().set_device('Ascend')

unique_dim_op = UniqueDim().set_device('Ascend')

unstack_ext_view_op = UnstackExtView().set_device('Ascend')

upsample_bicubic2d_op = UpsampleBicubic2D().set_device('Ascend')

upsample_bicubic2d_grad_op = UpsampleBicubic2DGrad().set_device('Ascend')

upsample_bilinear2d_op = UpsampleBilinear2D().set_device('Ascend')

upsample_bilinear2d_grad_op = UpsampleBilinear2DGrad().set_device('Ascend')

upsample_linear1d_op = UpsampleLinear1D().set_device('Ascend')

upsample_linear1d_grad_op = UpsampleLinear1DGrad().set_device('Ascend')

upsample_nearest1d_op = UpsampleNearest1D().set_device('Ascend')

upsample_nearest1d_grad_op = UpsampleNearest1DGrad().set_device('Ascend')

upsample_nearest2d_op = UpsampleNearest2D().set_device('Ascend')

upsample_nearest2d_grad_op = UpsampleNearest2DGrad().set_device('Ascend')

upsample_nearest3d_op = UpsampleNearest3D().set_device('Ascend')

upsample_nearest3d_grad_op = UpsampleNearest3DGrad().set_device('Ascend')

var_op = Var().set_device('Ascend')

var_mean_op = VarMean().set_device('Ascend')

view_as_op = ViewAs().set_device('Ascend')

xlogy_op = Xlogy().set_device('Ascend')

xlogy_scalar_other_op = XLogYScalarOther().set_device('Ascend')

xlogy_scalar_self_op = XLogYScalarSelf().set_device('Ascend')

zeros_like_ext_op = ZerosLikeExt().set_device('Ascend')


+ 0
- 0
mindnlp/core/_op_prim/cpu/__init__.py View File


+ 3511
- 0
mindnlp/core/_op_prim/cpu/legacy.py
File diff suppressed because it is too large
View File


+ 0
- 0
mindnlp/core/_op_prim/gpu/__init__.py View File


+ 3511
- 0
mindnlp/core/_op_prim/gpu/legacy.py
File diff suppressed because it is too large
View File


+ 2
- 0
mindnlp/core/_prims/ascend/__init__.py View File

@@ -0,0 +1,2 @@
from . import aclop
from . import pyboost

+ 82
- 0
mindnlp/core/_prims/ascend/aclop.py View File

@@ -0,0 +1,82 @@
import re
import inspect
from mindspore import ops
from mindspore.ops._primitive_cache import _get_cache_prim
from mindspore.ops.operations._grad_ops import StridedSliceGrad

__all__ = []

def camel_to_snake_case(camel_case_str):
snake_case_str = re.sub(r'(?<!^)(?=[A-Z])', '_', camel_case_str).lower()
return snake_case_str

op_func_no_init = '''
def {name}(*args):
    op = _get_cache_prim(ops.{op})().set_device('Ascend')
    return op(*args)
'''

op_func_with_init = '''
def {name}(*args):
    op = _get_cache_prim(ops.{op})(*args[-{idx}:]).set_device('Ascend')
    return op(*args[:-{idx}])
'''


old_op_list = list(filter(lambda s: s[0].isupper(), dir(ops)))
for old_op_name in old_op_list:
if old_op_name in ['P', 'Print', 'Assert', 'Custom', 'CustomOpBuilder', 'DataType', 'ReduceOp', 'TBERegOp', 'Tensor']:
continue
# print(old_op_name)
ops_class = getattr(ops, old_op_name, None)
init_signature = inspect.signature(ops_class.__init__)
if len(init_signature.parameters) > 1:
name = camel_to_snake_case(old_op_name)
init_args = list(init_signature.parameters.keys())
init_args.pop(0)
exec(op_func_with_init.format(name=name, op=old_op_name, idx=len(init_args)), globals())

else:
name = camel_to_snake_case(old_op_name)
exec(op_func_no_init.format(name=name, op=old_op_name), globals())

__all__.append(name)
# print(old_op_name, init_signature.parameters, call_signature.parameters)
# print(old_op_name, len(init_signature.parameters), len(call_signature.parameters))
# break

# normal_op = ops.StandardNormal().set_device('CPU')
# def normal(size):
# return normal_op(size)

# __all__.append('normal')

dyn_shape_op = ops.TensorShape().set_device('Ascend')
def dyn_shape(self):
    return dyn_shape_op(self)

__all__.append('dyn_shape')

# def strided_slice(input, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask):
# strided_slice_op = ops.StridedSlice(begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask).set_device('CPU')
# return strided_slice_op(input, begin, end, strides)

# __all__.append('strided_slice')

# def broadcast_to(input, shape):
# broadcast_to_op = ops.BroadcastTo(shape).set_device('CPU')
# return broadcast_to_op(input)

# __all__.append('broadcast_to')

def strided_slice_grad(input, begin, end, strides, update, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0):
    strided_slice_grad = _get_cache_prim(StridedSliceGrad)(begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask).set_device('Ascend')
    return strided_slice_grad(update, input.shape, begin, end, strides)

__all__.append('strided_slice_grad')

# full_op = ops.FillV2().set_device('CPU')
# def full(shape, value):
# return full_op(shape, value)

# __all__.append('full')
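
The loop above builds one thin wrapper per legacy primitive by filling in these templates and exec-ing the result. Assuming the template pins the Ascend target, the expansion for ops.Concat (whose __init__ takes a single axis argument) looks roughly like this sketch:

def concat(*args):
    # args = (tensors, axis): the trailing init argument configures the cached
    # primitive, the remaining arguments are forwarded to the call.
    op = _get_cache_prim(ops.Concat)(*args[-1:]).set_device('Ascend')
    return op(*args[:-1])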

mindnlp/core/_prims/ascend.py → mindnlp/core/_prims/ascend/ascend.py View File


+ 26
- 0
mindnlp/core/_prims/ascend/pyboost.py View File

@@ -0,0 +1,26 @@
from mindspore.ops.auto_generate import gen_ops_prim
from mindspore.ops.auto_generate import pyboost_inner_prim
from mindspore._c_expression import _empty_instance

gen_ops_list = list(filter(lambda s: s.startswith("pyboost"), dir(gen_ops_prim)))
pyboost_inner_list = list(filter(lambda s: s.endswith("_impl"), dir(pyboost_inner_prim)))

__all__ = []

for pyboost_op_name in gen_ops_list:
op_name = pyboost_op_name.replace('pyboost_', '') + '_op'
func_name = op_name.replace('_op', '')
op_instance = getattr(gen_ops_prim, op_name, None)
if op_instance is not None:
__all__.append(func_name)
globals()[func_name] = getattr(gen_ops_prim, op_name).__class__().set_device('Ascend')

for op_name in pyboost_inner_list:
func_name = op_name.replace('_impl', '')
__all__.append(func_name)
globals()[func_name] = getattr(pyboost_inner_prim, op_name).__class__()

def empty(*args, **kwargs):
return _empty_instance(*args, **kwargs, device='Ascend')

__all__.append('empty')
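
The net effect of the loops above is that every pyboost primitive exported by gen_ops_prim and pyboost_inner_prim becomes a module-level callable bound to Ascend. A usage sketch, assuming AddExt is among the generated primitives:

import mindspore
from mindnlp.core._prims.ascend import pyboost as ascend_pyboost

x = mindspore.Tensor([1.0, 2.0])
y = mindspore.Tensor([3.0, 4.0])
# gen_ops_prim's pyboost_add_ext becomes add_ext() here, an Ascend-bound
# primitive instance called like a function: add_ext(input, other, alpha).
out = ascend_pyboost.add_ext(x, y, 1)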

+ 0
- 0
mindnlp/core/_prims/ascend_310b/__init__.py View File


mindnlp/core/_prims/ascend_310b.py → mindnlp/core/_prims/ascend_310b/ascend_310b.py View File


+ 0
- 211
mindnlp/core/_prims/cpu.py View File

@@ -1,211 +0,0 @@
import numbers
from mindspore.ops.auto_generate import gen_ops_prim
from mindspore.ops._primitive_cache import _get_cache_prim
from mindspore._c_expression import _empty_instance
from mindspore.ops.operations._grad_ops import StridedSliceGrad

import mindspore
from mindspore import ops

from mindnlp import core

__all__ = []
op_list = list(filter(lambda s: s.endswith("_op"), dir(gen_ops_prim)))

for op_name in op_list:
func_name = op_name.replace('_op', '')
__all__.append(func_name)
globals()[func_name] = getattr(gen_ops_prim, op_name).__class__().set_device('CPU')

def empty(*args, **kwargs):
return _empty_instance(*args, **kwargs, device='CPU')

normal_op = ops.StandardNormal().set_device('CPU')
def normal(*args, **kwargs):
return normal_op(*args, **kwargs)

__all__.append('normal')

full_op = ops.FillV2().set_device('CPU')
def full(*args):
return full_op(*args)

__all__.append('full')

range_op = ops.Range().set_device('CPU')
def arange(start, end, step, dtype):
return cast(range_op(start, end, step), dtype)

__all__.append('arange')


broadcast_to_op = ops.Primitive('BroadcastTo').set_device('CPU')
def broadcast_to(*args):
return broadcast_to_op(*args)

__all__.append('broadcast_to')

def concat(tensors, dim):
concat_op = ops.Concat(dim).set_device('CPU')
return concat_op(tensors)

__all__.append('concat')

zeros_op = ops.Zeros().set_device('CPU')
def zeros(*args):
return zeros_op(*args)

__all__.append('zeros')

ones_op = ops.Ones().set_device('CPU')
def ones(*args):
return ones_op(*args)

__all__.append('ones')

uniform_real_op = ops.UniformReal().set_device('CPU')
def uniform_real(*args):
return uniform_real_op(*args)

__all__.append('uniform_real')

def pad_v3(input_x, padding, mode='constant', value=None):
pad_op = ops.PadV3(mode=mode, paddings_contiguous=True).set_device('CPU')
if isinstance(value, (float, int)):
value = core.tensor(value, dtype=input_x.dtype)
return pad_op(input_x, padding, value)

__all__.append('pad_v3')

reduce_any_op = ops.ReduceAny().set_device('CPU')
reduce_any_keepdim_op = ops.ReduceAny(True).set_device('CPU')
def reduce_any(input, dim, keepdim):
if keepdim:
return reduce_any_keepdim_op(input, dim)
return reduce_any_op(input, dim)

__all__.append('reduce_any')

reduce_all_op = ops.ReduceAll().set_device('CPU')
reduce_all_keepdim_op = ops.ReduceAll(True).set_device('CPU')
def reduce_all(input, dim, keepdim):
if keepdim:
return reduce_all_keepdim_op(input, dim)
return reduce_all_op(input, dim)

__all__.append('reduce_all')

def isclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False):
is_close = _get_cache_prim(ops.IsClose)(rtol=rtol, atol=atol, equal_nan=equal_nan).set_device('CPU')
return is_close(input, other)

__all__.append('isclose')

tile_op = ops.Primitive('Tile').set_device('CPU')
def tile(*args):
return tile_op(*args)

__all__.append('tile')

def randint(low, high, shape, dtype, generator):
rand_op = ops.UniformInt().set_device('CPU')
output = rand_op(shape, mindspore.Tensor(low, mindspore.int32), mindspore.Tensor(high, mindspore.int32))
return cast(output, dtype)
# return mindspore.Tensor(np.random.randint(low, high, shape))

cast_op = ops.Cast().set_device('CPU')
def cast(input, dtype):
return cast_op(input, dtype)

__all__.append('cast')

def tril_ext(input, diagonal):
tril_op = ops.Tril(diagonal).set_device('CPU')
return tril_op(input)

def strided_slice(input, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask):
strided_slice_op = _get_cache_prim(ops.StridedSlice)(begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask).set_device('CPU')
return strided_slice_op(input, begin, end, strides)

__all__.append('strided_slice')

def strided_slice_grad(input, begin, end, strides, update, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0):
strided_slice_grad = _get_cache_prim(StridedSliceGrad)(begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask).set_device('CPU')
return strided_slice_grad(update, input.shape, begin, end, strides)

__all__.append('strided_slice_grad')

def squeeze(input, dim):
squeeze_op = ops.Squeeze(dim).set_device('CPU')
return squeeze_op(input)

__all__.append('squeeze')

def sort_ext(input, dim, descending, stable):
sort_op = ops.Sort(dim, descending).set_device('CPU')
return sort_op(input)

__all__.append('sort_ext')

def stack(tensors, dim):
stack_op = ops.Stack(dim).set_device('CPU')
return stack_op(tensors)

__all__.append('stack')

def gather(input_params, input_indices, axis, batch_dims=0):
gather_op = _get_cache_prim(ops.Gather)(batch_dims).set_device('CPU')
return gather_op(input_params, input_indices, axis)

__all__.append('gather')

def softmax(input, dim):
softmax_op = ops.Softmax(dim).set_device('CPU')
return softmax_op(input)

__all__.append('softmax')

def topk(input, k, sorted=True):
topk_op = ops.TopK(sorted).set_device('CPU')
return topk_op(input, k)

__all__.append('topk')

dyn_shape_op = ops.TensorShape().set_device('CPU')
def dyn_shape(self):
return dyn_shape_op(self)

__all__.append('dyn_shape')

bitwise_and_op = ops.BitwiseAnd().set_device('CPU')
def bitwise_and_scalar(input, other):
return bitwise_and_op(input, other)

bitwise_right_shift_op = ops.RightShift().set_device('CPU')
def bitwise_right_shift(input, other):
if isinstance(input, numbers.Number):
if not isinstance(input, int):
raise TypeError(f"For 'bitwise_left_shift', 'input' must be an integer, but got input:{type(input)}.")
input = cast(input, other.dtype)
elif isinstance(other, numbers.Number):
if not isinstance(other, int):
raise TypeError(f"For 'bitwise_left_shift', 'other' must be an integer, but got other:{type(other)}.")
other = cast(other, input.dtype)
return bitwise_right_shift_op(input, other)

__all__.append('bitwise_right_shift')

embedding_op = ops.Gather().set_device('CPU')
def embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq):
return embedding_op(weight, input, 0)

__all__.append('embedding')


def randn(size, seed, offset, dtype):
rand_op = ops.StandardNormal()
output = rand_op(size)
return output

__all__.append('randn')


+ 2
- 0
mindnlp/core/_prims/cpu/__init__.py View File

@@ -0,0 +1,2 @@
from . import ms
from . import numpy

+ 155
- 0
mindnlp/core/_prims/cpu/ms.py View File

@@ -0,0 +1,155 @@
import re
import inspect
import ctypes
import numpy as np
from mindnlp import core
from mindspore import ops
from mindspore.ops.auto_generate import gen_ops_prim
from mindspore._c_expression import _empty_instance
from mindspore.ops._primitive_cache import _get_cache_prim
from mindspore.ops.operations._grad_ops import StridedSliceGrad

gen_ops_list = list(filter(lambda s: s.startswith("pyboost"), dir(gen_ops_prim)))

__all__ = []

def camel_to_snake_case(camel_case_str):
snake_case_str = re.sub(r'(?<!^)(?=[A-Z])', '_', camel_case_str).lower()
return snake_case_str

op_func_no_init = '''
def {name}(*args):
op = _get_cache_prim(ops.{op})().set_device('CPU')
return op(*args)
'''

op_func_with_init = '''
def {name}(*args):
op = _get_cache_prim(ops.{op})(*args[-{idx}:]).set_device('CPU')
return op(*args[:-{idx}])
'''


for pyboost_op_name in gen_ops_list:
op_name = pyboost_op_name.replace('pyboost_', '') + '_op'
func_name = op_name.replace('_op', '')
op_instance = getattr(gen_ops_prim, op_name, None)
if op_instance is not None:
__all__.append(func_name)
globals()[func_name] = getattr(gen_ops_prim, op_name).__class__().set_device('CPU')

def empty(*args, **kwargs):
return _empty_instance(*args, **kwargs, device='CPU')

__all__.append('empty')

old_op_list = list(filter(lambda s: s[0].isupper(), dir(ops)))
for old_op_name in old_op_list:
if old_op_name in ['P', 'Print', 'Assert', 'Custom', 'CustomOpBuilder', 'DataType', 'ReduceOp', 'TBERegOp', 'Tensor']:
continue
# print(old_op_name)
ops_class = getattr(ops, old_op_name, None)
init_signature = inspect.signature(ops_class.__init__)
if len(init_signature.parameters) > 1:
name = camel_to_snake_case(old_op_name)
init_args = list(init_signature.parameters.keys())
init_args.pop(0)
exec(op_func_with_init.format(name=name, op=old_op_name, idx=len(init_args)), globals())

else:
name = camel_to_snake_case(old_op_name)
exec(op_func_no_init.format(name=name, op=old_op_name), globals())

__all__.append(name)
# print(old_op_name, init_signature.parameters, call_signature.parameters)
# print(old_op_name, len(init_signature.parameters), len(call_signature.parameters))
# break

# normal_op = ops.StandardNormal().set_device('CPU')
# def normal(size):
# return normal_op(size)

# __all__.append('normal')
dyn_shape_op = ops.TensorShape().set_device('CPU')
def dyn_shape(self):
return dyn_shape_op(self)

__all__.append('dyn_shape')

# def strided_slice(input, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask):
# strided_slice_op = ops.StridedSlice(begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask).set_device('CPU')
# return strided_slice_op(input, begin, end, strides)

# __all__.append('strided_slice')

# def broadcast_to(input, shape):
# broadcast_to_op = ops.BroadcastTo(shape).set_device('CPU')
# return broadcast_to_op(input)

# __all__.append('broadcast_to')

def strided_slice_grad(input, begin, end, strides, update, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0):
strided_slice_grad = _get_cache_prim(StridedSliceGrad)(begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask).set_device('CPU')
return strided_slice_grad(update, input.shape, begin, end, strides)

__all__.append('strided_slice_grad')

# full_op = ops.FillV2().set_device('CPU')
# def full(shape, value):
# return full_op(shape, value)

# __all__.append('full')

def numpy_to_tensor_overwrite(np_array, torch_tensor):
if not np_array.flags.c_contiguous:
np_array = np.ascontiguousarray(np_array)

tensor_ptr = torch_tensor.data_ptr()
ctypes.memmove(tensor_ptr, np_array.ctypes.data, torch_tensor.nbytes)
return torch_tensor

def inplace_uniform(input, from_, to_, seed, offset):
np.random.seed(seed.item())
out = np.random.uniform(from_, to_, input.shape).astype(core.dtype2np[input.dtype])
numpy_to_tensor_overwrite(out, input)
return input

__all__.append('inplace_uniform')

def inplace_normal(input, mean, std, seed, offset):
np.random.seed(seed.item())
out = np.random.normal(mean, std, input.shape).astype(core.dtype2np[input.dtype])
numpy_to_tensor_overwrite(out, input)

return input

__all__.append('inplace_normal')

# class GetItem(core.autograd.Function):
# @staticmethod
# def forward(ctx, input, slice):
# if isinstance(slice, tuple):
# new_slice = ()
# for s in slice:
# if isinstance(s, core.Tensor):
# s = s.numpy()
# new_slice += (s,)
# else:
# new_slice = slice
# out = input.asnumpy()[new_slice]

# ctx.save_for_backward(input)
# ctx.slice = slice
# if not isinstance(out, np.ndarray):
# out = np.array(out)
# return core.Tensor.from_numpy(out)

# @staticmethod
# def backward(ctx, grad_output):
# input, = ctx.saved_tensors
# slice = ctx.slice
# grad_input = core.zeros_like(input)
# grad_input[slice] = grad_output
# return grad_input
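
The in-place fills above generate values with NumPy and then memmove the raw bytes into the tensor's existing buffer. The same trick is shown below between two NumPy buffers so the sketch stays self-contained (shapes and dtypes must match, and the source must be C-contiguous):

import ctypes
import numpy as np

dst = np.zeros((2, 3), dtype=np.float32)           # stands in for the tensor's storage
src = np.random.uniform(0.0, 1.0, dst.shape).astype(dst.dtype)
src = np.ascontiguousarray(src)                    # memmove copies contiguous bytes

ctypes.memmove(dst.ctypes.data, src.ctypes.data, dst.nbytes)
# dst now holds the uniform samples without its buffer being reallocated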

mindnlp/core/_prims/numpy.py → mindnlp/core/_prims/cpu/numpy.py View File


+ 38
- 18
mindnlp/core/_tensor.py View File

@@ -24,7 +24,12 @@ from ._bind import get_device_in_context, device_, get_default_dtype
from ._utils import _rebuild_tensor_v2
from ._C.size import Size
from .configs import DEVICE_TARGET, CPU_USE_NUMPY_OP
from .dispatcher import device_map
device_map = {
'cpu': 'CPU',
'npu': 'Ascend',
'cuda': 'GPU'
}
if DEVICE_TARGET == 'Ascend':
import acl
@@ -134,6 +139,8 @@ def tensor(data, *, dtype=None, device=None, requires_grad=False):
if isinstance(data, float) and data == float('-inf'):
data = core.finfo(get_default_dtype()).min
elif isinstance(data, list) and float('-inf') in data:
data = [core.finfo(get_default_dtype()).min if d == float('-inf') else d for d in data]
if dtype is not None:
tensor = Tensor(data, dtype=dtype)
@@ -145,7 +152,8 @@ def tensor(data, *, dtype=None, device=None, requires_grad=False):
device.type = 'npu'
if device.type not in ['meta', 'cpu']:
tensor = tensor.to(device)
tensor.requires_grad_(requires_grad)
if requires_grad:
tensor.requires_grad_(requires_grad)
return tensor
def scalar_tensor(*args, **kwargs):
@@ -203,7 +211,7 @@ class TensorPlaceHolder:
return self.shape[0]
def __repr__(self) -> str:
# self.data_sync(True)
self.data_sync(True)
return Tensor_.__repr__(self)[:-1] + f', device={self.device})'
def __format__(self, format_spec):
@@ -226,16 +234,19 @@ class TensorPlaceHolder:
if isinstance(s, range):
s = list(s)
if isinstance(s, np.ndarray):
s = tensor(s)
s = tensor(s, device=self.device)
new_slices += (s,)
slices = new_slices
if self.device.type == 'npu':
out = ops.tensor_getitem(self, slices)
elif self.device.type == 'meta':
out = ops.getitem_np(self, slices)
else:
if CPU_USE_NUMPY_OP:
out = ops.getitem_np(self, slices)
else:
out = ops.getitem(self, slices)
# if CPU_USE_NUMPY_OP:
# out = ops.getitem_np(self, slices)
# else:
# out = ops.getitem(self, slices)
out = ops.tensor_getitem(self, slices)
out._device = self.device
return out
@@ -264,7 +275,11 @@ class TensorPlaceHolder:
if self.device.type == 'meta':
return self
elif self.device.type == 'npu':
if value.device != self.device:
value._device = self.device
if self.device.type == 'npu':
if value.device != self.device:
value._device = self.device
out = ops.tensor_setitem(self, slices, value)
@@ -913,6 +928,7 @@ class TensorPlaceHolder:
def data(self):
out = Tensor(self)
out._device = self.device
out._base = self
return out
@data.setter
@@ -920,13 +936,15 @@ class TensorPlaceHolder:
if isinstance(self, StubTensor) and isinstance(new_value, StubTensor):
self.stub = new_value.stub
else:
if self.device.type == 'cpu' and new_value.device.type == 'cpu' \
and self.shape == new_value.shape and self.dtype == new_value.dtype:
src_ct = ctypes.c_void_p(new_value.data_ptr())
dst_ct = ctypes.c_void_p(self.data_ptr())
ctypes.memmove(dst_ct, src_ct, self.nbytes)
else:
self.assign_value(new_value)
# if self.device.type == 'cpu' and new_value.device.type == 'cpu' \
# and self.shape == new_value.shape and self.dtype == new_value.dtype:
# src_ct = ctypes.c_void_p(new_value.data_ptr())
# dst_ct = ctypes.c_void_p(self.data_ptr())
# ctypes.memmove(dst_ct, src_ct, self.nbytes)
# else:
if getattr(self, '_base', None) is not None:
self._base.assign_value(new_value)
self.assign_value(new_value)
self._device = new_value.device
# Tensor.data_ptr
@@ -970,7 +988,8 @@ class TensorPlaceHolder:
# Tensor.diagonal_scatter
# Tensor.fill_diagonal_
def fill_diagonal_(self, value, wrap=False):
return ops.inplace_fill_diagonal(self, value, wrap)
# Tensor.fmax
@@ -1092,7 +1111,8 @@ class TensorPlaceHolder:
return self.expand(other.size())
# Tensor.exponential_
def exponential_(self, lambd=1, *, generator=None):
return ops.inplace_exponential(self, lambd, generator)
# Tensor.fix


+ 13
- 3
mindnlp/core/configs.py View File

@@ -8,11 +8,12 @@ SUPPORT_BF16 = DEVICE_TARGET == 'Ascend' and SOC not in ['ascend910', 'ascend310
ON_A1 = SOC == 'ascend910'
ON_A2 = SOC in ['ascend910b', 'ascend910_93']
ON_ORANGE_PI = '310b' in SOC
USE_PYBOOST = DEVICE_TARGET == 'Ascend'
DEFAULT_DTYPE = mindspore.float32
MS27 = '.'.join(mindspore.__version__.split('.')[:2]) >= '2.7'

CPU_USE_NUMPY_OP = DEVICE_TARGET != 'CPU'
# OP backend select
USE_PYBOOST = True
CPU_USE_NUMPY_OP = False

def set_pyboost(mode: bool):
"""set global pyboost"""
@@ -21,4 +22,13 @@ def set_pyboost(mode: bool):

def use_pyboost():
"""set global pyboost"""
return USE_PYBOOST
return USE_PYBOOST

def set_cpu_use_numpy(mode: bool):
"""set global pyboost"""
global CPU_USE_NUMPY_OP
CPU_USE_NUMPY_OP = mode

def cpu_use_numpy():
"""set global pyboost"""
return CPU_USE_NUMPY_OP
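
For context, a minimal usage sketch of the two toggles added above (the import path follows this file's location in the tree; the call sites are illustrative, not taken from this PR):

from mindnlp.core import configs

configs.set_pyboost(False)         # flips the flag returned by use_pyboost()
configs.set_cpu_use_numpy(True)    # flips the flag returned by cpu_use_numpy()
assert configs.use_pyboost() is False
assert configs.cpu_use_numpy() is True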

+ 0
- 0
mindnlp/core/cpu/__init__.py View File


+ 10
- 1
mindnlp/core/cuda/__init__.py View File

@@ -6,7 +6,9 @@ from mindspore.runtime import memory_reserved as ms_memory_reserved, \
memory_allocated as ms_memory_allocated, StreamCtx as StreamContext, Stream, empty_cache, \
reset_peak_memory_stats, reset_max_memory_allocated, max_memory_allocated, synchronize, \
current_stream
from mindspore.device_context.gpu import device_count
from mindspore.device_context.gpu import device_count as ms_device_count
from mindspore.hal import get_device_properties
from mindspore.communication import GlobalComm, get_group_size
from mindnlp import core
@@ -14,6 +16,13 @@ FloatTensor = core.FloatTensor
HalfTensor = core.FloatTensor
BFloat16Tensor = core.BFloat16Tensor
def device_count():
if not is_available():
return 0
if GlobalComm.INITED:
return get_group_size()
return 1
def manual_seed_all(seed: int):
manual_seed(seed)


+ 11
- 123
mindnlp/core/dispatcher.py View File

@@ -1,99 +1,8 @@
from mindnlp import core
from ._prims import ascend, cpu, numpy, meta, ascend_310b
from .configs import DEVICE_TARGET, CPU_USE_NUMPY_OP, SOC
from ._apis import npu, cpu, gpu, meta
from .configs import DEVICE_TARGET, SOC
from ._bind import is_autocast_enabled

device_map = {"cpu": "CPU", "npu": "Ascend", "cuda": "GPU"}

"""
__matmul__, addbmm, addmm, addmv, addr, baddbmm, bmm, chain_matmul, multi_dot,
conv1d, conv2d, conv3d, conv_transpose1d, conv_transpose2d, conv_transpose3d, GRUCell,
linear, LSTMCell, matmul, mm, mv, prelu, RNNCell
"""
AMP_AUTO_WHITE_LIST = [
"dense",
"matmul",
"addbmm",
"addmm",
"addmv",
"addr",
"baddbmm",
"bmm",
"chain_matmul",
"multi_dot",
"conv1d",
"conv2d",
"conv3d",
"conv_transpose1d",
"conv_transpose2d",
"conv_transpose3d",
"mm",
"mv",
"prelu",
]


"""
__pow__, __rdiv__, __rpow__, __rtruediv__, acos, asin, binary_cross_entropy_with_logits,
cosh, cosine_embedding_loss, cdist, cosine_similarity, cross_entropy,
cumprod, cumsum, dist, erfinv, exp, expm1, group_norm, hinge_embedding_loss,
kl_div, l1_loss, layer_norm, log, log_softmax, log10, log1p, log2, margin_ranking_loss, mse_loss,
multilabel_margin_loss, multi_margin_loss, nll_loss, norm, normalize, pdist, poisson_nll_loss,
pow, prod, reciprocal, rsqrt, sinh, smooth_l1_loss, soft_margin_loss, softmax, softmin, softplus,
sum, renorm, tan, triplet_margin_loss
"""

AMP_AUTO_BLACK_LIST = [
'acos',
'asin',
'binary_cross_entropy_with_logits',
'cosh',
'cosine_embedding_loss',
'cdist',
'cosine_similarity',
'cross_entropy',
'cumprod',
'cumsum',
'dist',
'erfinv',
'exp',
'expm1',
'group_norm',
'hinge_embedding_loss',
'kl_div',
'l1_loss',
'layer_norm',
'log',
'log_softmax',
'log10',
'log1p',
'log2',
'margin_ranking_loss',
'mse_loss',
'multilabel_margin_loss',
'multi_margin_loss',
'nll_loss',
'norm',
'normalize',
'pdist',
'poisson_nll_loss',
'pow',
'prod',
'reciprocal',
'rsqrt',
'sinh',
'smooth_l1_loss',
'soft_margin_loss',
'softmax',
'softmin',
'softplus',
'sum',
'renorm',
'tan',
'triplet_margin_loss',
]


class SingletonMeta(type):
_instances = {}

@@ -104,13 +13,14 @@ class SingletonMeta(type):
return cls._instances[cls]


class Dispatcher(metaclass=SingletonMeta):
def __init__(self):
self._registry = {"cpu": {}, "npu": {}, "gpu": {}, "numpy": {}, "meta": {}}
def register(self, func_name, device, func):
self._registry[device][func_name] = func
api_map = {
'cpu': cpu,
'npu': npu,
'meta': meta,
'cuda': gpu
}

class Dispatcher(metaclass=SingletonMeta):
def dispatch(self, func_name, *args, **kwargs):
device = kwargs.pop("device", None)
if isinstance(device, str):
@@ -138,16 +48,8 @@ class Dispatcher(metaclass=SingletonMeta):

device_type = device.type

if CPU_USE_NUMPY_OP and device_type == "cpu":
device_type = "numpy"

# if is_autocast_enabled(device_type):
# if func_name in AMP_AUTO_WHITE_LIST or func_name.replace('_ext', '') in AMP_AUTO_WHITE_LIST:
# func_name = func_name + "_fp16"

# elif func_name in AMP_AUTO_BLACK_LIST or func_name.replace('_ext', '') in AMP_AUTO_BLACK_LIST:
# func_name = func_name + "_fp32"
func = self._registry[device_type].get(func_name, None)
# func = self._registry[device_type].get(func_name, None)
func = getattr(api_map[device_type], func_name, None)
if func is None:
raise RuntimeError(
f"No implementation for function: {func_name} on {device_type}."
@@ -156,18 +58,4 @@ class Dispatcher(metaclass=SingletonMeta):


dispatcher = Dispatcher()
if SOC == "ascend310b":
for func_name in ascend_310b.__all__:
dispatcher.register(func_name, "npu", getattr(ascend_310b, func_name))
else:
for func_name in ascend.__all__:
dispatcher.register(func_name, "npu", getattr(ascend, func_name))

for func_name in cpu.__all__:
dispatcher.register(func_name, "cpu", getattr(cpu, func_name))

for func_name in numpy.__all__:
dispatcher.register(func_name, "numpy", getattr(numpy, func_name))

for func_name in meta.__all__:
dispatcher.register(func_name, "meta", getattr(meta, func_name))

+ 0
- 3
mindnlp/core/executor.py View File

@@ -2,9 +2,6 @@ from mindnlp import core
from .dispatcher import dispatcher

def execute(func_name, *args, **kwargs):
requires_grad = kwargs.pop('requires_grad', False)
user_created = kwargs.pop('user_created', False)

out, device = dispatcher.dispatch(func_name, *args, **kwargs)
if not isinstance(out, (tuple, list)):
out._device = device


+ 79
- 96
mindnlp/core/nn/functional.py View File

@@ -6,18 +6,15 @@ from typing import Optional, Tuple, List
from mindnlp import core
from mindnlp.core.executor import execute
from mindnlp.core._C import default_generator
from mindnlp.core.nn.modules.utils import _pair
from ..configs import ON_ORANGE_PI, use_pyboost, ON_A1, ON_A2
from ..configs import ON_A2, ON_A1
generator_step_ = 12
def gelu(input, *, approximate='none'):
if input.device.type == 'npu':
return execute('gelu_ext', input, approximate)
if approximate == 'tanh':
return execute('gelu', input)
return input * 0.5 * (1.0 + core.erf(input / core.sqrt(2.0)))
return execute('gelu', input, approximate)
def relu(input, inplace=False):
if inplace:
@@ -53,13 +50,13 @@ def glu(input, dim=-1):
return execute('glu', input, dim)
def softplus(input, beta=1, threshold=20):
return execute('softplus_ext', input, beta, threshold)
return execute('softplus', input, beta, threshold)
def logsigmoid(input):
return execute('logsigmoid', input)[0]
def leaky_relu(input, alpha=0.2):
return execute('leaky_relu_ext', input, alpha)
return execute('leaky_relu', input, alpha)
def prelu(input, weight):
return execute('prelu', input, weight)
@@ -114,11 +111,6 @@ def avg_pool2d(input, kernel_size, stride=None, padding=0, ceil_mode=False, coun
return execute('avg_pool2d', input, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
def avg_pool3d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True, divisor_override=None):
if use_pyboost() and has_avg_pool3d:
return mint.nn.functional.avg_pool3d(input, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
if divisor_override is None:
divisor_override = 0
return ops.avg_pool3d(input, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
@@ -126,12 +118,15 @@ def adaptive_avg_pool1d(input, output_size):
return execute('adaptive_avg_pool1d', input, output_size)
def adaptive_avg_pool2d(input, output_size):
return execute('adaptive_avg_pool2d_ext', input, output_size)
return execute('adaptive_avg_pool2d', input, output_size)
def dropout(input, p=0.5, training=True, inplace=False):
if not training or p==0:
return input
out, _ = execute('dropout_ext', input, p)
seed, offset = default_generator._step(generator_step_)
seed._device = input.device
offset._device = input.device
out, _ = execute('dropout', input, p, seed, offset)
if inplace:
input.copy_(out)
return input
@@ -143,21 +138,7 @@ def dropout2d(input, p=0.5, training=False):
out, _ = execute('dropout2d', input, p)
return out
def drop_and_mask(keep_prob, seed=None):
seed0, seed1 = _get_seed(seed, "dropout")
dropout_op = ops.Dropout(keep_prob=keep_prob, Seed0=seed0, Seed1=seed1)
dropout_op = _set_prim_op_user_data(dropout_op, "random_cache", False)
out, mask = dropout_op(input)
return out, mask
def linear(input, weight, bias=None):
if ON_ORANGE_PI:
input = input.to(core.float16)
weight = weight.to(core.float16)
if bias is not None:
bias = bias.to(core.float16)
return execute('dense', input, weight) + bias
return execute('dense', input, weight)
return execute('dense', input, weight, bias)
def binary_cross_entropy_with_logits(input, target, weight=None, reduction='mean', pos_weight=None):
@@ -165,31 +146,36 @@ def binary_cross_entropy_with_logits(input, target, weight=None, reduction='mean
target = target.unsqueeze(1).expand_as(input).to(input.dtype)
return execute('binary_cross_entropy_with_logits', input, target, weight, pos_weight, reduction)
return ops.binary_cross_entropy_with_logits(input, target.astype(input.dtype), weight, pos_weight, reduction)
def gumbel_softmax(logits: core.Tensor, tau: float = 1, hard: bool = False, eps: float = 1e-10, dim: int = -1) -> core.Tensor:
if eps != 1e-10:
warnings.warn("`eps` parameter is deprecated and has no effect.")
uniform_samples = _get_cache_prim(ops.UniformReal)()(logits.shape)
gumbels = -ops.log(-ops.log(uniform_samples + eps) + eps) # ~Gumbel(0, 1)
if eps != 1e-10:
warnings.warn("`eps` parameter is deprecated and has no effect.")
gumbels = (
-core.empty_like(logits, memory_format=core.legacy_contiguous_format)
.exponential_()
.log()
) # ~Gumbel(0,1)
gumbels = (logits + gumbels) / tau # ~Gumbel(logits,tau)
y_soft = softmax(gumbels, dim)
y_soft = gumbels.softmax(dim)
if hard:
# Straight through.
index = y_soft.argmax(dim)
y_hard = one_hot(index, logits.shape[dim])
ret = ops.stop_gradient(y_hard - y_soft) + y_soft
index = y_soft.max(dim, keepdim=True)[1]
y_hard = core.zeros_like(
logits, memory_format=core.legacy_contiguous_format
).scatter_(dim, index, 1.0)
ret = y_hard - y_soft.detach() + y_soft
else:
# Reparametrization trick.
ret = y_soft
return ret
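
The noise above relies on the identity that if E ~ Exponential(1) then -log(E) ~ Gumbel(0, 1), i.e. the same distribution as the old -log(-log(U)) construction with U ~ Uniform(0, 1). A quick numpy self-check (illustrative, not part of the PR):

import numpy as np

rng = np.random.default_rng(0)
u = rng.uniform(size=100_000)                        # U ~ Uniform(0, 1)
g_classic = -np.log(-np.log(u))                      # textbook Gumbel(0, 1) sample
g_via_exp = -np.log(rng.exponential(size=100_000))   # -log(Exp(1)), the form used above
print(g_classic.mean(), g_via_exp.mean())            # both approach ~0.5772 (Euler-Mascheroni)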
def log_softmax(input, dim=None, dtype=None):
if input.device.type == 'cpu':
return execute('log_softmax', input, dim)
return execute('log_softmax_ext', input, dim, dtype)
return execute('log_softmax', input, dim, dtype)
def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False):
return execute('embedding', input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq)
@@ -309,6 +295,12 @@ def pad(input, pad, mode='constant', value=None):
if mode == "replicate":
mode = "edge"
return execute('pad_v3', input, new_pad, mode)
if input.dtype.is_floating_point:
value = float(value)
elif input.dtype == core.bool:
value = bool(value)
elif input.dtype in [core.int32, core.int64]:
value = int(value)
return execute('pad_v3', input, new_pad, mode, value)
out = input
if (isinstance(pad, tuple) and not pad):
@@ -332,9 +324,9 @@ def pad(input, pad, mode='constant', value=None):
return out
def nll_loss(input, target, weight=None, ignore_index=-100, reduction='mean'):
if input.device.type == 'npu':
return _nllloss_nd(input, target, weight, ignore_index, reduction)
return _inner_nll_loss(input, target, weight, ignore_index, reduction)
# if input.device.type == 'npu':
return _nllloss_nd(input, target, weight, ignore_index, reduction)
# return _inner_nll_loss(input, target, weight, ignore_index, reduction)
def _inner_nll_loss(inputs, target, weight=None, ignore_index=-100, reduction='mean', label_smoothing=0.0):
ndim = inputs.ndim
@@ -362,29 +354,29 @@ def _nll_loss(inputs, target, target_dim=-1, weight=None, ignore_index=None, red
if target.ndim == inputs.ndim - 1:
target = target.expand_dims(target_dim)
if ignore_index is not None:
non_pad_mask = ops.equal(target, ignore_index)
target = target.masked_fill(non_pad_mask, ops.cast(0, target.dtype))
non_pad_mask = core.eq(target, ignore_index)
target = target.masked_fill(non_pad_mask, core.cast(0, target.dtype))
else:
non_pad_mask = target
if weight is not None:
loss_weights = ops.gather(weight, target, 0)
loss_weights = core.gather(weight, target, 0)
orig_shape = inputs.shape
if inputs.ndim != 2:
inputs = inputs.view(orig_shape[:2] + (-1,))
weight = weight.view(weight.shape + (1,))
weighted_inputs = inputs * weight
weighted_inputs = weighted_inputs.view(orig_shape)
loss = ops.neg(ops.gather_d(weighted_inputs, target_dim, target))
smooth_loss = ops.neg(weighted_inputs.sum(axis=target_dim, keepdims=True))
loss = core.neg(core.gather_d(weighted_inputs, target_dim, target))
smooth_loss = core.neg(weighted_inputs.sum(axis=target_dim, keepdims=True))
else:
loss = ops.neg(ops.gather_d(inputs, target_dim, target))
smooth_loss = ops.neg(inputs.sum(axis=target_dim, keepdims=True))
loss_weights = ops.ones_like(loss)
loss = core.neg(core.gather_d(inputs, target_dim, target))
smooth_loss = core.neg(inputs.sum(axis=target_dim, keepdims=True))
loss_weights = core.ones_like(loss)
if ignore_index is not None:
loss = loss.masked_fill(non_pad_mask, ops.cast(0, loss.dtype))
loss_weights = loss_weights.masked_fill(non_pad_mask, ops.cast(0, loss_weights.dtype))
smooth_loss = smooth_loss.masked_fill(non_pad_mask, ops.cast(0, smooth_loss.dtype))
loss = loss.masked_fill(non_pad_mask, core.cast(0, loss.dtype))
loss_weights = loss_weights.masked_fill(non_pad_mask, core.cast(0, loss_weights.dtype))
smooth_loss = smooth_loss.masked_fill(non_pad_mask, core.cast(0, smooth_loss.dtype))
loss = loss.squeeze(target_dim)
smooth_loss = smooth_loss.squeeze(target_dim)
@@ -493,7 +485,7 @@ def _cross_entropy_for_class_indices(input, target, weight, ingore_index, reduct
else:
smooth_loss = -input.sum(class_dim)
ignore_mask = core.eq(target, ingore_index)
smooth_loss = core.masked_fill(smooth_loss, ignore_mask, 0)
smooth_loss = core.masked_fill(smooth_loss, ignore_mask, 0.)
if reduction == "mean":
true_mask = ~ignore_mask
if weight is not None:
@@ -519,10 +511,10 @@ def _cross_entropy_for_class_indices(input, target, weight, ingore_index, reduct
def mse_loss(input, target, reduction='mean'):
return execute('mse_loss_ext', input, target, reduction)
return execute('mse_loss', input, target, reduction)
def l1_loss(input, target, reduction='mean'):
return execute('l1_loss_ext', input, target, reduction)
return execute('l1_loss', input, target, reduction)
def smooth_l1_loss(input, target, beta=1.0, reduction='none'):
input = input.to(core.float32)
@@ -534,10 +526,6 @@ def kl_div(logits, labels, reduction='mean', log_target=False):
labels = ops.log(labels)
return ops.kl_div(logits, labels, reduction)
def manual_softmax(x, dim=-1):
exp_x = ops.exp(x - ops.max(x, axis=dim, keepdims=True)[0])
return exp_x / ops.sum(exp_x, dim=dim, keepdim=True)
def softmax(input, dim=-1, *, dtype=None):
if dtype is not None:
input = input.to(dtype)
@@ -549,7 +537,7 @@ def layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-5):
weight = core.ones(normalized_shape, dtype=input.dtype, device=input.device)
if bias is None:
bias = core.zeros(normalized_shape, dtype=input.dtype, device=input.device)
return execute('layer_norm_ext', input, normalized_shape, weight, bias, eps)[0]
return execute('layer_norm', input, normalized_shape, weight, bias, eps)[0]
def interpolate(input, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False):
@@ -749,7 +737,7 @@ def batch_norm(input, running_mean, running_var, weight=None, bias=None, trainin
bias = core.zeros(input.shape[1], dtype=input.dtype, device=input.device)
return execute(
'batch_norm_ext',
'batch_norm',
input,
running_mean,
running_var,
@@ -763,17 +751,17 @@ def batch_norm(input, running_mean, running_var, weight=None, bias=None, trainin
def conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
if isinstance(padding, str):
return execute('conv1d_padding', input, weight, bias, stride, padding, dilation, groups)
return execute('conv1d_ext', input, weight, bias, stride, padding, dilation, groups)
return execute('conv1d', input, weight, bias, stride, padding, dilation, groups)
def conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
if isinstance(padding, str):
return execute('conv2d_padding', input, weight, bias, stride, padding, dilation, groups)
return execute('conv2d_ext', input, weight, bias, stride, padding, dilation, groups)
return execute('conv2d', input, weight, bias, stride, padding, dilation, groups)
def conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
if isinstance(padding, str):
return execute('conv3d_padding', input, weight, bias, stride, padding, dilation, groups)
return execute('conv3d_ext', input, weight, bias, stride, padding, dilation, groups)
return execute('conv3d', input, weight, bias, stride, padding, dilation, groups)
pad_mode = 'pad'
pad = padding
@@ -836,8 +824,7 @@ def _deconv_output_length(pad_mode, filter_size, stride_size, dilation_size, pad
return length
def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
if use_pyboost():
return execute('conv_transpose2d', input, weight, bias, stride, padding, output_padding, groups, dilation)
return execute('conv_transpose2d', input, weight, bias, stride, padding, output_padding, groups, dilation)
# pad_mode = 'pad'
# pad = padding
@@ -927,7 +914,7 @@ def max_pool2d(input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode
input_ndim = input.ndim
if input_ndim == 3:
input = input.unsqueeze(1)
out = execute('max_pool2d', input, kernel_size, stride, padding, dilation, ceil_mode=ceil_mode, return_indices=return_indices)
out = execute('max_pool2d', input, kernel_size, stride, padding, dilation, ceil_mode, return_indices)
if input_ndim == 3:
out = out.squeeze(1)
return out
@@ -959,28 +946,26 @@ def group_norm(input, num_groups, weight=None, bias=None, eps=1e-5):
weight = core.ones([input.shape[1]], dtype=input.dtype, device=input.device)
if bias is None:
bias = core.zeros([input.shape[1]], dtype=input.dtype, device=input.device)
return execute('group_norm', input, num_groups, weight, bias, eps)[0]
# input_shape = input.shape
# N = input_shape[0]
# C = input_shape[1]
# input_reshaped = input.view(1, N * num_groups, -1 if N!=0 else 1)
# outputs = batch_norm(input_reshaped, None, None, None, None, True, 0., eps)
# out = outputs.view(input_shape)
# affine_param_shape = [1] * input.ndim
# affine_param_shape[1] = C
# affine_param_shape = tuple(affine_param_shape)
# if weight is not None and bias is not None:
# if not ON_ORANGE_PI:
# out = bias.view(affine_param_shape).addcmul(out, weight.view(affine_param_shape), 1)
# else:
# out = core.addcmul(bias.view(affine_param_shape), out, weight.view(affine_param_shape), value=1)
# elif weight is not None:
# out = out.mul(weight.view(affine_param_shape))
# elif bias is not None:
# out = out.add(bias.view(affine_param_shape))
# return out
if input.device.type == 'npu':
return execute('group_norm', input, num_groups, weight, bias, eps)[0]
input_shape = input.shape
N = input_shape[0]
C = input_shape[1]
input_reshaped = input.view(1, N * num_groups, -1 if N!=0 else 1)
outputs = batch_norm(input_reshaped, None, None, None, None, True, 0., eps)
out = outputs.view(input_shape)
affine_param_shape = [1] * input.ndim
affine_param_shape[1] = C
affine_param_shape = tuple(affine_param_shape)
if weight is not None and bias is not None:
out = core.addcmul(bias.view(affine_param_shape), out, weight.view(affine_param_shape), value=1)
elif weight is not None:
out = out.mul(weight.view(affine_param_shape))
elif bias is not None:
out = out.add(bias.view(affine_param_shape))
return out
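
The restored CPU branch uses a standard reshape trick: viewing the input as (1, N * num_groups, -1) turns each (sample, group) slice into one "channel", so a training-mode batch_norm with zero momentum yields exactly the GroupNorm statistics, and the affine weight/bias are applied afterwards per real channel. A numpy sketch of that equivalence under assumed shapes:

import numpy as np

N, C, L, G, eps = 2, 6, 5, 3, 1e-5
x = np.random.randn(N, C, L).astype(np.float32)

# Direct group norm: statistics over each (sample, group) slice.
xg = x.reshape(N, G, -1)
direct = ((xg - xg.mean(-1, keepdims=True)) / np.sqrt(xg.var(-1, keepdims=True) + eps)).reshape(N, C, L)

# Reshape trick: each (sample, group) slice becomes one channel of a batch of size 1,
# which is what per-channel batch_norm statistics normalize over.
xb = x.reshape(1, N * G, -1)
trick = ((xb - xb.mean(-1, keepdims=True)) / np.sqrt(xb.var(-1, keepdims=True) + eps)).reshape(N, C, L)

assert np.allclose(direct, trick, atol=1e-6)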
def _in_projection(
@@ -1576,18 +1561,16 @@ def _none_or_dtype(input: Optional[core.Tensor]) -> Optional[int]:
raise RuntimeError("input to _none_or_dtype() must be None or core.Tensor")
def unfold(input, kernel_size, dilation=1, padding=0, stride=1):
if ON_A1:
return execute('im2col', input, kernel_size, dilation, padding, stride)
return execute('im2col_ext', input, kernel_size, dilation, padding, stride)
return execute('im2col', input, _pair(kernel_size), _pair(dilation), _pair(padding), _pair(stride))
def fold(input, output_size, kernel_size, dilation=1, padding=0, stride=1):
return execute('col2im_ext', input, output_size, kernel_size, dilation, padding, stride)
return execute('col2im', input, output_size, kernel_size, dilation, padding, stride)
def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='mean', zero_infinity=False):
return execute('ctc_loss', log_probs, targets, input_lengths, target_lengths, blank, reduction, zero_infinity)
def one_hot(tensor, num_classes=-1):
return execute('one_hot_ext', tensor, num_classes)
return execute('one_hot', tensor, num_classes)
def pixel_shuffle(input, upscale_factor):
return execute('pixel_shuffle', input, upscale_factor)


+ 0
- 1
mindnlp/core/nn/init.py View File

@@ -565,7 +565,6 @@ def kaiming_uniform_(
with core.no_grad():
return tensor.uniform_(-bound, bound, generator=generator)
def kaiming_normal_(
tensor: Tensor,
a: float = 0,


+ 5
- 5
mindnlp/core/nn/modules/adaptive.py View File

@@ -5,10 +5,10 @@ from collections import namedtuple
from typing import List, Sequence
from mindnlp.core import Tensor
import mindnlp.core.nn.functional as F
from . import Sequential, ModuleList, Linear
from .module import Module
from ..functional import log_softmax
from ... import ops
__all__ = ['AdaptiveLogSoftmaxWithLoss']
@@ -223,7 +223,7 @@ class AdaptiveLogSoftmaxWithLoss(Module):
cluster_index = self.shortlist_size + i - 1
gather_inds = ops.index_fill(gather_inds, 0, row_indices, cluster_index)
cluster_logprob = log_softmax(cluster_output, dim=1)
cluster_logprob = F.log_softmax(cluster_output, dim=1)
local_logprob = cluster_logprob.gather(1, relative_target.unsqueeze(1))
output = ops.index_add(output, 0, row_indices, local_logprob.squeeze(1))
@@ -235,7 +235,7 @@ class AdaptiveLogSoftmaxWithLoss(Module):
"were found. ")
head_output = self.head(input)
head_logprob = log_softmax(head_output, dim=1)
head_logprob = F.log_softmax(head_output, dim=1)
output += ops.gather(head_logprob, 1, gather_inds.unsqueeze(1)).squeeze()
loss = (-output).mean()
@@ -247,13 +247,13 @@ class AdaptiveLogSoftmaxWithLoss(Module):
def _get_full_log_prob(self, input, head_output):
"""Given input tensor, and output of ``self.head``, compute the log of the full distribution."""
out = ops.zeros((head_output.shape[0], self.n_classes), dtype=input.dtype)
head_logprob = log_softmax(head_output, dim=1)
head_logprob = F.log_softmax(head_output, dim=1)
out[:, :self.shortlist_size] = head_logprob[:, :self.shortlist_size]
for i, (start_idx, stop_idx) in enumerate(zip(self.cutoffs, self.cutoffs[1:])):
cluster_output = self.tail[i](input)
cluster_logprob = log_softmax(cluster_output, dim=1)
cluster_logprob = F.log_softmax(cluster_output, dim=1)
output_logprob = cluster_logprob + head_logprob[:, self.shortlist_size + i].unsqueeze(1)
out[:, start_idx:stop_idx] = output_logprob


+ 5
- 4
mindnlp/core/nn/modules/rnn.py View File

@@ -247,7 +247,7 @@ class _DynamicLSTMCPUGPU(Module):
has_bias = False
else:
has_bias = True
if self.is_gpu:
if x.device.type == 'cuda':
weights = ops.concat((
w_ih.view(-1, 1, 1),
w_hh.view(-1, 1, 1),
@@ -261,12 +261,13 @@ class _DynamicLSTMCPUGPU(Module):
w_hh.view(-1, 1, 1),
bias.view(-1, 1, 1)
))
_lstm = _get_cache_prim(LSTMOP)(input_size, hidden_size, 1, has_bias, False, 0.0)
output, h_n, c_n, _, _ = _lstm(
output, h_n, c_n, _, _ = execute(
'lstm',
x,
h_0[0].unsqueeze(0),
h_0[1].unsqueeze(0),
weights.astype(x.dtype)
weights.astype(x.dtype),
input_size, hidden_size, 1, has_bias, False, 0.0, 0
)
return output, (h_n, c_n)


+ 1
- 1
mindnlp/core/nn/utils/parametrize.py View File

@@ -78,7 +78,7 @@ def _register_parameter_or_buffer(module, name, X):
def _maybe_set(dest: Tensor, src: Tensor) -> None:
dest.assign_value(src) # type: ignore[call-overload]
dest.copy_(src) # type: ignore[call-overload]
class ParametrizationList(ModuleList):


+ 3
- 1
mindnlp/core/npu/__init__.py View File

@@ -39,9 +39,11 @@ def manual_seed_all(seed: int):
manual_seed(seed)
def device_count():
if not is_available():
return 0
if GlobalComm.INITED:
return get_group_size()
return ms_device_count()
return 1
def current_device():
return core.device('npu', 0)


+ 123
- 100
mindnlp/core/ops/array.py View File

@@ -8,7 +8,6 @@ import mindspore
from mindnlp import core
from mindnlp.core.executor import execute
from .other import broadcast_tensors, broadcast_to
from ..configs import ON_ORANGE_PI
def t(input):
@@ -74,8 +73,6 @@ def chunk(input, chunks, dim=0):
# gather
def gather(input, dim, index):
if ON_ORANGE_PI:
return torch_gather(input, index, dim)
return execute("gather_d", input, dim, index)
def torch_gather(x, indices, axis=1):
@@ -131,7 +128,9 @@ def index_add(input, dim, index, source, *, alpha=1):
# index_select
def index_select(input, dim, index):
return execute("index_select", input, dim, index)
if input.device.type in ['npu', 'meta']:
return execute("index_select", input, dim, index)
return execute("gather", input, index, dim, 0)
# masked_select
def masked_select(input, mask):
@@ -167,15 +166,69 @@ def movedim(x, source, destination):
>>> print(output.shape)
(4, 3, 5)
"""
ndim = x.ndim
if len(source) != len(destination):
raise ValueError(
f"For `source` and `destination` arguments, the number of elements must be the same, but got 'source':"
f" {len(source)} and 'destination': {len(destination)}.")
perm = _get_moved_perm(ndim, source, destination)
return permute(x, perm)
return moveaxis(x, source, destination)
# moveaxis
def moveaxis(a, source, destination):
"""Raises ValueError if source, destination not in (-ndim(a), ndim(a))."""
if not source and not destination:
return a
if isinstance(source, int):
source = (source,)
if isinstance(destination, int):
destination = (destination,)
if len(source) != len(destination):
raise ValueError('The lengths of source and destination must equal')
a_rank = a.ndim
def _correct_axis(axis, rank):
if axis < 0:
return axis + rank
return axis
source = tuple(_correct_axis(axis, a_rank) for axis in source)
destination = tuple(_correct_axis(axis, a_rank) for axis in destination)
if a.ndim is not None:
perm = [i for i in range(a_rank) if i not in source]
for dest, src in sorted(zip(destination, source)):
assert dest <= len(perm)
perm.insert(dest, src)
else:
r = core.range(0, a_rank, 1)
def _remove_indices(a, b):
"""Remove indices (`b`) from `a`."""
items = core.unbind(
core.sort(core.stack(b))
)
i = 0
result = []
for item in items:
result.append(a[i:item])
i = item + 1
result.append(a[i:])
return core.concat(result, 0)
minus_sources = _remove_indices(r, source)
minus_dest = _remove_indices(r, destination)
perm = execute('scatter_nd',
core.unsqueeze(minus_dest, 1), minus_sources, [a_rank]
)
perm = execute('tensor_scatter_update',
perm, core.unsqueeze(destination, 1), source
)
a = core.permute(a, tuple(perm))
return a
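
For a concrete feel of the permutation built above: moving axis 0 of a rank-3 tensor to position 2 yields perm = [1, 2, 0], matching numpy's moveaxis (illustrative check only):

import numpy as np

a = np.zeros((4, 3, 5))
print(np.moveaxis(a, 0, 2).shape)        # (3, 5, 4)
print(np.transpose(a, (1, 2, 0)).shape)  # (3, 5, 4), i.e. perm = [1, 2, 0]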
def _get_moved_perm(ndim, source, destination):
"""
Helper function for movedim, returns permutation after moving axis
@@ -203,6 +256,7 @@ def _get_moved_perm(ndim, source, destination):
# narrow
def narrow(input, dim, start, length):
length = length.item() if not isinstance(length, int) else length
start = start.item() if not isinstance(start, int) else start
return execute("narrow", input, dim, start, length)
@@ -219,7 +273,7 @@ def nonzero(input, *, as_tuple=False):
# permute
def permute(input, dims):
assert isinstance(dims, tuple)
return execute("transpose_view", input, dims)
return execute("permute", input, dims)
# reshape
@@ -305,8 +359,6 @@ def squeeze(input, dim=None):
def stack(tensors, dim=0):
if tensors[0].device.type == "npu":
return execute("stack_ext", tensors, dim)
return execute("stack", tensors, dim)
@@ -373,6 +425,7 @@ def _take_along_dim_helper(self, indices, dim):
# take_along_dim
def take_along_dim(input, indices, dim=None, *, out=None):
input = input.clone() # input will be modified on CPU
if dim:
self_broadcasted, indices_broadcasted, dim = _take_along_dim_helper(input, indices, dim)
return gather(self_broadcasted, dim, indices_broadcasted)
@@ -400,17 +453,23 @@ def tensor_split(input, indices_or_sections, dim=0):
def tile(input, dims):
if isinstance(dims[0], (tuple, list)):
dims = dims[0]
return execute("tile", input, tuple(dims))
new_dims = ()
for d in dims:
if not isinstance(d, int):
d = d.item()
new_dims += (d,)
return execute("tile", input, tuple(new_dims))
# transpose
def transpose(input, dim0, dim1):
return execute("transpose_ext_view", input, dim0, dim1)
return execute("transpose_view", input, dim0, dim1)
# unbind
def unbind(input, dim=0):
return execute("unstack_ext_view", input, dim)
return execute("unstack_view", input, dim)
# unravel_index
@@ -418,7 +477,7 @@ def unbind(input, dim=0):
# unsqueeze
def unsqueeze(input, dim):
return execute("expand_dims_view", input, dim)
return execute("expand_dims", input, dim)
# vsplit
@@ -430,9 +489,6 @@ def unsqueeze(input, dim):
def where(condition, input=None, other=None):
if input is None and other is None:
return nonzero(condition, as_tuple=True)
if ON_ORANGE_PI:
out = condition * input + (~condition) * other
return out
return execute("select", condition, input, other)
@@ -469,7 +525,7 @@ def _do_slice(self, dim: int, index: slice, self_shape: list):
end = _get_index(index.stop, self_shape[dim])
if start == 0 and end == self_shape[dim] and step == 1:
return self
return execute('slice_ext', self, dim, start, end, step)
return execute('slice', self, dim, start, end, step)
def _wrap_index_to_tuple(index):
"""Wrap index to tuple"""
@@ -494,7 +550,7 @@ def _count_indexed_dims(indexes):
count += 1
return count
def _record_tensor_index(index, remain_indexes, dim):
def _record_tensor_index(index, remain_indexes, dim, device):
"""Record indexes remained to be used by aclnnIndex/aclnnIndexPut"""
if len(remain_indexes) > dim:
remain_indexes[dim] = index
@@ -502,7 +558,10 @@ def _record_tensor_index(index, remain_indexes, dim):
while dim > len(remain_indexes):
# use empty_tensor with dim_num 9 to indicate unused dim
remain_indexes.append(empty_tensor_9d)
if device.type == 'npu':
remain_indexes.append(empty_tensor_9d)
else:
remain_indexes.append(slice(None, None, None))
remain_indexes.append(index)
return remain_indexes
@@ -513,7 +572,7 @@ def _process_dim_in_multi_dim_index(prev_result, orig_tensor, index, dim, indexe
if isinstance(index, bool):
result = unsqueeze(prev_result, dim)
index_for_bool = tensor_1d if index else empty_tensor_1d
_record_tensor_index(index_for_bool, remain_indexes, dim)
_record_tensor_index(index_for_bool, remain_indexes, dim, prev_result.device)
prev_shape.insert(dim, 1)
dim += 1
return result, dim, remain_indexes, prev_shape
@@ -544,11 +603,11 @@ def _process_dim_in_multi_dim_index(prev_result, orig_tensor, index, dim, indexe
# process index with Tensor bool type
result = unsqueeze(prev_result, dim)
index_for_bool = tensor_1d if index else empty_tensor_1d
_record_tensor_index(index_for_bool, remain_indexes, dim)
_record_tensor_index(index_for_bool, remain_indexes, dim, prev_result.device)
prev_shape.insert(dim, 1)
dim += 1
return result, dim, remain_indexes, prev_shape
_record_tensor_index(index, remain_indexes, dim)
_record_tensor_index(index, remain_indexes, dim, prev_result.device)
dim += 1
return result, dim, remain_indexes, prev_shape
raise IndexError(f"Invalid tensor index type {index}")
@@ -597,7 +656,11 @@ def tensor_getitem(self, index):
self_viewed, remain_indexes = _process_multi_dim_index(self, indexes, remain_indexes, indexed_dims)
if not remain_indexes:
return self_viewed
return execute('index', self_viewed, remain_indexes)
if self.device.type == 'npu':
return execute('index', self_viewed, remain_indexes)
return getitem(self_viewed, tuple(remain_indexes) if len(remain_indexes) > 1 else remain_indexes[0])
def tensor_setitem(self, index, value):
@@ -634,7 +697,11 @@ def tensor_setitem(self, index, value):
if not remain_indexes:
execute('inplace_copy', self_viewed, value)
return self
execute('inplace_index_put', self_viewed, remain_indexes, value, False) # accumulate=False
if self.device.type == 'npu':
execute('inplace_index_put', self_viewed, remain_indexes, value, False) # accumulate=False
else:
setitem(self_viewed, tuple(remain_indexes) if len(remain_indexes) > 1 else remain_indexes[0], value)
return self
_SLICE_ERROR = (
@@ -642,18 +709,23 @@ _SLICE_ERROR = (
'newaxis (`None`) and integer or boolean arrays are valid indices'
)
def _as_index(idx, need_scalar=True):
def _as_index(idx, device, need_scalar=True):
"""Helper function to parse idx as an index.
"""
if isinstance(idx, numbers.Integral):
return idx, True
idx = core.tensor(idx)
if not isinstance(idx, core.Tensor):
idx = core.tensor(idx, dtype=core.int64, device=device)
if need_scalar and idx.ndim not in (None, 0):
raise IndexError(_SLICE_ERROR + ', got {!r}'.format(idx))
if idx.ndim == 0:
return idx.item(), True
if idx.device != device:
idx._device = device
return idx, False
def cumprod(x, axis=0, exclusive=False, reverse=False):
@@ -676,66 +748,6 @@ def cumprod(x, axis=0, exclusive=False, reverse=False):
return result
def moveaxis(a, source, destination):
"""Raises ValueError if source, destination not in (-ndim(a), ndim(a))."""
if not source and not destination:
return a
if isinstance(source, int):
source = (source,)
if isinstance(destination, int):
destination = (destination,)
if len(source) != len(destination):
raise ValueError('The lengths of source and destination must equal')
a_rank = a.ndim
def _correct_axis(axis, rank):
if axis < 0:
return axis + rank
return axis
source = tuple(_correct_axis(axis, a_rank) for axis in source)
destination = tuple(_correct_axis(axis, a_rank) for axis in destination)
if a.ndim is not None:
perm = [i for i in range(a_rank) if i not in source]
for dest, src in sorted(zip(destination, source)):
assert dest <= len(perm)
perm.insert(dest, src)
else:
r = core.range(0, a_rank, 1)
def _remove_indices(a, b):
"""Remove indices (`b`) from `a`."""
items = core.unbind(
core.sort(core.stack(b))
)
i = 0
result = []
for item in items:
result.append(a[i:item])
i = item + 1
result.append(a[i:])
return core.concat(result, 0)
minus_sources = _remove_indices(r, source)
minus_dest = _remove_indices(r, destination)
perm = execute('scatter_nd',
core.unsqueeze(minus_dest, 1), minus_sources, [a_rank]
)
perm = execute('tensor_scatter_update',
perm, core.unsqueeze(destination, 1), source
)
a = core.permute(a, tuple(perm))
return a
def _slice_helper(tensor, slice_spec, do_update=False, updates=None):
"""Helper function for __getitem__ and _with_index_update_helper.
"""
@@ -772,7 +784,7 @@ def _slice_helper(tensor, slice_spec, do_update=False, updates=None):
# strides.append(1)
new_axis_mask |= (1 << index)
else:
s, is_scalar = _as_index(s, False)
s, is_scalar = _as_index(s, tensor.device, False)
if is_scalar:
begin.append(s)
end.append(s + 1)
@@ -788,6 +800,8 @@ def _slice_helper(tensor, slice_spec, do_update=False, updates=None):
advanced_indices.append((index, s, ellipsis_mask != 0))
if do_update and not advanced_indices:
if 0 in updates.shape:
return tensor
return strided_slice_update(
tensor,
begin,
@@ -841,7 +855,7 @@ def _slice_helper(tensor, slice_spec, do_update=False, updates=None):
if not dims_contiguous or updates is not None:
if range(len(dims)) != dims:
tensor = moveaxis(tensor, dims, range(len(dims)))
tensor_shape_prefix = core.tensor(tensor.shape[: len(dims)])
tensor_shape_prefix = core.tensor(tensor.shape[: len(dims)], device=stacked_indices.device)
stacked_indices = where(
stacked_indices < 0,
stacked_indices + tensor_shape_prefix,
@@ -854,7 +868,7 @@ def _slice_helper(tensor, slice_spec, do_update=False, updates=None):
# only in this case the result dimensions of advanced indexing are in
# the middle of `updates`. In the non-contiguous case, those dimensions
# are always at the front.
if dims_contiguous:
if dims_contiguous and updates.ndim > 1:
batch_size = stacked_indices.ndim - 1
batch_start = dims[0]
if batch_start < 0:
@@ -866,6 +880,7 @@ def _slice_helper(tensor, slice_spec, do_update=False, updates=None):
updates = moveaxis(
updates, range_(batch_start, batch_size), range(batch_size)
)
updates = updates.broadcast_to(stacked_indices.shape[:-1] + tensor.shape[stacked_indices.shape[-1]:])
tensor = execute('tensor_scatter_update', tensor, stacked_indices, updates)
if range(len(dims)) != dims:
tensor = moveaxis(tensor, range(len(dims)), dims)
@@ -909,7 +924,7 @@ def _slice_helper(tensor, slice_spec, do_update=False, updates=None):
flat_shape = shape_tensor[:axis] + (-1,) + shape_tensor[axis + len(dims) :]
tensor = tensor.reshape(flat_shape)
return execute('gather', tensor, stacked_indices, axis)
return execute('gather', tensor, stacked_indices, axis, 0)
def _as_spec_tuple(slice_spec):
"""Convert slice_spec to tuple."""
@@ -930,8 +945,10 @@ def getitem(self, slice_spec):
isinstance(slice_spec, core.Tensor)
and slice_spec.dtype == core.bool
)
):
return masked_select(self, slice_spec)
):
if self.shape == slice_spec.shape:
return masked_select(self, slice_spec)
slice_spec = nonzero(slice_spec, as_tuple=True)
if not isinstance(slice_spec, tuple):
slice_spec = _as_spec_tuple(slice_spec)
@@ -948,7 +965,10 @@ def setitem(a, slice_spec, updates):
and slice_spec.dtype == core.bool
)
):
slice_spec = nonzero(slice_spec)
if slice_spec.shape == a.shape and (isinstance(updates, numbers.Number) or updates.ndim == 0):
a.masked_fill_(slice_spec, updates)
return a
slice_spec = nonzero(slice_spec, as_tuple=True)
if not isinstance(slice_spec, tuple):
slice_spec = _as_spec_tuple(slice_spec)
@@ -963,9 +983,12 @@ def strided_slice_update(input, begin, end, strides, update, begin_mask=0, end_m
sliced_tensor = execute('strided_slice', input, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask)
if update.shape != sliced_tensor.shape:
update = update.broadcast_to(sliced_tensor.shape)
update = update - sliced_tensor
update = update - sliced_tensor
updated_tensor = execute('strided_slice_grad', input, begin, end, strides, update, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask)
input.data = input + updated_tensor
out = input + updated_tensor
if input.dtype == core.bool:
out = out.astype(core.bool)
input.copy_(out)
return input
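
The update path works because strided_slice_grad scatters its argument into a zero tensor of the input's shape: at the sliced positions the sum is input + (update - sliced) = update, elsewhere it is input + 0, and the bool round-trip only keeps the dtype stable. A numpy sketch of the same idea for a 1-D slice (illustrative only):

import numpy as np

x = np.arange(6.0)             # [0. 1. 2. 3. 4. 5.]
upd = np.array([10.0, 20.0])   # new values intended for x[2:4]

delta = upd - x[2:4]           # update - sliced_tensor
scattered = np.zeros_like(x)   # conceptually what strided_slice_grad returns
scattered[2:4] = delta
print(x + scattered)           # [ 0.  1. 10. 20.  4.  5.]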
def getitem_np(input, slice):


+ 2
- 2
mindnlp/core/ops/blas.py View File

@@ -22,7 +22,7 @@ def baddbmm(input, batch1, batch2, *, beta=1, alpha=1):
# bmm
def bmm(input, other):
return execute('bmm_ext', input, other)
return execute('bmm', input, other)
# chain_matmul
@@ -61,7 +61,7 @@ def dot(input, other):
# matmul
def matmul(input, other):
return execute('matmul_ext', input, other)
return execute('matmul', input, other)
# matrix_power


+ 14
- 35
mindnlp/core/ops/comparison.py View File

@@ -19,16 +19,11 @@ def argsort(input, dim=-1, descending=False, stable=False):
def eq(input, other):
if not isinstance(other, numbers.Number) and other.device != input.device:
other = other.to(input.device)
return execute('equal', input, other)
return execute('eq', input, other)
# equal
def equal(input, other):
if input.device.type == 'npu':
return execute('equal_ext', input, other)
# if input.shape != other.shape:
# return False
out = eq(input, other)
return out.all()
return execute('equal', input, other)
# ge
def ge(input, other):
@@ -44,7 +39,10 @@ def greater(input, other):
# isclose
def isclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False):
if not isinstance(atol, numbers.Number):
atol = atol.item()
return execute('isclose', input, other, rtol, atol, equal_nan)
# isfinite
def isfinite(input):
@@ -53,7 +51,8 @@ def isfinite(input):
# isin
def in1d(ar1, ar2, invert=False):
ar1 = core.unsqueeze(ar1.ravel(), -1)
ar2 = ar2.ravel()
if not isinstance(ar2, numbers.Number):
ar2 = ar2.ravel()
included = core.eq(ar1, ar2)
# ops.reduce_sum only supports float
res = core.sum(included.to(core.float32), -1).to(core.bool_)
@@ -62,11 +61,8 @@ def in1d(ar1, ar2, invert=False):
return res
def isin(elements, test_elements, invert=False):
if elements.device.type != 'cpu':
res = in1d(elements, test_elements, invert=invert)
return core.reshape(res, elements.shape)
return execute('isin', elements, test_elements)
res = in1d(elements, test_elements, invert=invert)
return core.reshape(res, elements.shape)
# isinf
def isinf(input):
@@ -108,6 +104,8 @@ def maximum(input, other):
# minimum
def minimum(input, other):
if other.device != input.device:
other = other.to(input.device)
return execute('minimum', input, other)
# fmax
@@ -124,32 +122,13 @@ def not_equal(input, other):
# sort
def sort(input, *, dim=-1, descending=False, stable=False):
out = execute('sort_ext', input, dim, descending, stable)
out = execute('sort', input, dim, descending, stable)
return sort_out(values=out[0], indices=out[1])
# topk
def topk(input, k, dim=-1, largest=True, sorted=True):
if input.device.type == 'npu':
out = execute('topk_ext', input, k, dim, largest, sorted)
else:
if not largest:
input = -input
if dim is None or dim == input.ndim - 1:
if not largest:
res = execute('topk', input, k, sorted)
values, indices = -res[0], res[1]
return topk_out(values=values, indices=indices)
out = execute('topk', input, k, sorted)
return topk_out(values=out[0], indices=out[1])
input = input.swapaxes(dim, input.ndim - 1)
output = execute('topk', input, k, sorted)
values = output[0].swapaxes(dim, input.ndim - 1)
indices = output[1].swapaxes(dim, input.ndim - 1)
if not largest:
res = (-values, indices)
else:
res = (values, indices)
out = res
out = execute('topk', input, k, dim, largest, sorted)
return topk_out(values=out[0], indices=out[1])


+ 16
- 26
mindnlp/core/ops/creation.py View File

@@ -51,7 +51,7 @@ def zeros(*size, out=None, dtype=None, layout=None, device=None, requires_grad=F
s = s.item()
new_size += (s,)
output = execute('zeros', new_size, dtype, device=device, requires_grad=requires_grad, user_created=True)
output = execute('zeros', new_size, dtype, device=device)
if out is None:
return output
out.data = output
@@ -63,10 +63,7 @@ def zeros_like(input, *, dtype=None, layout=None, device=None, requires_grad=Fal
dtype = input.dtype
if device is None:
device = input.device
if device.type == 'cpu':
return execute('zeros_like', input, device=device, requires_grad=requires_grad, user_created=True)
return execute('zeros_like_ext', input, dtype,
device=device, requires_grad=requires_grad, user_created=True)
return execute('zeros_like', input, dtype, device=device)
# ones
def ones(*size, out=None, dtype=None, layout=None, device=None, requires_grad=False, **kwargs):
@@ -87,7 +84,7 @@ def ones(*size, out=None, dtype=None, layout=None, device=None, requires_grad=Fa
new_size += (s,)
output = execute('ones', new_size, dtype,
device=device, requires_grad=requires_grad, user_created=True)
device=device)
if out is None:
return output
out.data = output
@@ -101,10 +98,7 @@ def ones_like(input, *, dtype=None, layout=None, device=None, requires_grad=Fals
device = input.device
if isinstance(device, str):
device = core.device(device)
if device.type == 'cpu':
return execute('ones_like', input, device=device, requires_grad=requires_grad, user_created=True)
return execute('ones_like_ext', input, dtype,
device=device, requires_grad=requires_grad, user_created=True)
return execute('ones_like', input, dtype, device=device)
# arange
def arange(start=0, end=None, step=1, *, out=None, dtype=None, layout=None, device=None, requires_grad=False):
@@ -121,7 +115,7 @@ def arange(start=0, end=None, step=1, *, out=None, dtype=None, layout=None, devi
end = end.item() if isinstance(end, (core.Tensor, np.integer)) else end
step = step.item() if isinstance(step, (core.Tensor, np.integer)) else step
output = execute('arange', start, end, step, dtype, device=device, requires_grad=requires_grad, user_created=True)
output = execute('arange', start, end, step, dtype, device=device)
if out is None:
return output
out.data = output
@@ -136,7 +130,7 @@ def range(start=0, end=None, step=1, *, out=None, dtype=None, layout=None, devic
if device is None:
device = get_device_in_context()
output = execute('range', start, end + 1, step, 1000000,
device=device, requires_grad=requires_grad, user_created=True)
device=device)
if out is None:
return output
out.data = output
@@ -155,8 +149,7 @@ def linspace(start, end, steps, *, out=None, dtype=None, layout=None, device=Non
end = end.item() if isinstance(end, (core.Tensor, np.integer)) else end
steps = steps.item() if isinstance(steps, (core.Tensor, np.integer)) else steps
output = execute('lin_space_ext', start, end, steps, dtype,
device=device, requires_grad=requires_grad, user_created=True)
output = execute('linspace', start, end, steps, dtype, device=device)
if out is None:
return output
out.data = output
@@ -173,7 +166,7 @@ def eye(n, m=None, *, out=None, dtype=None, layout=None, device=None, requires_g
if m is None:
m = n
output = execute('eye', n, m, dtype,
device=device, requires_grad=requires_grad, user_created=True)
device=device)
if out is None:
return output
out.data = output
@@ -205,7 +198,7 @@ def empty(*size, out=None, dtype=None, layout=None, device=None,
def empty_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=None):
if device is None:
device = input.device
return empty(input.shape, dtype=input.dtype, layout=layout, device=device, requires_grad=requires_grad)
return empty(input.shape, dtype=input.dtype, layout=layout, device=device)
# empty_strided
@@ -216,16 +209,13 @@ def full(size, fill_value, *, out=None, dtype=None, layout=None, device=None, re
# dtype = get_default_dtype()
if device is None:
device = get_device_in_context()
if device.type == 'cpu':
output = execute('full', size, fill_value, device=device, requires_grad=requires_grad, user_created=True)
size = tuple([s if isinstance(s, int) else s.item() for s in size])
if isinstance(fill_value, numbers.Number):
output = execute('fill_scalar', size, fill_value, dtype,
device=device)
else:
size = [s if isinstance(s, int) else s.item() for s in size]
if isinstance(fill_value, numbers.Number):
output = execute('fill_scalar', size, fill_value, dtype,
device=device, requires_grad=requires_grad, user_created=True)
else:
output = execute('fill_tensor', size, fill_value, dtype,
device=device, requires_grad=requires_grad, user_created=True)
output = execute('fill_tensor', size, fill_value, dtype,
device=device)
if out is None:
return output
out.data = output
@@ -235,7 +225,7 @@ def full(size, fill_value, *, out=None, dtype=None, layout=None, device=None, re
def full_like(input, fill_value, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=None):
if dtype is None:
dtype = input.dtype
return full(input.shape, fill_value, dtype=dtype, layout=layout, device=input.device, requires_grad=requires_grad)
return full(input.shape, fill_value, dtype=dtype, layout=layout, device=input.device)
# quantize_per_tensor


+ 19
- 12
mindnlp/core/ops/inplace.py View File

@@ -12,12 +12,7 @@ def inplace_copy(self, other):
return self

def inplace_zero(input):
if input.device.type == 'npu':
execute('inplace_zero', input)
elif input.device.type == 'meta':
pass
else:
input.data = core.zeros_like(input)
execute('inplace_zero', input)
return input

def inplace_fill(input, value):
@@ -25,6 +20,7 @@ def inplace_fill(input, value):
execute('inplace_fill_scalar', input, value)
else:
execute('inplace_fill_tensor', input, value)

return input

def inplace_normal(input, mean=0, std=1, *, generator=None):
@@ -37,7 +33,6 @@ def inplace_normal(input, mean=0, std=1, *, generator=None):
std = std.item()

execute('inplace_normal', input, mean, std, generator, device=input.device)

return input

# uniform_
@@ -62,22 +57,32 @@ def inplace_uniform(input, *args, **kwargs):
generator_ = default_generator

execute("inplace_uniform", input, from_, to_, generator_)

return input

def inplace_add(input, other, alpha):
if isinstance(other, numbers.Number):
other = core.tensor(other, dtype=input.dtype, device=input.device)
execute('inplace_add_ext', input, other, alpha)
execute('inplace_add', input, other, alpha)
return input


def inplace_random(self, from_=0, to=None, *, generator=None):
if not generator:
generator = default_generator
seed, offset = generator._step( # pylint: disable=protected-access
generator_step_)
execute('inplace_random', self, from_, to, seed, offset, device=self.device)
execute('inplace_random', self, from_, to, generator, device=self.device)

return self

def inplace_exponential(self, lambd, generator):
if not generator:
generator = default_generator
execute('inplace_exponential', self, lambd, generator, device=self.device)
return self

def inplace_fill_diagonal(input, value, wrap):
execute("inplace_fill_diagonal", input, value, wrap)
return input

__all__ = [
'inplace_copy',
@@ -86,5 +91,7 @@ __all__ = [
'inplace_fill',
'inplace_uniform',
'inplace_add',
'inplace_random'
'inplace_random',
'inplace_exponential',
'inplace_fill_diagonal'
]

+ 169
- 77
mindnlp/core/ops/other.py View File

@@ -1,10 +1,9 @@
"""other op"""
import numpy as np
import mindspore
from mindspore.ops import gather
from mindnlp import core
from mindnlp.core.executor import execute
from ..configs import ON_A1
from ..configs import ON_A2
# atleast_2d
@@ -14,7 +13,7 @@ from ..configs import ON_A1
# bincount
def bincount(input, weights=None, minlength=0):
return execute('bincount_ext', input, weights, minlength)
return execute('bincount', input, weights, minlength)
# block_diag
@@ -28,7 +27,12 @@ def broadcast_tensors(*tensors):
# broadcast_to
def broadcast_to(input, shape):
return execute('broadcast_to', input, shape)
new_shape = ()
for s in shape:
if not isinstance(s, int):
s = s.item()
new_shape += (s,)
return execute('broadcast_to', input, new_shape)
# broadcast_shapes
@@ -74,9 +78,7 @@ def cdist(x1, x2, p=2.0, compute_mode="use_mm_for_euclid_dist_if_necessary"):
# clone
def clone(input, *, memory_format=core.preserve_format):
if input.device.type == 'npu':
return execute('clone', input)
return execute('identity', input)
return execute('clone', input)
# combinations
@@ -100,14 +102,14 @@ def clone(input, *, memory_format=core.preserve_format):
def cumsum(input, dim=None, dtype=None, **kwargs):
dim = kwargs.pop('axis', dim)
if input.dtype in [core.int64, core.bool]:
return execute('cumsum_ext', input.int(), dim, None).long()
return execute('cumsum', input.int(), dim, None).long()
if dtype is not None and dtype == core.int64:
return execute('cumsum_ext', input, dim, None).long()
return execute('cumsum_ext', input, dim, dtype)
return execute('cumsum', input, dim, None).long()
return execute('cumsum', input, dim, dtype)
# diag
def diag(input, diagonal=0, *, out=None):
return execute('diag_ext', input, diagonal)
return execute('diag', input, diagonal)
# diag_embed
@@ -548,7 +550,7 @@ def einsum(equation, *operands):
You can use this operator to perform diagonal, reducesum, transpose, matmul, mul, inner product operations, etc.
Note:
The sublist format is also supported. For example, einsum_ext(op1, sublist1, op2, sublist2, ..., sublist_out).
The sublist format is also supported. For example, einsum(op1, sublist1, op2, sublist2, ..., sublist_out).
In this format, equation can be derived by the sublists which are made up of Python's Ellipsis and list of
integers in [0, 52). Each operand is followed by a sublist and an output sublist is at the end.
Dynamic shape, dynamic rank input is not supported in `graph mode (mode=mindspore.GRAPH_MODE)
@@ -585,50 +587,50 @@ def einsum(equation, *operands):
>>> from mindspore import Tensor, ops
>>> x = Tensor(np.array([1.0, 2.0, 4.0]), mindspore.float32)
>>> equation = "i->"
>>> output = ops.einsum_ext(equation, x)
>>> output = ops.einsum(equation, x)
>>> print(output)
7.0
>>> x = Tensor(np.array([1.0, 2.0, 4.0]), mindspore.float32)
>>> y = Tensor(np.array([2.0, 4.0, 3.0]), mindspore.float32)
>>> equation = "i,i->i"
>>> output = ops.einsum_ext(equation, x, y)
>>> output = ops.einsum(equation, x, y)
>>> print(output)
[ 2. 8. 12.]
>>> x = Tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), mindspore.float32)
>>> y = Tensor(np.array([[2.0, 3.0], [1.0, 2.0], [4.0, 5.0]]), mindspore.float32)
>>> equation = "ij,jk->ik"
>>> output = ops.einsum_ext(equation, x, y)
>>> output = ops.einsum(equation, x, y)
>>> print(output)
[[16. 22.]
[37. 52.]]
>>> x = Tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), mindspore.float32)
>>> equation = "ij->ji"
>>> output = ops.einsum_ext(equation, x)
>>> output = ops.einsum(equation, x)
>>> print(output)
[[1. 4.]
[2. 5.]
[3. 6.]]
>>> x = Tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), mindspore.float32)
>>> equation = "ij->j"
>>> output = ops.einsum_ext(equation, x)
>>> output = ops.einsum(equation, x)
>>> print(output)
[5. 7. 9.]
>>> x = Tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), mindspore.float32)
>>> equation = "...->"
>>> output = ops.einsum_ext(equation, x)
>>> output = ops.einsum(equation, x)
>>> print(output)
21.0
>>> x = Tensor(np.array([1.0, 2.0, 3.0]), mindspore.float32)
>>> y = Tensor(np.array([2.0, 4.0, 1.0]), mindspore.float32)
>>> equation = "j,i->ji"
>>> output = ops.einsum_ext(equation, x, y)
>>> output = ops.einsum(equation, x, y)
>>> print(output)
[[ 2. 4. 1.]
[ 4. 8. 2.]
[ 6. 12. 3.]]
>>> x = mindspore.Tensor([1, 2, 3, 4], mindspore.float32)
>>> y = mindspore.Tensor([1, 2], mindspore.float32)
>>> output = ops.einsum_ext(x, [..., 1], y, [..., 2], [..., 1, 2])
>>> output = ops.einsum(x, [..., 1], y, [..., 2], [..., 1, 2])
>>> print(output)
[[1. 2.]
[2. 4.]
@@ -637,20 +639,13 @@ def einsum(equation, *operands):
"""
if isinstance(operands[0], (list, tuple)):
operands = operands[0]
if operands[0].device.type != 'npu':
return execute('einsum', equation, operands)
_equation, _operands = _einsum_convert_sublist(equation, *operands)
_einsum_check_inputargs(_equation, _operands)
return _einsum(_equation, _operands)
# flatten
def flatten(input, start_dim=0, end_dim=-1):
if input.device.type == 'cpu':
if end_dim < 0:
end_dim = input.ndim + end_dim
new_shape = input.shape[:start_dim] + (-1,) + input.shape[end_dim + 1:]
return input.reshape(new_shape)
return execute('flatten_ext', input, start_dim, end_dim)
return execute('flatten', input, start_dim, end_dim)
# flip
@@ -704,57 +699,139 @@ def ravel(input):
# repeat_interleave
def repeat_interleave(input, repeats, dim=None, *, output_size=None):
if input.device.type == 'npu' and ON_A1:
if isinstance(repeats, core.Tensor):
repeats = repeats.tolist()
if not isinstance(repeats, (tuple, list)):
repeats = (repeats,)
for index, element in enumerate(repeats):
if not isinstance(element, int):
raise TypeError(f"For 'Tensor.repeat', each element in {repeats} should be int, but got "
f"{type(element)} at index {index}.")
if dim is None:
input = input.ravel()
dim = 0
dim = dim + input.ndim if dim < 0 else dim
if sum(repeats) == 0:
out_shape = list(input.shape)
out_shape[dim] = 0
return core.Tensor(shape=tuple(out_shape), dtype=input.dtype)
if len(repeats) == 1:
repeats = repeats[0]
if input.dtype == mindspore.bool_:
input = input.to(mindspore.int32)
out = execute('repeat_elements', input, repeats, dim)
return out.to(mindspore.bool_)
return execute('repeat_elements', input, repeats, dim)
size = input.shape[dim]
if len(repeats) != size:
raise ValueError(f"For 'Tensor.repeat', the length of 'repeats' must be the same as the shape of the "
f"original tensor in the 'axis' dimension, but got the length of 'repeats' "
f"{len(repeats)}, the shape of the original tensor in the 'axis' dimension {size}.")
subs = core.split(input, 1, dim)
repeated_subs = []
for sub, rep in zip(subs, repeats):
if rep != 0:
repeated_subs.append(execute('repeat_elements', sub, rep, dim))
return core.concat(repeated_subs, dim)
def efficient_repeat_interleave(input_tensor, repeats, dim=None):
"""
Efficiently implements core.repeat_interleave; supports repeats given as an int, a list, or a Tensor.
Args:
input_tensor (Tensor): the input tensor.
repeats (int, list, or Tensor): the number of repetitions for each element.
dim (int, optional): the dimension along which to repeat. If None, the input tensor is flattened first.
Returns:
Tensor: the repeated tensor.
"""
if dim is None:
input_tensor = input_tensor.flatten()
dim = 0
# make sure dim is a valid dimension
if dim < 0:
dim += input_tensor.dim()
# normalize repeats to a LongTensor on the correct device
if isinstance(repeats, int):
return execute('repeat_interleave_int', input, repeats, dim, None)
return execute('repeat_interleave_tensor', input, repeats, dim, None)
repeats_tensor = core.tensor([repeats], device=input_tensor.device, dtype=core.long)
uniform_repeat = True
elif isinstance(repeats, (list, tuple)):
repeats_tensor = core.tensor(repeats, device=input_tensor.device, dtype=core.long)
uniform_repeat = False
elif isinstance(repeats, core.Tensor):
repeats_tensor = repeats.to(device=input_tensor.device, dtype=core.long)
uniform_repeat = False
else:
raise TypeError("repeats must be an int, a list, or a core.Tensor")
# size of the input tensor along the target dimension
dim_size = input_tensor.size(dim)
if uniform_repeat:
# Fast path: when every element repeats the same number of times,
# use expand + reshape (broadcasting) instead of a Python loop.
unsqueezed_tensor = input_tensor.unsqueeze(dim + 1)
expanded_shape = list(input_tensor.shape)
expanded_shape[dim] = -1
expanded_shape.insert(dim + 1, repeats_tensor.item())
expanded_tensor = unsqueezed_tensor.expand(*expanded_shape)
final_shape = list(input_tensor.shape)
final_shape[dim] *= repeats_tensor.item()
output = expanded_tensor.reshape(*final_shape)
else:
# Non-uniform repeats: build an explicit gather index.
# repeats must provide one count per element along the target dimension.
if len(repeats_tensor) != dim_size:
raise ValueError(f"repeats must have length {dim_size} along dimension {dim}, but got {len(repeats_tensor)}")
# Build the index, e.g. repeats_tensor = [2, 3, 1] -> index = [0, 0, 1, 1, 1, 2].
# The total repeat count is used to preallocate the index buffer.
total_repeats = repeats_tensor.sum().item()
index = core.zeros(total_repeats, dtype=core.long, device=input_tensor.device)
# starting position of each block (kept for reference):
# start_positions = core.cat([core.tensor([0], device=input_tensor.device), core.cumsum(repeats_tensor, dim=0)[:-1]])
# Fill the index with a loop; scatter or advanced indexing could be a lower-level optimization.
# Note: for very large non-uniform repeats this loop can become a bottleneck.
current_pos = 0
for i in range(dim_size):
repeat_count = repeats_tensor[i].item()
index[current_pos:current_pos + repeat_count] = i
current_pos += repeat_count
output = input_tensor.index_select(dim, index)
return output
def repeat_interleave(input, repeats, dim=None, *, output_size=None):
if input.device.type == 'npu' and ON_A2:
if isinstance(repeats, int):
return execute('repeat_interleave_int', input, repeats, dim, None)
return execute('repeat_interleave_tensor', input, repeats, dim, None)
return efficient_repeat_interleave(input, repeats, dim)
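A NumPy sketch of the non-uniform path in efficient_repeat_interleave above: build a gather index from the per-element repeat counts, then select along the target dimension (illustration only):

import numpy as np

x = np.array([[1, 2], [3, 4], [5, 6]])
repeats = [2, 0, 1]  # one count per row along dim 0
index = np.concatenate([np.full(r, i, dtype=np.int64) for i, r in enumerate(repeats)])
print(index)     # [0 0 2]
print(x[index])  # rows 0, 0 and 2 -> [[1 2] [1 2] [5 6]]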
# roll
def roll(input, shifts, dims=None):
return execute('roll', input, shifts, dims)
if input.device.type == 'npu':
return execute('roll', input, shifts, dims)
# dims is None: flatten first, roll the flat tensor, then restore the original shape
if dims is None:
original_shape = input.shape
flattened = input.flatten()
rolled_flattened = roll(flattened, shifts, dims=0)
return rolled_flattened.reshape(original_shape)
# normalize shifts and dims to tuples for uniform handling
if not isinstance(shifts, tuple):
shifts = (shifts,)
if not isinstance(dims, tuple):
dims = (dims,)
# shifts and dims must have matching lengths
if len(shifts) != len(dims):
raise ValueError("shifts and dims must have the same length")
result = input.clone()  # work on a copy so the original tensor is not modified
# process each (shift, dim) pair in turn
for shift, dim in zip(shifts, dims):
# validate the dimension
if dim >= result.dim():
raise ValueError("维度索引超出张量的维度范围")
# 获取该维度的长度
dim_size = result.size(dim)
# 处理负的 shift 值:正向移动 shift + dim_size 等同于反向移动 dim_size - shift
effective_shift = shift % dim_size
if effective_shift == 0:
continue  # shift of 0: nothing to do
# Slice along the dimension and re-concatenate:
# split the tensor into [0, dim_size - effective_shift) and [dim_size - effective_shift, dim_size),
# then swap the two parts.
slices_pre = [slice(None)] * result.dim()
slices_pre[dim] = slice(dim_size - effective_shift, None)
part1 = result[slices_pre]
slices_post = [slice(None)] * result.dim()
slices_post[dim] = slice(0, dim_size - effective_shift)
part2 = result[slices_post]
# concatenate the two parts along this dimension
result = core.concat((part1, part2), dim)
return result
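A NumPy check of the slice-and-concatenate roll used in the fallback above (the last effective_shift elements move to the front):

import numpy as np

x = np.arange(6)
shift = 2
n = x.shape[0]
effective = shift % n
rolled = np.concatenate((x[n - effective:], x[:n - effective]))
print(rolled)             # [4 5 0 1 2 3]
print(np.roll(x, shift))  # same result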
# searchsorted
def searchsorted(
@@ -774,7 +851,7 @@ def searchsorted(
# tril
def tril(input, diagonal=0):
return execute('tril_ext', input, diagonal)
return execute('tril', input, diagonal)
# tril_indices
@@ -921,6 +998,8 @@ def contains(self, key):
def stop_gradient(input):
return execute('stop_gradient', input)
def detach(input):
return stop_gradient(input)
def _get_unfold_indices(input_shape, dimension, size, step):
if dimension < 0:
@@ -935,7 +1014,7 @@ def _get_unfold_indices(input_shape, dimension, size, step):
def unfold(input, dimension, size, step):
_indices, _dimension = _get_unfold_indices(input.shape, dimension, size, step)
indices = core.tensor(_indices, device=input.device)
output = execute('gather', input, indices, _dimension)
output = execute('gather', input, indices, _dimension, 0)
output = core.moveaxis(output, _dimension + 1, -1)
return output
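For reference, the expected unfold result (windows of length size taken every step elements along dimension, with the window axis last, assumed to match torch.Tensor.unfold semantics) can be checked with a plain NumPy sketch:

import numpy as np

x = np.arange(7)
size, step = 3, 2
windows = np.stack([x[s:s + size] for s in range(0, x.shape[0] - size + 1, step)])
print(windows)
# [[0 1 2]
#  [2 3 4]
#  [4 5 6]]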
@@ -944,13 +1023,24 @@ def contiguous(input):
return execute('contiguous', input)
def dyn_shape(input):
return execute('dyn_shape', input)
return execute('tensor_shape', input)
def cross(input, other, dim=None, *, out=None):
if dim is None:
dim = -65530
return execute('cross', input, other, dim)
def cosine_similarity(x1, x2, dim=1, eps=1e-8):
dot_product = core.sum(x1 * x2, dim=dim)
# 2. L2 norms (||x1|| and ||x2||)
norm_vec1 = core.norm(x1, p=2, dim=dim)
norm_vec2 = core.norm(x2, p=2, dim=dim)
# 3. cosine similarity: (x1 · x2) / (||x1|| * ||x2|| + eps)
cosine_sim = dot_product / (norm_vec1 * norm_vec2 + eps)
return cosine_sim
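A quick NumPy check of the formula above on one pair of vectors (illustration only):

import numpy as np

x1, x2, eps = np.array([1.0, 2.0, 3.0]), np.array([2.0, 4.0, 6.0]), 1e-8
print(x1 @ x2 / (np.linalg.norm(x1) * np.linalg.norm(x2) + eps))  # ~1.0, since x2 is a positive multiple of x1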
__all__ = [
"bincount",
@@ -985,5 +1075,7 @@ __all__ = [
"diff",
'view_as_complex',
'view_as_real',
'bucketize'
'bucketize',
'cosine_similarity',
'detach'
]

+ 21
- 20
mindnlp/core/ops/pointwise.py View File

@@ -28,7 +28,7 @@ def arrcos(input):
# acosh
def acosh(input):
return execute("acosh_ext", input)
return execute("acosh", input)
# arccosh
@@ -38,19 +38,16 @@ def arccosh(input):
# add
def add(input, other, *, alpha=1):
if alpha != 1:
return execute("add_ext", input, other, alpha)
return execute('add', input, other)
return execute("add", input, other, alpha)
# addcdiv
def addcdiv(input, tensor1, tensor2, *, value=1):
return execute("addcdiv_ext", input, tensor1, tensor2, value)
return execute("addcdiv", input, tensor1, tensor2, value)
# addcmul
def addcmul(input, tensor1, tensor2, *, value=1):
return execute("addcmul_ext", input, tensor1, tensor2, value)
return execute("addcmul", input, tensor1, tensor2, value)
# angle
@@ -60,7 +57,7 @@ def angle(input):
# asin
def asin(input):
return execute("asin_ext", input)
return execute("asin", input)
# arcsin
@@ -70,7 +67,7 @@ def arcsin(input):
# asinh
def asinh(input):
return execute("asinh_ext", input)
return execute("asinh", input)
# arcsinh
@@ -80,7 +77,7 @@ def arcsinh(input):
# atan
def atan(input):
return execute("atan_ext", input)
return execute("atan", input)
# arctan
@@ -100,7 +97,7 @@ def arctanh(input):
# atan2
def atan2(input, other):
return execute("atan2_ext", input, other)
return execute("atan2", input, other)
# arctan2
@@ -155,6 +152,13 @@ def clamp(input, min=None, max=None):
return execute("clamp_scalar", input, min, max)
return execute("clamp_tensor", input, min, max)
def clamp_min(self, min):
return clamp(self, min, None)
def clamp_max(self, max):
return clamp(self, None, max)
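NumPy equivalents of the two helpers above, for illustration:

import numpy as np

x = np.array([-1.5, 0.2, 3.0])
print(np.maximum(x, 0.0))  # clamp_min(x, 0.0) -> [0.  0.2 3. ]
print(np.minimum(x, 1.0))  # clamp_max(x, 1.0) -> [-1.5  0.2  1. ]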
# clip
def clip(input, min=None, max=None):
@@ -457,8 +461,8 @@ def remainder(input, other):
# round
def round(input):
return execute("round", input)
def round(input, *, decimals=0):
return execute("round", input, decimals)
# rsqrt
@@ -521,10 +525,7 @@ def sub(input, other, *, alpha=1, out=None):
device = other.device
else:
device = input.device
if device == 'cpu':
output = execute("sub", input, alpha * other)
else:
output = execute("sub_ext", input, other, alpha)
output = execute("sub", input, other, alpha)
if out is None:
return output
out.copy_(output)
@@ -571,9 +572,7 @@ def relu(input):
def log_softmax(input, dim=None, dtype=None):
if input.device.type == 'cpu':
return execute('log_softmax', input, dim)
return execute('log_softmax_ext', input, dim, dtype)
return execute('log_softmax', input, dim, dtype)
__all__ = [
@@ -604,6 +603,8 @@ __all__ = [
"bitwise_right_shift",
"ceil",
"clamp",
"clamp_min",
"clamp_max",
"clip",
"cos",
"cosh",


+ 33
- 58
mindnlp/core/ops/random.py View File

@@ -12,7 +12,7 @@ generator_step_ = 12
def bernoulli(input, *, generator=None, out=None, **kwargs):
if generator is None:
generator = default_generator
output = execute("bernoulli_ext", input, generator)
output = execute("bernoulli", input, generator)
if out is None:
return output
out.data = output
@@ -22,10 +22,12 @@ def bernoulli(input, *, generator=None, out=None, **kwargs):
# multinomial
def multinomial(input, num_samples, replacement=False, *, generator=None, out=None):
"""custom multinomial"""
if not isinstance(num_samples, int):
num_samples = num_samples.item()
if generator is None:
generator = default_generator
if not ON_A1:
output = execute("multinomial_ext", input, num_samples, replacement, generator)
if input.device.type == 'npu':
output = execute("multinomial", input, num_samples, replacement, generator)
else:
if replacement:
@@ -60,7 +62,9 @@ def normal(mean=0.0, std=1.0, *, size=None, generator=None, out=None,
dtype=None, layout=None, device=None, pin_memory=None, requires_grad=False):
if generator is None:
generator = default_generator
seed, offset = generator._step(generator_step_) # pylint: disable=protected-access
if dtype is None:
dtype = get_default_dtype()
if device is None:
if out is None:
device = get_device_in_context()
@@ -70,28 +74,16 @@ def normal(mean=0.0, std=1.0, *, size=None, generator=None, out=None,
is_mean_tensor = isinstance(mean, core.Tensor)
is_std_tensor = isinstance(std, core.Tensor)
if device.type == 'cpu':
if is_mean_tensor and is_std_tensor:
size = (mean * std).shape
if is_mean_tensor and not is_std_tensor:
size = mean.shape
if not is_mean_tensor and is_std_tensor:
size = std.shape
if out is not None:
size = out.shape
output = execute('normal', size)
output = output * std - mean
else:
if is_mean_tensor and is_std_tensor:
output = execute("normal_tensor_tensor", mean, std, seed, offset, device=device)
if is_mean_tensor and not is_std_tensor:
output = execute("normal_tensor_float", mean, std, seed, offset, device=device)
if not is_mean_tensor and is_std_tensor:
output = execute("normal_float_tensor", mean, std, seed, offset, device=device)
if out is not None:
size = out.shape
output = execute("normal_float_float", float(mean), float(std), size, seed, offset, device=device)
if is_mean_tensor and is_std_tensor:
output = execute("normal_tensor_tensor", mean, std, size, dtype, generator, device=device)
if is_mean_tensor and not is_std_tensor:
output = execute("normal_tensor_float", mean, std, size, dtype, generator, device=device)
if not is_mean_tensor and is_std_tensor:
output = execute("normal_float_tensor", mean, std, size, dtype, generator, device=device)
if out is not None:
size = out.shape
output = execute("normal_float_float", float(mean), float(std), size, dtype, generator, device=device)
if out is None:
return output
@@ -120,18 +112,15 @@ def rand(
dtype = get_default_dtype()
if not generator:
generator = default_generator
seed, offset = generator._step(generator_step_) # pylint: disable=protected-access
if size and isinstance(size[0], (tuple, list)):
size = size[0]
output = execute(
"rand_ext",
"rand",
size,
seed,
offset,
generator,
dtype,
device=device,
requires_grad=requires_grad,
user_created=True,
)
if out is None:
return output
@@ -156,17 +145,13 @@ def rand_like(
if dtype is None:
dtype = input.dtype
seed, offset = default_generator._step( # pylint: disable=protected-access
generator_step_
)
return execute(
"rand_like_ext",
"rand_like",
input,
seed,
offset,
default_generator,
dtype,
device=device,
requires_grad=requires_grad,
)
@@ -197,10 +182,11 @@ def randint(
output = execute(
"randint",
low, high, size,
dtype,
generator,
dtype,
device=device,
)
if out is None:
return output
out.data = output
@@ -228,11 +214,12 @@ def randint_like(
if dtype is None:
dtype = input.dtype
seed, offset = default_generator._step( # pylint: disable=protected-access
generator_step_
)
return execute(
"randint_like_ext",
"randint_like",
input,
low,
high,
@@ -240,7 +227,6 @@ def randint_like(
offset,
dtype,
device=device,
requires_grad=requires_grad,
)
@@ -264,18 +250,15 @@ def randn(
dtype = get_default_dtype()
if not generator:
generator = default_generator
seed, offset = generator._step(generator_step_) # pylint: disable=protected-access
if size and isinstance(size[0], (tuple, list)):
size = size[0]
output = execute(
"randn",
size,
seed,
offset,
generator,
dtype,
device=device,
requires_grad=requires_grad,
user_created=True,
)
if out is None:
return output
@@ -300,17 +283,12 @@ def randn_like(
if dtype is None:
dtype = input.dtype
seed, offset = default_generator._step( # pylint: disable=protected-access
generator_step_
)
return execute(
"rand_like_ext",
"rand_like",
input,
seed,
offset,
default_generator,
dtype,
device=device,
requires_grad=requires_grad,
)
@@ -333,15 +311,12 @@ def randperm(
if not generator:
generator = default_generator
seed, offset = generator._step(generator_step_) # pylint: disable=protected-access
output = execute(
"randperm_ext",
"randperm",
n,
seed,
offset,
generator,
dtype,
device=device,
requires_grad=requires_grad,
)
if out is None:
return output


+ 19
- 17
mindnlp/core/ops/reduction.py View File

@@ -9,11 +9,11 @@ min_out = namedtuple('min_out', ['values', 'indices'])
# argmax
def argmax(input, dim=None, keepdim=False):
return execute('argmax_ext', input, dim, keepdim)
return execute('argmax', input, dim, keepdim)
# argmin
def argmin(input, dim=None, keepdim=False):
return execute('argmin_ext', input, dim, keepdim)
return execute('argmin', input, dim, keepdim)
# amax
def amax(input, dim, keepdim=False):
@@ -37,6 +37,8 @@ def all(input, dim=None, keepdim=False, *, dtype=None, **kwargs):
# any
def any(input, dim=None, keepdim=False):
if dim is None:
dim = ()
return execute('reduce_any', input, dim, keepdim)
# max
@@ -77,7 +79,7 @@ def logsumexp(input, dim, keepdim=False):
# mean
def mean(input, dim=None, keepdim=False, *, dtype=None, **kwargs):
dim = kwargs.pop('axis', dim)
return execute('mean_ext', input, dim, keepdim, dtype)
return execute('mean', input, dim, keepdim, dtype)
# nanmean
@@ -85,7 +87,7 @@ def mean(input, dim=None, keepdim=False, *, dtype=None, **kwargs):
# median
def median(input, dim=-1, keepdim=False):
if dim is None:
return execute('median_ext', input)
return execute('median', input)
return execute('median_dim', input, dim, keepdim)
# nanmedian
@@ -95,7 +97,7 @@ def median(input, dim=-1, keepdim=False):
# norm
def vector_norm_ext(input, p=2, dim=None, keepdim=False, *, dtype=None):
def vector_norm(input, p=2, dim=None, keepdim=False, *, dtype=None):
if float(p) in [0.0, 1.0, 2.0, 3.0]:
return execute('linalg_vector_norm', input, float(p), dim, keepdim, dtype)
if input.dtype in [core.bfloat16, core.float16, core.float32]:
@@ -107,12 +109,12 @@ def vector_norm_ext(input, p=2, dim=None, keepdim=False, *, dtype=None):
input = input.to(core.float32)
return execute('lp_norm_v2', input, p, dim, keepdim, 0.0).to(cast_dtype)
def matrix_norm_ext(A, ord='fro', dim=(-2, -1), keepdim=False, *, dtype=None):
def matrix_norm(A, ord='fro', dim=(-2, -1), keepdim=False, *, dtype=None):
ndim = A.ndim
row_axis, col_axis = _check_matrix_norm_axis(dim, ndim)
_check_matrix_norm_ord(ord)
if ord == 'fro':
return vector_norm_ext(A, 2, dim, keepdim, dtype=dtype)
return vector_norm(A, 2, dim, keepdim, dtype=dtype)
if ord == 'nuc':
res = _multi_svd_norm(A, row_axis, col_axis, 'sum')
return _reshape_matrix_norm(A, res, dim, keepdim)
@@ -127,24 +129,24 @@ def matrix_norm_ext(A, ord='fro', dim=(-2, -1), keepdim=False, *, dtype=None):
if not keepdim and col_axis > row_axis:
col_axis -= 1
if ord < 0:
return amin(vector_norm_ext(A, 1, row_axis, keepdim, dtype=dtype), col_axis, keepdim)
return amax(vector_norm_ext(A, 1, row_axis, keepdim, dtype=dtype), col_axis, keepdim)
return amin(vector_norm(A, 1, row_axis, keepdim, dtype=dtype), col_axis, keepdim)
return amax(vector_norm(A, 1, row_axis, keepdim, dtype=dtype), col_axis, keepdim)
def norm(input, p='fro', dim=None, keepdim=False, dtype=None):
if not isinstance(input, core.Tensor):
raise TypeError(f"For `norm_ext`, the `input` must be Tensor!, but get {type(input)}.")
raise TypeError(f"For `norm`, the `input` must be Tensor!, but get {type(input)}.")
if isinstance(p, (bool, int, float)):
return vector_norm_ext(input, p, dim, keepdim, dtype=dtype)
return vector_norm(input, p, dim, keepdim, dtype=dtype)
if p == 'fro':
if isinstance(dim, (list, tuple)) and len(dim) > 2:
raise ValueError(f"For `norm_ext`, the size of `dim` cannot be greater than 2 "
raise ValueError(f"For `norm`, the size of `dim` cannot be greater than 2 "
f"when the norm mode is `fro`.")
return execute('linalg_vector_norm', input, 2.0, dim, keepdim,
dtype if dtype is None else dtype)
if p == 'nuc':
dim = tuple(range(input.ndim)) if dim is None else dim
return matrix_norm_ext(input, p, dim, keepdim, dtype=dtype)
raise ValueError(f"For `norm_ext`, the value of `p` must be one of [int, float, inf, -inf, 'fro', 'nuc',] "
return matrix_norm(input, p, dim, keepdim, dtype=dtype)
raise ValueError(f"For `norm`, the value of `p` must be one of [int, float, inf, -inf, 'fro', 'nuc',] "
f"but got `{p}`.")
# nansum
@@ -153,7 +155,7 @@ def nansum(input, dim=None, keepdim=False, *, dtype=None):
# prod
def prod(input, dim=None, keepdim=False, *, dtype=None):
return execute('prod_ext', input, dim, keepdim, dtype)
return execute('prod', input, dim, keepdim, dtype)
# quantile
@@ -173,7 +175,7 @@ def sum(input, dim=None, keepdim=False, *, dtype=None, **kwargs):
dim = kwargs.pop('axis', dim)
if 0 in input.shape:
return core.tensor(0, dtype=dtype, device=input.device)
return execute('sum_ext', input, dim, keepdim, dtype)
return execute('sum', input, dim, keepdim, dtype)
# unique
def unique(input, sorted=True, return_inverse=False, return_counts=False, dim=None):
@@ -210,7 +212,7 @@ def var_mean(input, dim=None, *, correction=1, keepdim=False):
return execute('var_mean', input, dim, correction, keepdim)
# count_nonzero
def count_nonzero(input, dim=None):
def count_nonzero(input, dim=-1):
return execute('count_nonzero', input, dim)
__all__ = ['all', 'amax', 'amin', 'aminmax', 'any', 'argmax', 'argmin', 'count_nonzero',


+ 1
- 1
mindnlp/core/random.py View File

@@ -54,7 +54,7 @@ def manual_seed(seed):
is raised. Negative inputs are remapped to positive values with the formula
`0xffff_ffff_ffff_ffff + seed`.
"""
mindspore.set_seed(seed)
# mindspore.set_seed(seed + 1)
seed = int(seed)
# set_seed(seed)
return default_generator.manual_seed(seed)


+ 3
- 0
mindnlp/transformers/__init__.py View File

@@ -56,6 +56,9 @@ transformers.modeling_utils._get_resolved_checkpoint_files = _get_resolved_check
transformers.tokenization_utils_base.PreTrainedTokenizerBase.apply_chat_template = apply_chat_template_wrapper(
transformers.tokenization_utils_base.PreTrainedTokenizerBase.apply_chat_template
)
transformers.tokenization_utils_base.PreTrainedTokenizerBase.__call__ = apply_chat_template_wrapper(
transformers.tokenization_utils_base.PreTrainedTokenizerBase.__call__
)

transformers.pipelines.pipeline = dtype_wrapper(transformers.pipelines.pipeline)
transformers.modeling_utils.caching_allocator_warmup = empty_fn


+ 0
- 1
mindnlp/transformers/masking_utils.py View File

@@ -419,7 +419,6 @@ def sdpa_mask_older_torch(

if padding_mask is not None:
causal_mask = causal_mask * padding_mask[:, None, None, :]

# # Due to a bug in versions of torch<2.5, we need to update the mask in case a query is not attending to any
# # tokens (due to padding). See details in https://github.com/pytorch/pytorch/issues/110213
# if not _is_torch_greater_or_equal_than_2_5 and allow_torch_fix:


+ 4
- 2
mindnlp/utils/safetensors_patch.py View File

@@ -84,13 +84,15 @@ class PySafeSlice:
def ndim(self):
return len(self.shape)

def get(self, *args, **kwargs):
def get(self, slice=None):
nbytes = int(np.prod(self.shape)) * np.dtype(self.dtype).itemsize
buffer = bytearray(nbytes)
self.bufferfile.seek(self.start_offset)
self.bufferfile.readinto(buffer)
array = np.frombuffer(buffer, dtype=self.dtype).reshape(self.shape)
array = array.reshape(self.shape)
if slice is not None:
array = array[slice]
if not SUPPORT_BF16 and self.info["dtype"] == 'BF16':
array = array.astype(np.float16)
tensor = core.from_numpy(array)
@@ -128,7 +130,7 @@ class PySafeSlice:
return self.nelements * self.bits

def __getitem__(self, slice):
return self.get()[slice]
return self.get(slice)

def getSize(fileobject):
fileobject.seek(0, 2) # move the cursor to the end of the file


+ 0
- 0
tools/__init__.py View File


+ 92
- 0
tools/op_auto_gen.py View File

@@ -0,0 +1,92 @@
import re
import inspect
import importlib
import argparse

import mindspore
from mindspore import ops
from mindspore.ops.auto_generate import gen_ops_prim
from mindspore.ops.auto_generate import pyboost_inner_prim

def camel_to_snake_case_improved(camel_case_str):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', camel_case_str)
snake_case_str = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
return snake_case_str
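Example conversions produced by the helper above (not part of the file):

print(camel_to_snake_case_improved('ReduceSum'))    # reduce_sum
print(camel_to_snake_case_improved('BatchMatMul'))  # batch_mat_mul
print(camel_to_snake_case_improved('ReLU6'))        # re_lu6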

op_func_no_init = '''
{name}_op = {op}().set_device('{device}')
def {name}(*args):
return {name}_op(*args)
'''

op_func_with_init = '''
def {name}(*args):
op = _get_cache_prim({op})(*args[-{idx}:]).set_device('{device}')
return op(*args[:-{idx}])
'''
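For illustration, with hypothetical values name='ones_like', op='OnesLike', device='CPU', the first template renders to a module-level primitive plus a thin wrapper:

ones_like_op = OnesLike().set_device('CPU')
def ones_like(*args):
    return ones_like_op(*args)

The second template is used when the primitive's constructor takes arguments: the trailing idx positional args are routed to the (cached) constructor via _get_cache_prim and the remaining args are passed to the call.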

def gen_legacy_op(gen_file, device='CPU'):
op_list = list(filter(lambda s: s[0].isupper(), ops.operations.__all__))
grad_op = list(filter(lambda s: s[0].isupper(), dir(mindspore.ops.operations._grad_ops)))

op_dict = {
'mindspore.ops.operations._grad_ops': grad_op,
'mindspore.ops.operations': op_list
}

with open(gen_file, 'w') as f:
f.write("from mindspore.ops.operations import *\n"
"from mindspore.ops.operations._grad_ops import *\n"
"from mindspore.ops._primitive_cache import _get_cache_prim\n\n")
for op_module, op_list in op_dict.items():
for old_op_name in op_list:
if old_op_name in ['P', 'Print', 'Assert', 'Custom', 'CustomOpBuilder', 'DataType', 'ReduceOp', 'TBERegOp', 'Tensor']:
continue

op_mod = importlib.import_module(op_module)
ops_class = getattr(op_mod, old_op_name, None)
init_signature = inspect.signature(ops_class.__init__)
name = camel_to_snake_case_improved(old_op_name)
if len(init_signature.parameters) > 1:
init_args = list(init_signature.parameters.keys())
init_args.pop(0)
code = op_func_with_init.format(name=name, op=old_op_name, idx=len(init_args), device=device)

else:
code = op_func_no_init.format(name=name, op=old_op_name, device=device)
f.write(code + '\n')
f.close()

def gen_aclnn_op(gen_file, device):
gen_ops_list = list(filter(lambda s: s.startswith("pyboost"), dir(gen_ops_prim)))
pyboost_inner_list = list(filter(lambda s: s.endswith("_impl"), dir(pyboost_inner_prim)))

with open(gen_file, 'w') as f:
f.write("from mindspore.ops.auto_generate.gen_ops_prim import *\n"
"from mindspore.ops.auto_generate.pyboost_inner_prim import *\n\n")

for pyboost_op_name in gen_ops_list:
op_name = pyboost_op_name.replace('pyboost_', '') + '_op'
op_instance = getattr(gen_ops_prim, op_name, None)
if op_instance is not None:
f.write(f"{op_name} = {getattr(gen_ops_prim, op_name).__class__.__name__}().set_device('Ascend')\n\n")

# for op_name in pyboost_inner_list:
# f.write(f"{op_name} = {getattr(pyboost_inner_prim, op_name).__class__.__name__}()\n\n")
f.close()

if __name__ == '__main__':
parser = argparse.ArgumentParser()
# positional argument
parser.add_argument('output_file', type=str)
parser.add_argument('--device', type=str, choices=['CPU', 'GPU', 'Ascend'])
parser.add_argument('--op_type', type=str, default='legacy', required=False, choices=['legacy', 'pyboost'])


args = parser.parse_args()
print(args)
if args.op_type == 'legacy':
gen_legacy_op(args.output_file, args.device)
elif args.op_type == 'pyboost':
gen_aclnn_op(args.output_file, args.device)
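Assuming the script is run from the repository root, a typical invocation might look like `python tools/op_auto_gen.py legacy_ops.py --device CPU --op_type legacy` (output filename hypothetical); it writes one thin wrapper function per primitive into the given file.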
