2 Commits

Author          SHA1          Message          Date
Cyril Vallez    b19ed98405    until lfm2_moe   4 hours ago
Cyril Vallez    3ac9de9c2a    all until gpt2   18 hours ago
56 changed files with 338 additions and 28 deletions
  1. +3  -0  src/transformers/models/convbert/modeling_convbert.py
  2. +2  -1  src/transformers/models/csm/modeling_csm.py
  3. +2  -0  src/transformers/models/csm/modular_csm.py
  4. +9  -0  src/transformers/models/d_fine/modeling_d_fine.py
  5. +13 -0  src/transformers/models/d_fine/modular_d_fine.py
  6. +5  -0  src/transformers/models/dab_detr/modeling_dab_detr.py
  7. +6  -0  src/transformers/models/data2vec/modeling_data2vec_text.py
  8. +2  -0  src/transformers/models/deberta/modeling_deberta.py
  9. +2  -0  src/transformers/models/deberta_v2/modeling_deberta_v2.py
  10. +3  -1  src/transformers/models/decision_transformer/modeling_decision_transformer.py
  11. +2  -0  src/transformers/models/deepseek_v3/modular_deepseek_v3.py
  12. +5  -0  src/transformers/models/deformable_detr/modeling_deformable_detr.py
  13. +5  -0  src/transformers/models/detr/modeling_detr.py
  14. +6  -0  src/transformers/models/dia/modeling_dia.py
  15. +6  -0  src/transformers/models/dia/modular_dia.py
  16. +11 -9  src/transformers/models/distilbert/modeling_distilbert.py
  17. +12 -0  src/transformers/models/donut/modeling_donut_swin.py
  18. +3  -0  src/transformers/models/edgetam/modular_edgetam.py
  19. +18 -1  src/transformers/models/edgetam_video/modular_edgetam_video.py
  20. +5  -0  src/transformers/models/electra/modeling_electra.py
  21. +14 -0  src/transformers/models/encodec/modeling_encodec.py
  22. +7  -0  src/transformers/models/eomt/modular_eomt.py
  23. +3  -0  src/transformers/models/ernie/modular_ernie.py
  24. +2  -0  src/transformers/models/esm/modeling_esm.py
  25. +5  -0  src/transformers/models/esm/modeling_esmfold.py
  26. +5  -0  src/transformers/models/evolla/modular_evolla.py
  27. +6  -1  src/transformers/models/falcon_mamba/modular_falcon_mamba.py
  28. +11 -9  src/transformers/models/flaubert/modeling_flaubert.py
  29. +3  -0  src/transformers/models/flava/modeling_flava.py
  30. +12 -0  src/transformers/models/florence2/modular_florence2.py
  31. +6  -0  src/transformers/models/fnet/modeling_fnet.py
  32. +3  -0  src/transformers/models/gemma3/modular_gemma3.py
  33. +29 -0  src/transformers/models/gemma3n/modular_gemma3n.py
  34. +3  -0  src/transformers/models/git/modeling_git.py
  35. +0  -1  src/transformers/models/glm4v/modular_glm4v.py
  36. +7  -0  src/transformers/models/gpt2/modeling_gpt2.py
  37. +3  -0  src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
  38. +12 -1  src/transformers/models/gpt_neo/modeling_gpt_neo.py
  39. +6  -0  src/transformers/models/granite_speech/modeling_granite_speech.py
  40. +5  -0  src/transformers/models/grounding_dino/modeling_grounding_dino.py
  41. +1  -0  src/transformers/models/groupvit/modeling_groupvit.py
  42. +2  -0  src/transformers/models/ibert/modeling_ibert.py
  43. +9  -0  src/transformers/models/idefics/modeling_idefics.py
  44. +4  -2  src/transformers/models/imagegpt/modeling_imagegpt.py
  45. +2  -0  src/transformers/models/instructblip/modeling_instructblip.py
  46. +5  -0  src/transformers/models/janus/modular_janus.py
  47. +5  -1  src/transformers/models/kosmos2/modeling_kosmos2.py
  48. +3  -0  src/transformers/models/kosmos2_5/modeling_kosmos2_5.py
  49. +8  -1  src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py
  50. +2  -0  src/transformers/models/layoutlm/modeling_layoutlm.py
  51. +6  -0  src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
  52. +2  -0  src/transformers/models/layoutlmv3/modeling_layoutlmv3.py
  53. +6  -0  src/transformers/models/led/modeling_led.py
  54. +12 -0  src/transformers/models/levit/modeling_levit.py
  55. +3  -0  src/transformers/models/lfm2_moe/modular_lfm2_moe.py
  56. +6  -0  src/transformers/models/speecht5/modeling_speecht5.py

+ 3
- 0
src/transformers/models/convbert/modeling_convbert.py View File

@@ -118,6 +118,9 @@ class ConvBertPreTrainedModel(PreTrainedModel):
elif isinstance(module, GroupedLinearLayer):
init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
init.zeros_(module.bias)
elif isinstance(module, ConvBertEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
init.zeros_(module.token_type_ids)


class SeparableConv1D(nn.Module):


+ 2
- 1
src/transformers/models/csm/modeling_csm.py View File

@@ -421,7 +421,8 @@ class CsmPreTrainedModel(PreTrainedModel):
num_codebooks = module.num_codebooks
for i in range(num_codebooks - 1):
init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)

elif isinstance(module, CsmBackboneModelEmbeddings):
init.copy_(module.audio_tokens_offsets, torch.arange(self.config.num_codebooks) * self.config.vocab_size)

@auto_docstring
class CsmDepthDecoderModel(CsmPreTrainedModel):


+ 2
- 0
src/transformers/models/csm/modular_csm.py View File

@@ -149,6 +149,8 @@ class CsmPreTrainedModel(PreTrainedModel):
num_codebooks = module.num_codebooks
for i in range(num_codebooks - 1):
init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
elif isinstance(module, CsmBackboneModelEmbeddings):
init.copy_(module.audio_tokens_offsets, torch.arange(self.config.num_codebooks) * self.config.vocab_size)


@auto_docstring


+ 9
- 0
src/transformers/models/d_fine/modeling_d_fine.py View File

@@ -483,6 +483,9 @@ class DFinePreTrainedModel(PreTrainedModel):
init.constant_(module.attention_weights.weight, 0.0)
init.constant_(module.attention_weights.bias, 0.0)

num_points_scale = [1 / n for n in module.num_points_list for _ in range(n)]
init.copy_(module.num_points_scale, torch.tensor(num_points_scale, dtype=torch.float32))

if isinstance(module, DFineModel):
prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
bias = float(-math.log((1 - prior_prob) / prior_prob))
@@ -507,6 +510,12 @@ class DFinePreTrainedModel(PreTrainedModel):
init.ones_(module.weight)
init.zeros_(module.bias)

if isinstance(module, DFineFrozenBatchNorm2d):
init.ones_(module.weight)
init.zeros_(module.bias)
init.zeros_(module.running_mean)
init.ones_(module.running_var)

if hasattr(module, "weight_embedding") and self.config.learn_initial_query:
init.xavier_uniform_(module.weight_embedding.weight)
if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0:


+ 13
- 0
src/transformers/models/d_fine/modular_d_fine.py View File

@@ -39,6 +39,7 @@ from ..rt_detr.modeling_rt_detr import (
RTDetrPreTrainedModel,
RTDetrRepVggBlock,
inverse_sigmoid,
RTDetrFrozenBatchNorm2d,
)
from ..rt_detr_v2.modeling_rt_detr_v2 import multi_scale_deformable_attention_v2

@@ -628,6 +629,9 @@ class DFinePreTrainedModel(RTDetrPreTrainedModel):
init.constant_(module.attention_weights.weight, 0.0)
init.constant_(module.attention_weights.bias, 0.0)

num_points_scale = [1 / n for n in module.num_points_list for _ in range(n)]
init.copy_(module.num_points_scale, torch.tensor(num_points_scale, dtype=torch.float32))

if isinstance(module, DFineModel):
prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
bias = float(-math.log((1 - prior_prob) / prior_prob))
@@ -652,6 +656,12 @@ class DFinePreTrainedModel(RTDetrPreTrainedModel):
init.ones_(module.weight)
init.zeros_(module.bias)

if isinstance(module, DFineFrozenBatchNorm2d):
init.ones_(module.weight)
init.zeros_(module.bias)
init.zeros_(module.running_mean)
init.ones_(module.running_var)

if hasattr(module, "weight_embedding") and self.config.learn_initial_query:
init.xavier_uniform_(module.weight_embedding.weight)
if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0:
@@ -851,6 +861,9 @@ class DFineDecoder(RTDetrDecoder):
)


class DFineFrozenBatchNorm2d(RTDetrFrozenBatchNorm2d):
pass

class DFineModel(RTDetrModel):
def __init__(self, config: DFineConfig):
super().__init__(config)


+ 5
- 0
src/transformers/models/dab_detr/modeling_dab_detr.py View File

@@ -848,6 +848,11 @@ class DabDetrPreTrainedModel(PreTrainedModel):
init.constant_(module.class_embed.bias, bias_value)
elif isinstance(module, nn.PReLU):
module.reset_parameters()
elif isinstance(module, DabDetrFrozenBatchNorm2d):
init.ones_(module.weight)
init.zeros_(module.bias)
init.zeros_(module.running_mean)
init.ones_(module.running_var)


# Modified from transformers.models.detr.modeling_detr.DetrEncoder with Detr->DabDetr,DETR->ConditionalDETR


+ 6
- 0
src/transformers/models/data2vec/modeling_data2vec_text.py View File

@@ -494,6 +494,12 @@ class Data2VecTextPreTrainedModel(PreTrainedModel):
"cross_attentions": Data2VecTextCrossAttention,
}

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, Data2VecTextEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
init.zeros_(module.token_type_ids)


class Data2VecTextEncoder(nn.Module):
def __init__(self, config):


+ 2
- 0
src/transformers/models/deberta/modeling_deberta.py View File

@@ -624,6 +624,8 @@ class DebertaPreTrainedModel(PreTrainedModel):
init.zeros_(module.v_bias)
elif isinstance(module, (LegacyDebertaLMPredictionHead, DebertaLMPredictionHead)):
init.zeros_(module.bias)
elif isinstance(module, DebertaEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


@auto_docstring


+ 2
- 0
src/transformers/models/deberta_v2/modeling_deberta_v2.py View File

@@ -700,6 +700,8 @@ class DebertaV2PreTrainedModel(PreTrainedModel):
super()._init_weights(module)
if isinstance(module, (LegacyDebertaV2LMPredictionHead, DebertaV2LMPredictionHead)):
init.zeros_(module.bias)
elif isinstance(module, DebertaV2Embeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


@auto_docstring


+ 3
- 1
src/transformers/models/decision_transformer/modeling_decision_transformer.py View File

@@ -94,7 +94,6 @@ class DecisionTransformerGPT2Attention(nn.Module):
),
persistent=False,
)
self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
@@ -385,6 +384,9 @@ class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel):
if "c_proj" in name and "weight" in name:
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
init.normal_(p, mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
elif isinstance(module, DecisionTransformerGPT2Attention):
max_positions = module.config.max_position_embeddings
init.copy_(module.bias, torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(1, 1, max_positions, max_positions))


class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):


+ 2
- 0
src/transformers/models/deepseek_v3/modular_deepseek_v3.py View File

@@ -315,6 +315,8 @@ class DeepseekV3PreTrainedModel(LlamaPreTrainedModel):
elif isinstance(module, DeepseekV3NaiveMoe):
init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
elif isinstance(module, DeepseekV3TopkRouter):
init.zeros_(module.e_score_correction_bias)


class DeepseekV3Model(LlamaModel):


+ 5
- 0
src/transformers/models/deformable_detr/modeling_deformable_detr.py View File

@@ -965,6 +965,11 @@ class DeformableDetrPreTrainedModel(PreTrainedModel):
# Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
init.zeros_(module.weight[module.padding_idx])
elif isinstance(module, DeformableDetrFrozenBatchNorm2d):
init.ones_(module.weight)
init.zeros_(module.bias)
init.zeros_(module.running_mean)
init.ones_(module.running_var)
if hasattr(module, "reference_points") and not self.config.two_stage:
init.xavier_uniform_(module.reference_points.weight, gain=1.0)
init.constant_(module.reference_points.bias, 0.0)
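
The comment about the flag comes down to the fact that indexing a tensor yields a new Python object, so a marker attribute set on the slice never reaches the parameter itself; hence the explicit padding_idx check. A tiny illustration of that behaviour (the _is_hf_initialized name is simply taken from the comment above):

import torch

weight = torch.randn(10, 4)
row = weight[3]                                        # slicing returns a fresh tensor object (a view)
row._is_hf_initialized = True                          # the flag lands on the view only
print(getattr(weight, "_is_hf_initialized", False))    # False: the full parameter was never marked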


+ 5
- 0
src/transformers/models/detr/modeling_detr.py View File

@@ -750,6 +750,11 @@ class DetrPreTrainedModel(PreTrainedModel):
# Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
init.zeros_(module.weight[module.padding_idx])
elif isinstance(module, DetrFrozenBatchNorm2d):
init.ones_(module.weight)
init.zeros_(module.bias)
init.zeros_(module.running_mean)
init.ones_(module.running_var)


class DetrEncoder(DetrPreTrainedModel):


+ 6
- 0
src/transformers/models/dia/modeling_dia.py View File

@@ -61,6 +61,12 @@ class DiaPreTrainedModel(PreTrainedModel):
main_input_name = "input_ids"
_no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, DiaMultiChannelEmbedding):
offsets = torch.arange(self.config.num_channels, dtype=torch.long) * self.config.vocab_size
init.copy_(module.offsets, offsets)


class DiaMultiChannelEmbedding(nn.Module):
"""In order to efficiently compute the audio embedding from the 9 different channels,


+ 6
- 0
src/transformers/models/dia/modular_dia.py View File

@@ -59,6 +59,12 @@ class DiaPreTrainedModel(PreTrainedModel):
main_input_name = "input_ids"
_no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, DiaMultiChannelEmbedding):
offsets = torch.arange(self.config.num_channels, dtype=torch.long) * self.config.vocab_size
init.copy_(module.offsets, offsets)


class DiaMultiChannelEmbedding(nn.Module):
"""In order to efficiently compute the audio embedding from the 9 different channels,


+ 11
- 9
src/transformers/models/distilbert/modeling_distilbert.py View File

@@ -305,15 +305,17 @@ class DistilBertPreTrainedModel(PreTrainedModel):
def _init_weights(self, module: nn.Module):
"""Initialize the weights."""
super()._init_weights(module)
if isinstance(module, Embeddings) and self.config.sinusoidal_pos_embds:
init.copy_(
module.position_embeddings.weight,
create_sinusoidal_embeddings(
self.config.max_position_embeddings,
self.config.dim,
torch.empty_like(module.position_embeddings.weight),
),
)
if isinstance(module, Embeddings):
if self.config.sinusoidal_pos_embds:
init.copy_(
module.position_embeddings.weight,
create_sinusoidal_embeddings(
self.config.max_position_embeddings,
self.config.dim,
torch.empty_like(module.position_embeddings.weight),
),
)
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


@auto_docstring


+ 12
- 0
src/transformers/models/donut/modeling_donut_swin.py View File

@@ -802,6 +802,18 @@ class DonutSwinPreTrainedModel(PreTrainedModel):
elif isinstance(module, DonutSwinSelfAttention):
init.zeros_(module.relative_position_bias_table)

coords_h = torch.arange(module.window_size[0])
coords_w = torch.arange(module.window_size[1])
coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
coords_flatten = torch.flatten(coords, 1)
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
relative_coords = relative_coords.permute(1, 2, 0).contiguous()
relative_coords[:, :, 0] += module.window_size[0] - 1
relative_coords[:, :, 1] += module.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * module.window_size[1] - 1
relative_position_index = relative_coords.sum(-1)
init.copy_(module.relative_position_index, relative_position_index)


@auto_docstring
class DonutSwinModel(DonutSwinPreTrainedModel):


+ 3
- 0
src/transformers/models/edgetam/modular_edgetam.py View File

@@ -181,6 +181,9 @@ class EdgeTamPreTrainedModel(Sam2PreTrainedModel):
if isinstance(module, EdgeTamModel):
if module.no_memory_embedding is not None:
init.zeros_(module.no_memory_embedding)
elif hasattr(module, "positional_embedding"):
positional_embedding = module.scale * torch.randn((2, self.config.hidden_size // 2))
init.copy_(module.positional_embedding, positional_embedding)


@auto_docstring(


+ 18
- 1
src/transformers/models/edgetam_video/modular_edgetam_video.py View File

@@ -378,10 +378,12 @@ class EdgeTamVideoVisionRotaryEmbedding(Sam2VideoVisionRotaryEmbedding):
dim = config.memory_attention_hidden_size // (
config.memory_attention_downsample_rate * config.memory_attention_num_attention_heads
)
self.dim = dim
# Ensure even dimension for proper axial splitting
if dim % 4 != 0:
raise ValueError("Dimension must be divisible by 4 for axial RoPE")
end_x, end_y = config.memory_attention_rope_feat_sizes if end_x is None else (end_x, end_y)
self.end_x, self.end_y = end_x, end_y
freqs = 1.0 / (config.memory_attention_rope_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))

# Generate 2D position indices for axial rotary embedding
@@ -662,7 +664,22 @@ class EdgeTamVideoFeedForward(Sam2VideoFeedForward):


class EdgeTamVideoPreTrainedModel(Sam2VideoPreTrainedModel):
pass
def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, EdgeTamVideoVisionRotaryEmbedding):
dim = module.dim
freqs = 1.0 / (self.config.memory_attention_rope_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
# Generate 2D position indices for axial rotary embedding
flattened_indices = torch.arange(module.end_x * module.end_y, dtype=torch.long)
x_positions = flattened_indices % module.end_x
y_positions = torch.div(flattened_indices, module.end_x, rounding_mode="floor")
freqs_x = torch.outer(x_positions, freqs).float()
freqs_y = torch.outer(y_positions, freqs).float()
inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
inv_freq = inv_freq.repeat_interleave(2, dim=-1)

init.copy_(module.rope_embeddings_cos, inv_freq.cos())
init.copy_(module.rope_embeddings_sin, inv_freq.sin())


class EdgeTamVideoInferenceSession(Sam2VideoInferenceSession):


+ 5
- 0
src/transformers/models/electra/modeling_electra.py View File

@@ -532,6 +532,11 @@ class ElectraPreTrainedModel(PreTrainedModel):
"cross_attentions": ElectraCrossAttention,
}

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, ElectraEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
init.zeros_(module.token_type_ids)

@dataclass
@auto_docstring(


+ 14
- 0
src/transformers/models/encodec/modeling_encodec.py View File

@@ -474,6 +474,20 @@ class EncodecPreTrainedModel(PreTrainedAudioTokenizerBase):
init.xavier_uniform_(param)
elif "bias" in name:
init.constant_(param, 0.0)
elif isinstance(module, EncodecConv1d):
kernel_size = module.conv.kernel_size[0]
stride = torch.tensor(module.conv.stride[0], dtype=torch.int64)
dilation = module.conv.dilation[0]
# Effective kernel size with dilations.
kernel_size = torch.tensor((kernel_size - 1) * dilation + 1, dtype=torch.int64)
init.copy_(module.stride, stride)
init.copy_(module.kernel_size, kernel_size)
init.copy_(module.padding_total, kernel_size - stride)
elif isinstance(module, EncodecEuclideanCodebook):
init.copy_(module.inited, torch.Tensor([True]))
init.zeros_(module.cluster_size)
init.zeros_(module.embed)
init.zeros_(module.embed_avg)


@auto_docstring(
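
On the "effective kernel size" line in the EncodecConv1d branch: dilation spreads the kernel taps apart, so a kernel of size k with dilation d covers (k - 1) * d + 1 input samples. A quick check with illustrative numbers:

kernel_size, dilation = 7, 3                      # illustrative values
effective = (kernel_size - 1) * dilation + 1      # 19 samples, matching the formula in the hunk
print(effective)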


+ 7
- 0
src/transformers/models/eomt/modular_eomt.py View File

@@ -425,6 +425,13 @@ class EomtPreTrainedModel(PreTrainedModel):
elif isinstance(module, EomtEmbeddings):
init.trunc_normal_(module.cls_token, mean=0.0, std=std)
init.zeros_(module.register_tokens)
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
elif isinstance(module, EomtLoss):
empty_weight = torch.ones(module.num_labels + 1)
empty_weight[-1] = module.eos_coef
init.copy_(module.empty_weight, empty_weight)
elif isinstance(module, EomtForUniversalSegmentation):
init.ones_(module.attn_mask_probs)


@auto_docstring(


+ 3
- 0
src/transformers/models/ernie/modular_ernie.py View File

@@ -172,6 +172,9 @@ class ErniePreTrainedModel(PreTrainedModel):
super()._init_weights(module)
if isinstance(module, ErnieLMPredictionHead):
init.zeros_(module.bias)
elif isinstance(module, ErnieEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
init.zeros_(module.token_type_ids)


class ErnieModel(BertModel):


+ 2
- 0
src/transformers/models/esm/modeling_esm.py View File

@@ -558,6 +558,8 @@ class EsmPreTrainedModel(PreTrainedModel):
super()._init_weights(module)
if isinstance(module, EsmLMHead):
init.zeros_(module.bias)
elif isinstance(module, EsmEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))

def get_output_embeddings(self):
# NOTE: get_output_embeddings() must return None to prevent accidental weight tying.


+ 5
- 0
src/transformers/models/esm/modeling_esmfold.py View File

@@ -1979,6 +1979,11 @@ class EsmForProteinFolding(EsmPreTrainedModel):

_can_record_outputs = None

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, EsmForProteinFolding):
init.copy_(module.af2_to_esm, module._af2_to_esm_from_vocab_list(module.config.vocab_list))

def __init__(self, config):
super().__init__(config)



+ 5
- 0
src/transformers/models/evolla/modular_evolla.py View File

@@ -203,6 +203,11 @@ class EvollaSaProtPreTrainedModel(PreTrainedModel):
],
}

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, EvollaSaProtEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


class EvollaSaProtProteinEncoder(EvollaSaProtPreTrainedModel):
def __init__(self, config: SaProtConfig):


+ 6
- 1
src/transformers/models/falcon_mamba/modular_falcon_mamba.py View File

@@ -19,6 +19,7 @@ from typing import Optional
import torch
from torch import nn

from ... import initialization as init
from ...utils import auto_docstring, logging
from ...utils.import_utils import is_mambapy_available, is_torchdynamo_compiling
from ..mamba.configuration_mamba import MambaConfig
@@ -529,7 +530,11 @@ class FalconMambaBlock(MambaBlock):

@auto_docstring
class FalconMambaPreTrainedModel(MambaPreTrainedModel):
pass
def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, FalconMambaMixer):
init.ones_(module.b_c_rms)
init.ones_(module.dt_rms)


class FalconMambaOutput(MambaOutput):


+ 11
- 9
src/transformers/models/flaubert/modeling_flaubert.py View File

@@ -687,15 +687,17 @@ class FlaubertPreTrainedModel(PreTrainedModel):
if isinstance(module, nn.LayerNorm):
init.zeros_(module.bias)
init.ones_(module.weight)
if isinstance(module, FlaubertModel) and self.config.sinusoidal_embeddings:
init.copy_(
module.position_embeddings.weight,
create_sinusoidal_embeddings(
self.config.max_position_embeddings,
self.config.emb_dim,
out=torch.empty_like(module.position_embeddings.weight),
),
)
if isinstance(module, FlaubertModel):
if self.config.sinusoidal_embeddings:
init.copy_(
module.position_embeddings.weight,
create_sinusoidal_embeddings(
self.config.max_position_embeddings,
self.config.emb_dim,
out=torch.empty_like(module.position_embeddings.weight),
),
)
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


@auto_docstring


+ 3
- 0
src/transformers/models/flava/modeling_flava.py View File

@@ -677,6 +677,9 @@ class FlavaPreTrainedModel(PreTrainedModel):
init.zeros_(module.position_embeddings)
if module.mask_token is not None:
init.zeros_(module.mask_token)
elif isinstance(module, FlavaTextEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
init.zeros_(module.token_type_ids)
elif isinstance(module, FlavaMultimodalModel):
if module.use_cls_token:
init.zeros_(module.cls_token)


+ 12
- 0
src/transformers/models/florence2/modular_florence2.py View File

@@ -1373,6 +1373,18 @@ class Florence2VisionPreTrainedModel(PreTrainedModel):

_can_compile_fullgraph = True

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, Florence2VisionPositionalEmbeddingCosine1D):
pos_idx_to_embed = torch.empty((module.max_seq_len, module.embed_dim))
sine, cosine = module.get_sinusoid_embeddings(
max_positions=module.max_seq_len,
embed_dim=module.embed_dim,
)
pos_idx_to_embed[:, 0::2] = sine
pos_idx_to_embed[:, 1::2] = cosine
init.copy_(module.pos_idx_to_embed, pos_idx_to_embed)


@auto_docstring
class Florence2VisionBackbone(Florence2VisionPreTrainedModel):


+ 6
- 0
src/transformers/models/fnet/modeling_fnet.py View File

@@ -374,6 +374,12 @@ class FNetPreTrainedModel(PreTrainedModel):
base_model_prefix = "fnet"
supports_gradient_checkpointing = True

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, FNetEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
init.zeros_(module.token_type_ids)


@dataclass
@auto_docstring(


+ 3
- 0
src/transformers/models/gemma3/modular_gemma3.py View File

@@ -352,6 +352,7 @@ class Gemma3TextScaledWordEmbedding(nn.Embedding):

def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
super().__init__(num_embeddings, embedding_dim, padding_idx)
self.scalar_embed_scale = embed_scale
self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)

def forward(self, input_ids: torch.Tensor):
@@ -576,6 +577,8 @@ class Gemma3PreTrainedModel(Gemma2PreTrainedModel):
# We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight)
elif "RMSNorm" in module.__class__.__name__:
init.zeros_(module.weight)
elif isinstance(module, Gemma3TextScaledWordEmbedding):
init.constant_(module.embed_scale, module.scalar_embed_scale)


def _bidirectional_window_overlay(sliding_window: int) -> Callable[[int, int, int, int], bool]:


+ 29
- 0
src/transformers/models/gemma3n/modular_gemma3n.py View File

@@ -1893,8 +1893,37 @@ class Gemma3nPreTrainedModel(Gemma2PreTrainedModel):
init.ones_(module.weight)
elif isinstance(module, Gemma3nAudioAttention):
init.zeros_(module.per_dim_scale)
q_scale = module.head_dim**-0.5
r_softplus_0 = 1.0 / torch.nn.functional.softplus(torch.tensor(0.0))
init.copy_(module.q_scale, q_scale * r_softplus_0)
init.constant_(module.softcap, module.attention_logits_soft_cap)

lower_causal_mask = torch.tril(torch.ones((module.context_size, module.chunk_size), dtype=torch.bool), diagonal=0).T
upper_causal_mask = torch.tril(
torch.ones((module.chunk_size, module.context_size), dtype=torch.bool),
diagonal=module.max_past_horizon + module.max_future_horizon,
)
local_causal_valid_mask = torch.ones((module.chunk_size, module.context_size), dtype=torch.bool)
local_causal_valid_mask = local_causal_valid_mask * lower_causal_mask * upper_causal_mask
init.copy_(module.local_causal_valid_mask, local_causal_valid_mask)
elif isinstance(module, Gemma3nTextScaledWordEmbedding):
init.constant_(module.embed_scale, module.scalar_embed_scale)
elif isinstance(module, Gemma3nTextAltUp):
init.zeros_(module.correct_output_scale)
init.constant_(module.router_input_scale, self.config.hidden_size**-1.0)
elif isinstance(module, Gemma3nAudioRelativePositionEmbedding):
min_timescale = 1.0
max_timescale = 1.0e4
num_timescales = module.channels // 2
log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(num_timescales - 1, 1)
inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales) * -log_timescale_increment)
init.copy_(module.inv_timescales, inv_timescales.float().unsqueeze(0).unsqueeze(0))
elif isinstance(module, Gemma3nTextModel):
init.constant_(module.per_layer_projection_scale, self.config.hidden_size**-0.5)
init.constant_(module.per_layer_input_scale, math.rsqrt(2.0))
if hasattr(module, "gradient_clipping"):
init.constant_(module.gradient_clipping, self.config.gradient_clipping)


@auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.")


+ 3
- 0
src/transformers/models/git/modeling_git.py View File

@@ -396,6 +396,7 @@ class GitPreTrainedModel(PreTrainedModel):
init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range)
init.normal_(module.patch_embedding.weight, std=self.config.initializer_range)
init.normal_(module.position_embedding.weight, std=self.config.initializer_range)
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
if isinstance(module, nn.Linear):
init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
@@ -408,6 +409,8 @@ class GitPreTrainedModel(PreTrainedModel):
elif isinstance(module, nn.LayerNorm):
init.zeros_(module.bias)
init.ones_(module.weight)
elif isinstance(module, GitEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Git


+ 0
- 1
src/transformers/models/glm4v/modular_glm4v.py View File

@@ -409,7 +409,6 @@ class Glm4vVisionEmbeddings(nn.Module):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
"""


+ 7
- 0
src/transformers/models/gpt2/modeling_gpt2.py View File

@@ -492,6 +492,13 @@ class GPT2PreTrainedModel(PreTrainedModel):
elif isinstance(module, nn.LayerNorm):
init.zeros_(module.bias)
init.ones_(module.weight)
elif isinstance(module, GPT2Attention):
max_positions = module.config.max_position_embeddings
init.copy_(module.bias,
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
)

# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
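
The "Special Scaled Initialization" referenced in the comment divides the usual initializer std for every residual-path projection (c_proj) by sqrt(2 * n_layer), since each transformer block contributes two residual additions. A quick check of the arithmetic with illustrative values:

import math

initializer_range, n_layer = 0.02, 12                     # illustrative GPT-2-small-like values
scaled_std = initializer_range / math.sqrt(2 * n_layer)   # ~0.00408, the std used for c_proj above
print(scaled_std)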


+ 3
- 0
src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py View File

@@ -369,6 +369,9 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):
init.normal_(
module.c_proj.weight, mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer)
)
elif isinstance(module, GPTBigCodeModel):
max_positions = module.config.max_position_embeddings
init.copy_(module.bias, torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)))


@auto_docstring


+ 12
- 1
src/transformers/models/gpt_neo/modeling_gpt_neo.py View File

@@ -70,11 +70,11 @@ class GPTNeoSelfAttention(nn.Module):
# local causal self attention is a sliding window where each token can only attend to the previous
# window_size tokens. This is implemented by updating the causal mask such that for each token
# all other tokens are masked except the previous window_size tokens.
self.attention_type = attention_type
if attention_type == "local":
bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size))

self.register_buffer("bias", bias, persistent=False)
self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)

self.attn_dropout = nn.Dropout(float(config.attention_dropout))
self.resid_dropout = nn.Dropout(float(config.resid_dropout))
@@ -382,6 +382,17 @@ class GPTNeoPreTrainedModel(PreTrainedModel):
_supports_flash_attn = True
_can_compile_fullgraph = False # TODO: needs a hybrid cache

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, GPTNeoSelfAttention):
max_positions = module.config.max_position_embeddings
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=bool)).view(
1, 1, max_positions, max_positions
)
if module.attention_type == "local":
bias = torch.bitwise_xor(bias, torch.tril(bias, -module.config.window_size))
init.copy_(module.bias, bias)


@auto_docstring
class GPTNeoModel(GPTNeoPreTrainedModel):
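
The comment in the first hunk describes how the local (sliding-window) mask is derived from the full causal mask: XOR-ing it with a tril shifted down by window_size clears every position older than the window. A small sketch with toy sizes:

import torch

max_positions, window_size = 6, 2                  # toy sizes for illustration
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool))
local = torch.bitwise_xor(bias, torch.tril(bias, -window_size))
# row i of `local` is True only for columns i-window_size+1 .. i, a causal band of width window_size
print(local.int())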


+ 6
- 0
src/transformers/models/granite_speech/modeling_granite_speech.py View File

@@ -293,6 +293,12 @@ class GraniteSpeechPreTrainedModel(PreTrainedModel):
super()._init_weights(module)
if isinstance(module, GraniteSpeechEncoderProjector):
init.normal_(module.query)
elif isinstance(module, GraniteSpeechCTCEncoder):
context_size = module.config.context_size
seq = torch.arange(context_size)
relpos_dist = seq.view(-1, 1) - seq.view(1, -1)
attention_dists = torch.clamp(relpos_dist, -context_size, context_size) + module.config.max_pos_emb
init.copy_(module.attention_dists, attention_dists)


@auto_docstring(


+ 5
- 0
src/transformers/models/grounding_dino/modeling_grounding_dino.py View File

@@ -1430,6 +1430,11 @@ class GroundingDinoPreTrainedModel(PreTrainedModel):
elif isinstance(module, GroundingDinoMLPPredictionHead):
init.constant_(module.layers[-1].weight, 0)
init.constant_(module.layers[-1].bias, 0)
elif isinstance(module, GroundingDinoFrozenBatchNorm2d):
init.ones_(module.weight)
init.zeros_(module.bias)
init.zeros_(module.running_mean)
init.ones_(module.running_var)

if hasattr(module, "reference_points") and not self.config.two_stage:
init.xavier_uniform_(module.reference_points.weight, gain=1.0)


+ 1
- 0
src/transformers/models/groupvit/modeling_groupvit.py View File

@@ -766,6 +766,7 @@ class GroupViTPreTrainedModel(PreTrainedModel):
if isinstance(module, GroupViTTextEmbeddings):
init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
elif isinstance(module, GroupViTAttention):
factor = self.config.initializer_factor
in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor


+ 2
- 0
src/transformers/models/ibert/modeling_ibert.py View File

@@ -603,6 +603,8 @@ class IBertPreTrainedModel(PreTrainedModel):
init.ones_(module.weight)
elif isinstance(module, IBertLMHead):
init.zeros_(module.bias)
elif isinstance(module, IBertEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))

def resize_token_embeddings(self, new_num_tokens=None):
raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")


+ 9
- 0
src/transformers/models/idefics/modeling_idefics.py View File

@@ -852,6 +852,15 @@ class IdeficsPreTrainedModel(PreTrainedModel):
init.normal_(module.alpha_dense, mean=0.0, std=self.config.alphas_initializer_range)
elif isinstance(module, IdeficsPerceiverResampler):
init.normal_(module.latents)
elif isinstance(module, IdeficsEmbedding):
inv_freq = 1.0 / (module.base ** (torch.arange(0, module.dim, 2) / module.dim))
init.copy_(module.inv_freq, inv_freq)
t = torch.arange(module.max_position_embeddings).type_as(inv_freq)
freqs = torch.einsum("i,j->ij", t, inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
init.copy_(module.cos_cached, emb.cos())
init.copy_(module.sin_cached, emb.sin())


@auto_docstring
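
The IdeficsEmbedding branch above rebuilds the rotary cos/sin caches from inv_freq. A compact sketch of the same recipe with illustrative dim/base/sequence-length values:

import torch

dim, base, max_position_embeddings = 8, 10000.0, 16               # illustrative values
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
t = torch.arange(max_position_embeddings).type_as(inv_freq)
freqs = torch.einsum("i,j->ij", t, inv_freq)                       # (seq, dim/2)
emb = torch.cat((freqs, freqs), dim=-1)                            # the permutation the comment mentions
cos_cached, sin_cached = emb.cos(), emb.sin()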


+ 4
- 2
src/transformers/models/imagegpt/modeling_imagegpt.py View File

@@ -61,7 +61,7 @@ class ImageGPTLayerNorm(nn.Module):
class ImageGPTAttention(nn.Module):
def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
max_positions = config.max_position_embeddings
self.register_buffer(
"bias",
@@ -70,7 +70,6 @@ class ImageGPTAttention(nn.Module):
),
persistent=False,
)
self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
@@ -384,6 +383,9 @@ class ImageGPTPreTrainedModel(PreTrainedModel):
if "c_proj" in name and "weight" in name:
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
init.normal_(p, mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
elif isinstance(module, ImageGPTAttention):
max_positions = module.config.max_position_embeddings
init.copy_(module.bias, torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(1, 1, max_positions, max_positions))


@auto_docstring


+ 2
- 0
src/transformers/models/instructblip/modeling_instructblip.py View File

@@ -335,6 +335,8 @@ class InstructBlipPreTrainedModel(PreTrainedModel):
init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
elif isinstance(module, (InstructBlipForConditionalGeneration, InstructBlipModel)):
init.zeros_(module.query_tokens)
elif isinstance(module, InstructBlipQFormerEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


# Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->InstructBlip


+ 5
- 0
src/transformers/models/janus/modular_janus.py View File

@@ -391,6 +391,11 @@ class JanusPreTrainedModel(PreTrainedModel):

_can_compile_fullgraph = True

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, JanusVisionEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


@dataclass
@auto_docstring(


+ 5
- 1
src/transformers/models/kosmos2/modeling_kosmos2.py View File

@@ -559,6 +559,7 @@ class Kosmos2TextSinusoidalPositionalEmbedding(nn.Module):
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.num_positions = num_positions
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -1138,6 +1139,7 @@ class Kosmos2PreTrainedModel(PreTrainedModel):
init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
elif isinstance(module, Kosmos2VisionAttention):
in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
out_proj_std = (module.embed_dim**-0.5) * factor
@@ -1170,11 +1172,13 @@ class Kosmos2PreTrainedModel(PreTrainedModel):
elif isinstance(module, nn.LayerNorm):
init.ones_(module.weight)
init.zeros_(module.bias)
elif isinstance(module, Kosmos2TextSinusoidalPositionalEmbedding):
emb_weights = module.get_embedding(module.num_positions + module.offset, module.embedding_dim, module.padding_idx)
init.copy_(module.weights, emb_weights)

if isinstance(module, nn.Linear) and module.bias is not None:
init.zeros_(module.bias)


class Kosmos2VisionModel(Kosmos2PreTrainedModel):
config: Kosmos2VisionConfig
main_input_name = "pixel_values"


+ 3
- 0
src/transformers/models/kosmos2_5/modeling_kosmos2_5.py View File

@@ -1253,6 +1253,9 @@ class Kosmos2_5PreTrainedModel(PreTrainedModel):
init.zeros_(module.bias)
elif isinstance(module, Kosmos2_5ImageToTextProjection):
init.normal_(module.latent_query, mean=0.0, std=1.0)
elif isinstance(module, Kosmos2_5TextSinusoidalPositionalEmbedding):
emb_weights = module.get_embedding(module.num_positions + module.offset, module.embedding_dim, module.padding_idx)
init.copy_(module.weights, emb_weights)


class Kosmos2_5VisionModel(Kosmos2_5PreTrainedModel):


+ 8
- 1
src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py View File

@@ -213,7 +213,13 @@ class KyutaiSpeechToTextFeatureExtractor(EncodecFeatureExtractor):


class KyutaiSpeechToTextPreTrainedModel(MoshiPreTrainedModel):
pass
def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, KyutaiSpeechToTextEmbeddings):
audio_tokens_offsets = torch.arange(module.config.num_codebooks) * module.config.codebook_vocab_size
audio_tokens_offsets += module.config.vocab_size
audio_tokens_offsets = nn.functional.pad(audio_tokens_offsets, (1, 0))
init.copy_(module.audio_tokens_offsets, audio_tokens_offsets)


class KyutaiSpeechToTextConv1dPaddingCache(MimiConv1dPaddingCache):
@@ -223,6 +229,7 @@ class KyutaiSpeechToTextConv1dPaddingCache(MimiConv1dPaddingCache):
class KyutaiSpeechToTextEmbeddings(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.embed_tokens = nn.Embedding(
config.vocab_size + (config.num_codebooks * config.codebook_vocab_size) + 1,
config.hidden_size,


+ 2
- 0
src/transformers/models/layoutlm/modeling_layoutlm.py View File

@@ -431,6 +431,8 @@ class LayoutLMPreTrainedModel(PreTrainedModel):
super()._init_weights(module)
if isinstance(module, LayoutLMLMPredictionHead):
init.zeros_(module.bias)
elif isinstance(module, LayoutLMEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


@auto_docstring


+ 6
- 0
src/transformers/models/layoutlmv2/modeling_layoutlmv2.py View File

@@ -467,6 +467,12 @@ class LayoutLMv2PreTrainedModel(PreTrainedModel):
if self.config.fast_qkv:
init.zeros_(module.q_bias)
init.zeros_(module.v_bias)
elif isinstance(module, LayoutLMv2Embeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
elif isinstance(module, LayoutLMv2VisualBackbone):
num_channels = len(module.cfg.MODEL.PIXEL_MEAN)
init.copy_(module.pixel_mean, torch.Tensor(module.cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1))
init.copy_(module.pixel_std, torch.Tensor(module.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1))
elif isinstance(module, LayoutLMv2Model):
if hasattr(module, "visual_segment_embedding"):
init.normal_(module.visual_segment_embedding, mean=0.0, std=self.config.initializer_range)


+ 2
- 0
src/transformers/models/layoutlmv3/modeling_layoutlmv3.py View File

@@ -212,6 +212,8 @@ class LayoutLMv3PreTrainedModel(PreTrainedModel):
if self.config.visual_embed:
init.zeros_(module.cls_token)
init.zeros_(module.pos_embed)
elif isinstance(module, LayoutLMv3TextEmbeddings):
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


class LayoutLMv3SelfAttention(nn.Module):


+ 6
- 0
src/transformers/models/led/modeling_led.py View File

@@ -23,6 +23,7 @@ import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ... import initialization as init
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
@@ -1076,6 +1077,11 @@ class LEDPreTrainedModel(PreTrainedModel):
"input_ids": input_ids,
}
return dummy_inputs
def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, LEDForConditionalGeneration):
init.zeros_(module.final_logits_bias)


@dataclass


+ 12
- 0
src/transformers/models/levit/modeling_levit.py View File

@@ -165,6 +165,7 @@ class LevitAttention(nn.Module):

points = list(itertools.product(range(resolution), range(resolution)))
len_points = len(points)
self.len_points = len_points
attention_offsets, indices = {}, []
for p1 in points:
for p2 in points:
@@ -172,6 +173,7 @@ class LevitAttention(nn.Module):
if offset not in attention_offsets:
attention_offsets[offset] = len(attention_offsets)
indices.append(attention_offsets[offset])
self.indices = indices

self.attention_bias_cache = {}
self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
@@ -243,6 +245,8 @@ class LevitAttentionSubsample(nn.Module):
points = list(itertools.product(range(resolution_in), range(resolution_in)))
points_ = list(itertools.product(range(resolution_out), range(resolution_out)))
len_points, len_points_ = len(points), len(points_)
self.len_points_ = len_points_
self.len_points = len_points
attention_offsets, indices = {}, []
for p1 in points_:
for p2 in points:
@@ -251,6 +255,7 @@ class LevitAttentionSubsample(nn.Module):
if offset not in attention_offsets:
attention_offsets[offset] = len(attention_offsets)
indices.append(attention_offsets[offset])
self.indices = indices

self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
self.register_buffer(
@@ -472,6 +477,13 @@ class LevitPreTrainedModel(PreTrainedModel):
input_modalities = ("image",)
_no_split_modules = ["LevitResidualLayer"]

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, LevitAttention):
init.copy_(module.attention_bias_idxs, torch.LongTensor(module.indices).view(module.len_points, module.len_points))
elif isinstance(module, LevitAttentionSubsample):
init.copy_(module.attention_bias_idxs, torch.LongTensor(module.indices).view(module.len_points_, module.len_points))


@auto_docstring
class LevitModel(LevitPreTrainedModel):


+ 3
- 0
src/transformers/models/lfm2_moe/modular_lfm2_moe.py View File

@@ -168,6 +168,9 @@ class Lfm2MoePreTrainedModel(LlamaPreTrainedModel):
if isinstance(module, Lfm2MoeExperts):
init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
elif isinstance(module, Lfm2MoeSparseMoeBlock):
if module.use_expert_bias:
init.zeros_(module.expert_bias)


class Lfm2MoeModel(MixtralModel):


+ 6
- 0
src/transformers/models/speecht5/modeling_speecht5.py View File

@@ -3008,6 +3008,12 @@ class SpeechT5HifiGan(PreTrainedModel):
# Initialize weights and apply final processing
self.post_init()

def _init_weights(self, module):
super()._init_weights(module)
if isinstance(module, SpeechT5HifiGan):
init.zeros_(module.mean)
init.ones_(module.scale)

def apply_weight_norm(self):
weight_norm = nn.utils.weight_norm
if hasattr(nn.utils.parametrizations, "weight_norm"):

