15 Commits

Author SHA1 Message Date
  Raushan Turganbay b61da25169
Remove duplicated processor class from config (#42806) 5 hours ago
  Chunyu 24275124c6
Add local kernel loading support to KernelConfig(). (#42800) 5 hours ago
  Cyril Vallez 4d6516e256
Simplify tie weights logic (#42895) 6 hours ago
  Wang, Yi 24b311eead
fix FastSpeech2ConformerTokenizer crash in tokenize (#42888) 6 hours ago
  r0 0f89661972
Added kernels from kernel hub for Bamba model (#41540) 6 hours ago
  Avihu Dekel 5d2f82b530
Fix GraniteMoeHybrid in transformers v5 (#42872) 6 hours ago
  Anton Vlasjuk 4e7cecb24d
[`Ernie 4.5 Moe`] Fix routing, weights, and update expectations (#42653) 7 hours ago
  Julien Denize 252afd8968
Fix convert_tekken_tokenizer (#42592) 7 hours ago
  Taisei Yamamoto 89998bddca
Stop collecting all model parameters to save models when using DeepSpeed and LoRA (#41416) 7 hours ago
  Cyril Vallez 8d526c238a
[modular] Fix a weird renaming edge-case (#42844) 7 hours ago
  Patrick von Platen 7960b5ea40
[Devstral] Make sure FP8 conversion works correctly (#42715) 7 hours ago
  Tom Aarsen 6c7c992faf
Add missing ModelOutput subclass return type hints (#41219) 7 hours ago
  Preetam Chhimpa 0f97c688d5
Fix BLT training_ci overfit test (#42685) 7 hours ago
  Abubakar Abid 7f52a2a4ea
Add `.on_push_begin()` callback to Trainer and implement for `TrackioCallback` (#42850) 14 hours ago
  Steven Liu 31de95ef71
[docs] optimizations quickstart (#42538) 19 hours ago
53 changed files with 1264 additions and 369 deletions
Split View
  1. +2
    -0
      docs/source/en/_toctree.yml
  2. +178
    -0
      docs/source/en/optimization_overview.md
  3. +250
    -0
      examples/modular-transformers/modeling_test_suffix.py
  4. +12
    -0
      examples/modular-transformers/modular_test_suffix.py
  5. +14
    -29
      examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
  6. +3
    -0
      src/transformers/conversion_mapping.py
  7. +2
    -12
      src/transformers/feature_extraction_utils.py
  8. +2
    -12
      src/transformers/image_processing_base.py
  9. +1
    -1
      src/transformers/integrations/hub_kernels.py
  10. +35
    -0
      src/transformers/integrations/integration_utils.py
  11. +12
    -0
      src/transformers/integrations/mistral.py
  12. +17
    -22
      src/transformers/modeling_utils.py
  13. +15
    -16
      src/transformers/models/bamba/modeling_bamba.py
  14. +15
    -15
      src/transformers/models/bamba/modular_bamba.py
  15. +149
    -0
      src/transformers/models/blt/modeling_blt.py
  16. +155
    -1
      src/transformers/models/blt/modular_blt.py
  17. +4
    -4
      src/transformers/models/dac/modeling_dac.py
  18. +2
    -2
      src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
  19. +2
    -2
      src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
  20. +2
    -2
      src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
  21. +8
    -8
      src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py
  22. +8
    -8
      src/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py
  23. +1
    -0
      src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py
  24. +1
    -1
      src/transformers/models/flava/modeling_flava.py
  25. +4
    -0
      src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py
  26. +24
    -18
      src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py
  27. +10
    -1
      src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py
  28. +3
    -3
      src/transformers/models/grounding_dino/modeling_grounding_dino.py
  29. +17
    -18
      src/transformers/models/jamba/modeling_jamba.py
  30. +17
    -17
      src/transformers/models/jamba/modular_jamba.py
  31. +2
    -2
      src/transformers/models/janus/modeling_janus.py
  32. +2
    -2
      src/transformers/models/janus/modular_janus.py
  33. +23
    -23
      src/transformers/models/mamba2/modeling_mamba2.py
  34. +3
    -3
      src/transformers/models/maskformer/modeling_maskformer_swin.py
  35. +74
    -61
      src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py
  36. +1
    -1
      src/transformers/models/mistral3/convert_mistral3_weights_to_hf.py
  37. +3
    -3
      src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py
  38. +1
    -4
      src/transformers/models/qwen3_next/modeling_qwen3_next.py
  39. +4
    -4
      src/transformers/models/tvp/modeling_tvp.py
  40. +3
    -3
      src/transformers/models/udop/modeling_udop.py
  41. +2
    -14
      src/transformers/processing_utils.py
  42. +11
    -8
      src/transformers/testing_utils.py
  43. +13
    -1
      src/transformers/trainer.py
  44. +8
    -0
      src/transformers/trainer_callback.py
  45. +71
    -18
      src/transformers/utils/kernel_config.py
  46. +1
    -7
      src/transformers/video_processing_utils.py
  47. +0
    -4
      tests/models/blt/test_modeling_blt.py
  48. +43
    -15
      tests/models/ernie4_5_moe/test_modeling_ernie4_5_moe.py
  49. +8
    -0
      tests/test_processing_common.py
  50. +6
    -0
      tests/test_training_mixin.py
  51. +0
    -3
      tests/trainer/test_trainer.py
  52. +14
    -0
      tests/trainer/test_trainer_callback.py
  53. +6
    -1
      utils/modular_model_converter.py

+ 2
- 0
docs/source/en/_toctree.yml View File

@@ -68,6 +68,8 @@
title: Perplexity of fixed-length models
title: Generate API
- sections:
- local: optimization_overview
title: Overview
- local: attention_interface
title: Attention backends
- local: continuous_batching


+ 178
- 0
docs/source/en/optimization_overview.md View File

@@ -0,0 +1,178 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Overview

Transformers provides multiple inference optimization techniques to make models fast, affordable, and accessible. Options include alternative attention mechanisms for reduced memory traffic, code compilation for faster execution, and optimized kernels for throughput. Stack these techniques for maximum performance.

> [!NOTE]
> Memory and speed are closely related but not the same. Shrinking your memory footprint makes a model "faster" because there is less data to move around. Pure speed optimizations don't always reduce memory and sometimes increase usage. Choose the appropriate optimization based on your use case and hardware.

Use the table below to pick an optimization technique.

| Technique | Speed | Memory |
|---|:---:|:---:|
| [Compilation](#compilation) | ✅ | |
| [Attention backends](#attention-backends) | ✅ | ✅ |
| [Kernels](#kernels) | ✅ | ✅ |
| [Quantization](#quantization) | ✅ | ✅ |
| [Caching](#caching) | ✅ | ✅ |
| [Parallelism](#parallelism) | ✅ | |
| [Continuous batching](#continuous-batching) | ✅ | |

This guide gives you a quick start on Transformers optimizations.

## Compilation

[torch.compile](./perf_torch_compile) reduces Python overhead, fuses operations, and creates kernels tuned for your shapes and hardware. The first run warms it up and subsequent runs use the faster compiled path.

Pass a [fixed size cache](./kv_cache#fixed-size-cache) to [`~GenerationMixin.generate`] to trigger `torch.compile` automatically.

```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", dtype=torch.float16, device_map="auto")
input = tokenizer("The French Bread Law states", return_tensors="pt").to(model.device)

output = model.generate(**input, do_sample=False, max_new_tokens=20, cache_implementation="static")
tokenizer.batch_decode(output, skip_special_tokens=True)[0]
```

> [!WARNING]
> Avoid calling `torch.compile(model)` outside of [`~GenerationMixin.generate`] to prevent the model from recompiling every step.

## Attention backends

Alternative [attention backends](./attention_interface) lower memory traffic. For example, FlashAttention tiles attention computations and avoids large intermediate tensors to reduce memory footprint.

Set `attn_implementation` in [`~PreTrainedModel.from_pretrained`] to load an optimized attention backend.

```py
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", attn_implementation="flash_attention_2")
```

## Kernels

Kernels fuse operations to boost throughput and reduce memory usage. The [Kernels](https://huggingface.co/docs/kernels/en/index) library loads optimized compute kernels from the [Hub](https://huggingface.co/kernels-community) in a flexible and version-safe way.

The example below loads an optimized FlashAttention-2 kernel without installing the package.

```py
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3-0.6B", attn_implementation="kernels-community/flash-attn2"
)
```

## Quantization

[Quantization](./quantization/overview) shrinks the size of every parameter which lowers memory footprint and increases speed because you can do more operations.

Pass a quantization config to the `quantization_config` argument in [`~PreTrainedModel.from_pretrained`]. Each quantization backend has a different config with different arguments. The example below quantizes a model to 4-bits and configures the computation dtype with the [bitsandbytes](./quantization/bitsandbytes) backend.

```py
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

model = AutoModelForCausalLM.from_pretrained(
"allenai/Olmo-3-7B-Think", quantization_config=bnb_config
)
```

## Caching

[Caching](./kv_cache) speeds up generation by reusing past keys and values instead of recomputing them for every token. To offset and reduce the memory cost of storing past keys and values, Transformers
supports offloading the cache to the CPU. Only the current layer remains on the GPU.

Use the `cache_implementation` argument in [`~GenerationMixin.generate`] to set a cache strategy.

```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3-0.6B", attn_implementation="kernels-community/flash-attn2"
)
inputs = tokenizer("The Le Décret Pain states that a baguette must,", return_tensors="pt")
outputs = model.generate(**inputs, do_sample=False, max_new_tokens=50, cache_implementation="offloaded")
```

## Parallelism

[Parallelism](./perf_infer_gpu_multi) distributes a model across devices so models too big for one device run fast. This approach uses more memory due to sharding overhead and communication to sync results.

[Tensor parallelism](./perf_infer_gpu_multi) splits a model layer across devices. Set `tp_plan="auto"` in [`~PreTrainedModel.from_pretrained`] to enable it.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", tp_plan="auto")
print(model._tp_plan)
```

## Continuous batching

[Continuous batching](./continuous_batching) maximizes throughput by keeping the GPU busy with dynamic scheduling and chunked prefill. [Serving](./serving) applications use it to process multiple incoming requests concurrently.

Use [`~ContinuousMixin.generate_batch`] to enable continuous batching.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3-0.6B",
attn_implementation="paged|sdpa",
device_map="cuda",
dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

prompts = [
"The Le Décret Pain states that a baguette must",
"Explain gravity in one sentence.",
"Name the capital of France.",
]
inputs = [tokenizer.encode(p) for p in prompts]

generation_config = GenerationConfig(
max_new_tokens=32,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
do_sample=False,
max_batch_tokens=512,
)

outputs = model.generate_batch(
inputs=inputs,
generation_config=generation_config,
)

for request_id, output in outputs.items():
text = tokenizer.decode(output.generated_tokens, skip_special_tokens=True)
print(f"[{request_id}] {text}")
```

+ 250
- 0
examples/modular-transformers/modeling_test_suffix.py View File

@@ -0,0 +1,250 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from examples/modular-transformers/modular_test_suffix.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_test_suffix.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from collections.abc import Callable
from typing import Optional

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs
from .configuration_test_suffix import TestSuffixLlamaConfig


class TestSuffixDecoderLayer(nn.Module):
    """Empty placeholder decoder layer used by the modular-converter test suite.

    Fix: the base class must be `nn.Module` (capital M) — `torch.nn` has no
    lowercase `module` attribute, so `nn.module` raises AttributeError at import.
    """


@use_kernel_forward_from_hub("RMSNorm")
class TestSuffixLlamaRMSNorm(nn.Module):
    """Root-mean-square layer norm (equivalent to T5LayerNorm): no mean-centering,
    no bias — only a learned per-channel scale applied after RMS normalization."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        # Initialised to ones so the layer starts out as a pure RMS normalisation.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        original_dtype = hidden_states.dtype
        # Compute statistics in float32 for numerical stability, then cast back.
        states_fp32 = hidden_states.to(torch.float32)
        mean_square = states_fp32.pow(2).mean(-1, keepdim=True)
        normalized = states_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class TestSuffixLlamaMLP(nn.Module):
    """Gated (SwiGLU-style) feed-forward block: down_proj(act(gate_proj(x)) * up_proj(x))."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        use_bias = config.mlp_bias
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=use_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=use_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=use_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        # Elementwise gate on the up projection, then project back to the hidden size.
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)


def rotate_half(x):
    """Rotates half the hidden dims of the input: split the last dimension in two
    and return (-second_half, first_half) concatenated along that dimension."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)


@use_kernel_func_from_hub("rotary_pos_emb")
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding (RoPE) to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused; kept for backward compatibility.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos` and `sin` are unsqueezed so they broadcast
            against `q` and `k`. Use 1 when q/k are laid out as
            [batch, heads, seq_len, head_dim] and 2 for [batch, seq_len, heads, head_dim].

    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using
        the Rotary Position Embedding.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    rotated_q = q * cos_b + rotate_half(q) * sin_b
    rotated_k = k * cos_b + rotate_half(k) * sin_b
    return rotated_q, rotated_k


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat each key/value head `n_rep` times along the head dimension — the equivalent of
    torch.repeat_interleave(x, dim=1, repeats=n_rep). Shape goes from
    (batch, num_key_value_heads, seq_len, head_dim) to (batch, num_key_value_heads * n_rep, seq_len, head_dim).
    """
    batch, num_kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        # Nothing to repeat; return the input unchanged.
        return hidden_states
    expanded = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    """Reference (non-fused) scaled dot-product attention with optional additive mask and dropout.

    Returns the attention output (batch, seq, heads, head_dim layout after transpose)
    together with the attention weights.
    """
    # Expand grouped KV heads so key/value match the number of query heads.
    expanded_key = repeat_kv(key, module.num_key_value_groups)
    expanded_value = repeat_kv(value, module.num_key_value_groups)

    scores = torch.matmul(query, expanded_key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        # The mask may cover more positions than the key sequence; slice to the key length.
        scores = scores + attention_mask[:, :, :, : expanded_key.shape[-2]]

    # Softmax in float32 for numerical stability, then cast back to the query dtype.
    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
    context = torch.matmul(probs, expanded_value)
    context = context.transpose(1, 2).contiguous()

    return context, probs


@use_kernelized_func(apply_rotary_pos_emb)
class TestSuffixLlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: TestSuffixLlamaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        # head_dim may be set explicitly in the config; otherwise derive it from the hidden size.
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        attn_bias = config.attention_bias
        q_dim = config.num_attention_heads * self.head_dim
        kv_dim = config.num_key_value_heads * self.head_dim
        self.q_proj = nn.Linear(config.hidden_size, q_dim, bias=attn_bias)
        self.k_proj = nn.Linear(config.hidden_size, kv_dim, bias=attn_bias)
        self.v_proj = nn.Linear(config.hidden_size, kv_dim, bias=attn_bias)
        self.o_proj = nn.Linear(q_dim, config.hidden_size, bias=attn_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_and_seq = hidden_states.shape[:-1]
        proj_shape = (*batch_and_seq, -1, self.head_dim)

        # Project and reshape to (batch, heads, seq, head_dim).
        query_states = self.q_proj(hidden_states).view(proj_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(proj_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(proj_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Pick the configured attention backend, defaulting to the eager reference implementation.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        # Merge heads back into the hidden dimension and apply the output projection.
        attn_output = attn_output.reshape(*batch_and_seq, -1).contiguous()
        return self.o_proj(attn_output), attn_weights


class TestSuffixLlamaDecoderLayer(GradientCheckpointingLayer):
    """Pre-norm transformer decoder layer: self-attention and a gated MLP,
    each preceded by RMSNorm and wrapped in a residual connection."""

    def __init__(self, config: TestSuffixLlamaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = TestSuffixLlamaAttention(config=config, layer_idx=layer_idx)

        self.mlp = TestSuffixLlamaMLP(config)
        self.input_layernorm = TestSuffixLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = TestSuffixLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        # Self-attention sub-block (pre-norm + residual); attention weights are discarded.
        attn_input = self.input_layernorm(hidden_states)
        attn_output, _ = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = hidden_states + attn_output

        # Feed-forward sub-block (pre-norm + residual).
        mlp_input = self.post_attention_layernorm(hidden_states)
        hidden_states = hidden_states + self.mlp(mlp_input)
        return hidden_states

+ 12
- 0
examples/modular-transformers/modular_test_suffix.py View File

@@ -0,0 +1,12 @@
import torch.nn as nn

from transformers.models.llama.modeling_llama import LlamaDecoderLayer


class TestSuffixDecoderLayer(nn.Module):
    """Empty placeholder layer; exists only to exercise the modular converter.

    Fix: base class must be `nn.Module` (capital M) — `torch.nn` has no lowercase
    `module` attribute, so `nn.module` raises AttributeError at import.
    """


# Here, we want to add "Llama" as a suffix to the base `TestModel` name for all required dependencies
class TestSuffixLlamaDecoderLayer(LlamaDecoderLayer):
    """Inherits LlamaDecoderLayer unchanged; the converter renames dependencies with the `Llama` suffix."""

+ 14
- 29
examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py View File

@@ -45,10 +45,8 @@ from datasets import DatasetDict, load_dataset
import transformers
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoModelForSpeechSeq2Seq,
AutoProcessor,
AutoTokenizer,
HfArgumentParser,
Seq2SeqTrainer,
Seq2SeqTrainingArguments,
@@ -396,17 +394,9 @@ def main():
if getattr(config, "model_type", None) == "whisper":
config.update({"apply_spec_augment": model_args.apply_spec_augment})

feature_extractor = AutoFeatureExtractor.from_pretrained(
(model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path),
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
tokenizer = AutoTokenizer.from_pretrained(
(model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path),
processor = AutoProcessor.from_pretrained(
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
@@ -432,7 +422,7 @@ def main():

if hasattr(model.generation_config, "is_multilingual") and model.generation_config.is_multilingual:
# We only need to set the language and task ids in a multilingual setting
tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task)
processor.tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task)
model.generation_config.language = data_args.language
model.generation_config.task = data_args.task
elif data_args.language is not None:
@@ -461,20 +451,20 @@ def main():

# 6. Resample speech dataset if necessary
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
if dataset_sampling_rate != feature_extractor.sampling_rate:
if dataset_sampling_rate != processor.feature_extractor.sampling_rate:
raw_datasets = raw_datasets.cast_column(
data_args.audio_column_name,
datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
datasets.features.Audio(sampling_rate=processor.feature_extractor.sampling_rate),
)

# 7. Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets.
max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
max_input_length = data_args.max_duration_in_seconds * processor.feature_extractor.sampling_rate
min_input_length = data_args.min_duration_in_seconds * processor.feature_extractor.sampling_rate
audio_column_name = data_args.audio_column_name
num_workers = data_args.preprocessing_num_workers
text_column_name = data_args.text_column_name
model_input_name = feature_extractor.model_input_names[0]
model_input_name = processor.feature_extractor.model_input_names[0]
do_lower_case = data_args.do_lower_case
# if SpecAugment is used for whisper models, return attention_mask to guide the mask along time axis
forward_attention_mask = (
@@ -492,7 +482,7 @@ def main():
def prepare_dataset(batch):
# process audio
sample = batch[audio_column_name]
inputs = feature_extractor(
inputs = processor.feature_extractor(
sample["array"],
sampling_rate=sample["sampling_rate"],
return_attention_mask=forward_attention_mask,
@@ -505,7 +495,7 @@ def main():

# process targets
input_str = batch[text_column_name].lower() if do_lower_case else batch[text_column_name]
batch["labels"] = tokenizer(input_str).input_ids
batch["labels"] = processor.tokenizer(input_str).input_ids
return batch

with training_args.main_process_first(desc="dataset map pre-processing"):
@@ -543,11 +533,11 @@ def main():
def compute_metrics(pred):
pred_ids = pred.predictions

pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
# we do not want to group tokens when computing the metrics
label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
label_str = processor.tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)

wer = metric.compute(predictions=pred_str, references=label_str)

@@ -558,13 +548,8 @@ def main():
with training_args.main_process_first():
# only the main process saves them
if is_main_process(training_args.local_process_index):
# save feature extractor, tokenizer and config
feature_extractor.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)
config.save_pretrained(training_args.output_dir)

processor = AutoProcessor.from_pretrained(training_args.output_dir)

# 10. Define data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
processor=processor,
@@ -578,7 +563,7 @@ def main():
args=training_args,
train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
processing_class=feature_extractor,
processing_class=processor.feature_extractor,
data_collator=data_collator,
compute_metrics=(compute_metrics if training_args.predict_with_generate else None),
)


+ 3
- 0
src/transformers/conversion_mapping.py View File

@@ -166,6 +166,9 @@ def _build_checkpoint_conversion_mapping():
mapping["deepseek_v3"] = mapping["qwen2_moe"].copy()
mapping["dots1"] = mapping["qwen2_moe"].copy()
mapping["ernie4_5_moe"] = mapping["qwen2_moe"].copy()
mapping["ernie4_5_moe"] += [
WeightRenaming("mlp.moe_statics.e_score_correction_bias", "mlp.gate.moe_statics.e_score_correction_bias")
]
mapping["glm4_moe"] = mapping["qwen2_moe"].copy()
mapping["glm4v_moe"] = mapping["qwen2_moe"].copy()
mapping["longcat_flash"] = mapping["qwen2_moe"].copy()


+ 2
- 12
src/transformers/feature_extraction_utils.py View File

@@ -256,8 +256,8 @@ class FeatureExtractionMixin(PushToHubMixin):

def __init__(self, **kwargs):
"""Set elements of `kwargs` as attributes."""
# Pop "processor_class" as it should be saved as private attribute
self._processor_class = kwargs.pop("processor_class", None)
# Pop "processor_class", it should not be saved in feature extractor config
kwargs.pop("processor_class", None)
# Additional attributes without default values
for key, value in kwargs.items():
try:
@@ -266,10 +266,6 @@ class FeatureExtractionMixin(PushToHubMixin):
logger.error(f"Can't set {key} with value {value} for {self}")
raise err

def _set_processor_class(self, processor_class: str):
"""Sets processor class as an attribute."""
self._processor_class = processor_class

@classmethod
def from_pretrained(
cls: type[SpecificFeatureExtractorType],
@@ -613,12 +609,6 @@ class FeatureExtractionMixin(PushToHubMixin):
if isinstance(value, np.ndarray):
dictionary[key] = value.tolist()

# make sure private name "_processor_class" is correctly
# saved as "processor_class"
_processor_class = dictionary.pop("_processor_class", None)
if _processor_class is not None:
dictionary["processor_class"] = _processor_class

return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"

def to_json_file(self, json_file_path: Union[str, os.PathLike]):


+ 2
- 12
src/transformers/image_processing_base.py View File

@@ -71,8 +71,8 @@ class ImageProcessingMixin(PushToHubMixin):
# This key was saved while we still used `XXXFeatureExtractor` for image processing. Now we use
# `XXXImageProcessor`, this attribute and its value are misleading.
kwargs.pop("feature_extractor_type", None)
# Pop "processor_class" as it should be saved as private attribute
self._processor_class = kwargs.pop("processor_class", None)
# Pop "processor_class", should not be saved with image processing config anymore
kwargs.pop("processor_class", None)
# Additional attributes without default values
for key, value in kwargs.items():
try:
@@ -81,10 +81,6 @@ class ImageProcessingMixin(PushToHubMixin):
logger.error(f"Can't set {key} with value {value} for {self}")
raise err

def _set_processor_class(self, processor_class: str):
"""Sets processor class as an attribute."""
self._processor_class = processor_class

@classmethod
def from_pretrained(
cls: type[ImageProcessorType],
@@ -428,12 +424,6 @@ class ImageProcessingMixin(PushToHubMixin):
if isinstance(value, np.ndarray):
dictionary[key] = value.tolist()

# make sure private name "_processor_class" is correctly
# saved as "processor_class"
_processor_class = dictionary.pop("_processor_class", None)
if _processor_class is not None:
dictionary["processor_class"] = _processor_class

return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"

def to_json_file(self, json_file_path: Union[str, os.PathLike]):


+ 1
- 1
src/transformers/integrations/hub_kernels.py View File

@@ -370,7 +370,7 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _
if callable(is_kernel_available) and is_kernel_available():
# Try to import the module "{kernel_name}" from parent package level
try:
module = importlib.import_module(f"{kernel_name}")
module = importlib.import_module(f"{new_kernel_name}")
mapping[kernel_name] = module
return module
except Exception:


+ 35
- 0
src/transformers/integrations/integration_utils.py View File

@@ -940,6 +940,8 @@ class TrackioCallback(TrainerCallback):
```
"""

SPACE_URL = "https://huggingface.co/spaces/{space_id}"

def __init__(self):
has_trackio = is_trackio_available()
if not has_trackio:
@@ -1058,6 +1060,39 @@ class TrackioCallback(TrainerCallback):
metrics = rewrite_logs(metrics)
self._trackio.log(metrics)

def on_push_begin(self, args, state, control, model, **kwargs):
    """
    Sync the local Trackio project to a Hugging Face Space right before a Hub push,
    add a "Visualize in Trackio" badge to the auto-generated model card comment,
    and tag the model with the Space URL.

    No-ops when not on the main process, when trackio is unavailable, or when no
    Trackio project is active. The `sync` API used below requires trackio >= 0.13.0,
    so older versions only get a warning.
    """
    if not state.is_world_process_zero or self._trackio is None:
        return
    if (current_project := self._trackio.context_vars.current_project.get()) is None:
        return
    trackio_version = packaging.version.parse(self._trackio.__version__)
    if trackio_version < packaging.version.parse("0.13.0"):
        # Message matches the `< 0.13.0` guard above (it previously claimed "<=0.13.0").
        warnings.warn(
            "The version of `trackio` that is installed is <0.13.0, so "
            "the local Trackio project will not be pushed to Hugging Face. Run "
            "`pip install --upgrade trackio` to fix this."
        )
        return

    # Reuse the Space already associated with this run; otherwise sync/create one now.
    space_id = self._trackio.context_vars.current_space_id.get()
    if space_id is None:
        space_id = self._trackio.sync(current_project, force=True)
    space_url = self.SPACE_URL.format(space_id=space_id)

    badge_markdown = (
        f'<a href="{space_url}" target="_blank"><img src="https://raw.githubusercontent.com/gradio-app/trackio/refs/heads/main/trackio/assets/badge.png" alt="Visualize in Trackio"'
        ' title="Visualize in Trackio" style="height: 40px;"/></a>'
    )
    # Only append the badge once, even if pushes happen repeatedly.
    if badge_markdown not in modelcard.AUTOGENERATED_TRAINER_COMMENT:
        modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}"

    # Tag the model for the Hub. Use a single, consistent "trackio:<url>" format
    # (one branch previously used "trackio::") and avoid duplicating tags when
    # `on_push_begin` fires more than once during training.
    url_tag = f"trackio:{space_url}"
    if getattr(model, "model_tags", None) is not None:
        if "trackio" not in model.model_tags:
            model.model_tags.append("trackio")
        if url_tag not in model.model_tags:
            model.model_tags.append(url_tag)
    else:
        model.model_tags = ["trackio", url_tag]


class CometCallback(TrainerCallback):
"""


+ 12
- 0
src/transformers/integrations/mistral.py View File

@@ -77,6 +77,7 @@ def convert_tekken_tokenizer(tokenizer_file: str):
"""Convert a "tekken" tokenizer to a fast Tokenizer."""
# Tekken format -- need to use the Converter

from mistral_common.tokens.tokenizers.base import SpecialTokens
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

# Load directly using their lib
@@ -106,4 +107,15 @@ def convert_tekken_tokenizer(tokenizer_file: str):
# Post-process
tokenizer.add_special_tokens({"additional_special_tokens": all_special})

MAP_SPECAL = {
"bos_token": SpecialTokens.bos.value,
"eos_token": SpecialTokens.eos.value,
"pad_token": SpecialTokens.pad.value,
"unk_token": SpecialTokens.unk.value,
}

for special_key, special_token in MAP_SPECAL.items():
if special_token in all_special:
tokenizer.add_special_tokens({special_key: special_token})

return tokenizer

+ 17
- 22
src/transformers/modeling_utils.py View File

@@ -2387,11 +2387,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH

tied_keys = list(tied_keys.items())
for i, (target_param_name, source_param_name) in enumerate(tied_keys):
# Usually we tie a single target to a single source, but when both are missing we may later tie
# both the source and target to a third "backup" parameter that is present in the checkpoint, so we use
# a list here
target_param_names = [target_param_name]

# This is `from_pretrained` -> let's check symmetrically in case the source key is not present
if missing_keys is not None:
remove_from_missing = True
@@ -2412,7 +2407,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
# We're missing the source but we have the target -> we swap them, tying the parameter that exists
elif not source_is_there and target_is_there:
target_param_name, source_param_name = source_param_name, target_param_name
target_param_names = [target_param_name]
# Both are missing -> check other keys in case more than 2 keys are tied to the same weight
elif not source_is_there and not target_is_there:
for target_backup, source_backup in tied_keys[i + 1 :]:
@@ -2421,10 +2415,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
if source_backup == source_param_name:
target_backup_is_there = target_backup not in missing_keys
# If the target is present, we found the correct weight to tie into (we know the source is missing)
# Note here that we do not tie the missing source right now as well, as it will be done anyway when
# the pair (target_backup, source_backup) becomes the main pair (target_param_name, source_param_name)
if target_backup_is_there:
source_param_name = target_backup
# Append the source as well, since both are missing we'll tie both
target_param_names.append(source_param_name)
break
# If we did not break from the loop, it was impossible to find a source key -> let's raise
else:
@@ -2440,19 +2434,18 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH

# Perform the actual tying
source_param = self.get_parameter_or_buffer(source_param_name)
for target_param_name in target_param_names:
if "." in target_param_name:
parent_name, name = target_param_name.rsplit(".", 1)
parent = self.get_submodule(parent_name)
else:
name = target_param_name
parent = self
# Tie the weights
setattr(parent, name, source_param)
self._adjust_bias(parent, source_param)
# Remove from missing if necessary
if missing_keys is not None and remove_from_missing:
missing_keys.discard(target_param_name)
if "." in target_param_name:
parent_name, name = target_param_name.rsplit(".", 1)
parent = self.get_submodule(parent_name)
else:
name = target_param_name
parent = self
# Tie the weights
setattr(parent, name, source_param)
self._adjust_bias(parent, source_param)
# Remove from missing if necessary
if missing_keys is not None and remove_from_missing:
missing_keys.discard(target_param_name)

def _adjust_bias(self, output_embeddings, input_embeddings):
if getattr(output_embeddings, "bias", None) is not None and hasattr(output_embeddings, "weight"):
@@ -3540,7 +3533,9 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH

# This is a context manager to override the default kernel mapping
# We are calling kernelize inside this context manager using the use_kernels setter
with use_kernel_mapping(kernel_config.kernel_mapping):
# Param inherit_mapping should be False to avoid still loading kernel from remote
inherit_mapping = not kernel_config.use_local_kernel
with use_kernel_mapping(kernel_config.kernel_mapping, inherit_mapping=inherit_mapping):
self.use_kernels = True
# We use the default kernel mapping in .integrations.hub_kernels
else:


+ 15
- 16
src/transformers/models/bamba/modeling_bamba.py View File

@@ -36,6 +36,7 @@ from ... import initialization as init
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
from ...integrations.hub_kernels import lazy_load_kernel
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -44,22 +45,9 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import maybe_autocast
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
from .configuration_bamba import BambaConfig


if is_mamba_2_ssm_available():
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
else:
selective_state_update = None

if is_causal_conv1d_available():
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
causal_conv1d_update, causal_conv1d_fn = None, None


logger = logging.get_logger(__name__)


@@ -501,9 +489,6 @@ def apply_mask_to_padding_states(hidden_states, attention_mask):
return hidden_states


is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))


# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
class BambaMixer(nn.Module):
"""
@@ -575,6 +560,20 @@ class BambaMixer(nn.Module):

self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)

global causal_conv1d_update, causal_conv1d_fn
causal_conv1d = lazy_load_kernel("causal-conv1d")
causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)

global selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
mamba_ssm = lazy_load_kernel("mamba-ssm")
selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
mamba_chunk_scan_combined = getattr(mamba_ssm, "mamba_chunk_scan_combined", None)
mamba_split_conv1d_scan_combined = getattr(mamba_ssm, "mamba_split_conv1d_scan_combined", None)

global is_fast_path_available
is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))

if not is_fast_path_available:
logger.warning_once(
"The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"


+ 15
- 15
src/transformers/models/bamba/modular_bamba.py View File

@@ -43,6 +43,7 @@ from transformers.models.mamba2.modeling_mamba2 import (
)

from ... import initialization as init
from ...integrations.hub_kernels import lazy_load_kernel
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
@@ -52,24 +53,9 @@ from ...utils import (
can_return_tuple,
logging,
)
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
from .configuration_bamba import BambaConfig


if is_mamba_2_ssm_available():
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
else:
selective_state_update = None

if is_causal_conv1d_available():
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))


logger = logging.get_logger(__name__)


@@ -276,6 +262,20 @@ class BambaMixer(nn.Module):

self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)

global causal_conv1d_update, causal_conv1d_fn
causal_conv1d = lazy_load_kernel("causal-conv1d")
causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)

global selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
mamba_ssm = lazy_load_kernel("mamba-ssm")
selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
mamba_chunk_scan_combined = getattr(mamba_ssm, "mamba_chunk_scan_combined", None)
mamba_split_conv1d_scan_combined = getattr(mamba_ssm, "mamba_split_conv1d_scan_combined", None)

global is_fast_path_available
is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))

if not is_fast_path_available:
logger.warning_once(
"The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"


+ 149
- 0
src/transformers/models/blt/modeling_blt.py View File

@@ -444,6 +444,155 @@ class BltPreTrainedModel(PreTrainedModel):
"attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="local_decoder"),
}

@torch.no_grad()
def _init_weights(self, module):
    """
    Initialize BLT weights following the original ByteLatentTransformer:

    - Most weights are drawn from a truncated normal.
    - Scale is ~ 1 / sqrt(model_dim) (or 1 / sqrt(hidden_dim) for FFN outputs).
    - Norm layers are set to weight = 1, bias = 0.

    Each branch below handles one module family and returns immediately, so the
    order of the isinstance checks matters (e.g. norms must be matched before the
    generic ``nn.Linear`` fallback would ever see them).
    """
    class_name = module.__class__.__name__

    # Norms: RMSNorm / LayerNorm
    # NOTE(review): the substring checks presumably catch norm classes that are
    # not importable here (renamed/aliased variants) -- confirm.
    if isinstance(module, (BltRMSNorm, nn.LayerNorm)) or "RMSNorm" in class_name or "LayerNorm" in class_name:
        if getattr(module, "weight", None) is not None:
            nn.init.ones_(module.weight)
        if getattr(module, "bias", None) is not None:
            nn.init.zeros_(module.bias)
        return

    # Embeddings (encoder / patcher / hash embeddings)
    if isinstance(module, nn.Embedding):
        # Prefer the top-level hidden size, then the encoder sub-config, and fall
        # back to the embedding's own dimension when neither is available.
        hidden_size = getattr(self.config, "hidden_size", None)
        if hidden_size is None and hasattr(self.config, "encoder_config"):
            hidden_size = getattr(self.config.encoder_config, "hidden_size", None)
        if hidden_size is None:
            hidden_size = module.embedding_dim

        # Truncated normal at std = 1/sqrt(hidden_size), clipped at +/- 3 std.
        std = hidden_size**-0.5
        nn.init.trunc_normal_(
            module.weight,
            mean=0.0,
            std=std,
            a=-3 * std,
            b=3 * std,
        )
        if module.padding_idx is not None:
            nn.init.zeros_(module.weight[module.padding_idx])
        return

    # Self-attention / cross-attention projections
    if isinstance(module, (BltSelfAttention, BltCrossAttention)) or class_name in (
        "MllamaTextSelfAttention",
        "MllamaTextCrossAttention",
    ):
        # Resolve the model dimension: config -> module attribute -> infer from
        # the last axis of the first projection weight we can find.
        dim = getattr(self.config, "hidden_size", None)
        if dim is None and hasattr(module, "hidden_size"):
            dim = module.hidden_size
        if dim is None:
            for name in ("q_proj", "k_proj", "v_proj", "o_proj", "dense"):
                proj = getattr(module, name, None)
                if proj is not None and hasattr(proj, "weight"):
                    dim = proj.weight.shape[-1]
                    break
        if dim is None:
            return

        std = dim**-0.5

        # Input projections (q, k, v)
        for proj_name in ("q_proj", "k_proj", "v_proj"):
            proj = getattr(module, proj_name, None)
            if proj is not None and hasattr(proj, "weight"):
                nn.init.trunc_normal_(
                    proj.weight,
                    mean=0.0,
                    std=std,
                    a=-3 * std,
                    b=3 * std,
                )
                if getattr(proj, "bias", None) is not None:
                    nn.init.zeros_(proj.bias)

        # Output projection: o_proj or dense
        o_proj = getattr(module, "o_proj", getattr(module, "dense", None))
        if o_proj is not None and hasattr(o_proj, "weight"):
            nn.init.trunc_normal_(
                o_proj.weight,
                mean=0.0,
                std=std,
                a=-3 * std,
                b=3 * std,
            )
            if getattr(o_proj, "bias", None) is not None:
                nn.init.zeros_(o_proj.bias)
        return

    # MLP / FFN blocks
    if isinstance(module, BltMLP) or class_name == "MllamaTextMLP":
        # Model dimension lookup mirrors the embedding branch, additionally
        # trying the decoder sub-config first.
        hidden_size = getattr(self.config, "hidden_size", None)
        if hidden_size is None and hasattr(self.config, "decoder_config"):
            hidden_size = getattr(self.config.decoder_config, "hidden_size", None)
        if hidden_size is None and hasattr(self.config, "encoder_config"):
            hidden_size = getattr(self.config.encoder_config, "hidden_size", None)

        # Input-side std
        in_std = None
        if hidden_size is not None:
            in_std = hidden_size**-0.5

        # Support both gate/up/down naming and the fc1/fc2 naming.
        gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None))
        up_proj = getattr(module, "up_proj", None)
        down_proj = getattr(module, "down_proj", getattr(module, "fc2", None))

        # gate / input projections
        for proj in (gate_proj, up_proj):
            if proj is not None and hasattr(proj, "weight"):
                # weight.shape[1] is in_features for nn.Linear, used as fallback fan-in.
                std = in_std or (proj.weight.shape[1] ** -0.5)
                nn.init.trunc_normal_(
                    proj.weight,
                    mean=0.0,
                    std=std,
                    a=-3 * std,
                    b=3 * std,
                )
                if getattr(proj, "bias", None) is not None:
                    nn.init.zeros_(proj.bias)

        # output/ down projections
        if down_proj is not None and hasattr(down_proj, "weight"):
            # Scale by the FFN's inner width rather than the model width.
            hidden_dim = down_proj.weight.shape[1]
            out_std = hidden_dim**-0.5
            nn.init.trunc_normal_(
                down_proj.weight,
                mean=0.0,
                std=out_std,
                a=-3 * out_std,
                b=3 * out_std,
            )
            if getattr(down_proj, "bias", None) is not None:
                nn.init.zeros_(down_proj.bias)
        return

    # Generic Linear layers (projections, lm_head, etc.)
    if isinstance(module, nn.Linear):
        fan_in = module.in_features
        std = fan_in**-0.5
        nn.init.trunc_normal_(
            module.weight,
            mean=0.0,
            std=std,
            a=-3 * std,
            b=3 * std,
        )
        if module.bias is not None:
            nn.init.zeros_(module.bias)
        return

    # Unrecognized module types are intentionally left at their default init.
    return


class BltLocalEncoder(BltPreTrainedModel):
config: BltLocalEncoderConfig


+ 155
- 1
src/transformers/models/blt/modular_blt.py View File

@@ -360,8 +360,162 @@ class BltPreTrainedModel(MllamaPreTrainedModel):
"attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="local_decoder"),
}

# Weight initialization is adapted from:
# - https://github.com/facebookresearch/blt/blob/main/bytelatent/model/blt.py
# - https://github.com/pytorch/torchtitan/blob/main/torchtitan/experiments/transformers_modeling_backend/model/model.py
#
# Both implementations use truncated normal initialization with std ~ 1 / sqrt(d_model)
# (or 1 / sqrt(hidden_dim) for FFN outputs), and unit initialization for normalization layers.
# We follow the same scheme here, but expressed in the Transformers APIs.

@torch.no_grad()
def _init_weights(self, module):
raise AttributeError("No need to inherit it!")
"""
Initialize BLT weights following the original ByteLatentTransformer:

- Most weights are drawn from a truncated normal.
- Scale is ~ 1 / sqrt(model_dim) (or 1 / sqrt(hidden_dim) for FFN outputs).
- Norm layers are set to weight = 1, bias = 0.
"""
class_name = module.__class__.__name__

# Norms: RMSNorm / LayerNorm
if isinstance(module, (BltRMSNorm, nn.LayerNorm)) or "RMSNorm" in class_name or "LayerNorm" in class_name:
if getattr(module, "weight", None) is not None:
nn.init.ones_(module.weight)
if getattr(module, "bias", None) is not None:
nn.init.zeros_(module.bias)
return

# Embeddings (encoder / patcher / hash embeddings)
if isinstance(module, nn.Embedding):
hidden_size = getattr(self.config, "hidden_size", None)
if hidden_size is None and hasattr(self.config, "encoder_config"):
hidden_size = getattr(self.config.encoder_config, "hidden_size", None)
if hidden_size is None:
hidden_size = module.embedding_dim

std = hidden_size**-0.5
nn.init.trunc_normal_(
module.weight,
mean=0.0,
std=std,
a=-3 * std,
b=3 * std,
)
if module.padding_idx is not None:
nn.init.zeros_(module.weight[module.padding_idx])
return

# Self-attention / cross-attention projections
if isinstance(module, (BltSelfAttention, BltCrossAttention)) or class_name in (
"MllamaTextSelfAttention",
"MllamaTextCrossAttention",
):
dim = getattr(self.config, "hidden_size", None)
if dim is None and hasattr(module, "hidden_size"):
dim = module.hidden_size
if dim is None:
for name in ("q_proj", "k_proj", "v_proj", "o_proj", "dense"):
proj = getattr(module, name, None)
if proj is not None and hasattr(proj, "weight"):
dim = proj.weight.shape[-1]
break
if dim is None:
return

std = dim**-0.5

# Input projections (q, k, v)
for proj_name in ("q_proj", "k_proj", "v_proj"):
proj = getattr(module, proj_name, None)
if proj is not None and hasattr(proj, "weight"):
nn.init.trunc_normal_(
proj.weight,
mean=0.0,
std=std,
a=-3 * std,
b=3 * std,
)
if getattr(proj, "bias", None) is not None:
nn.init.zeros_(proj.bias)

# Output projection: o_proj or dense
o_proj = getattr(module, "o_proj", getattr(module, "dense", None))
if o_proj is not None and hasattr(o_proj, "weight"):
nn.init.trunc_normal_(
o_proj.weight,
mean=0.0,
std=std,
a=-3 * std,
b=3 * std,
)
if getattr(o_proj, "bias", None) is not None:
nn.init.zeros_(o_proj.bias)
return

# MLP / FFN blocks
if isinstance(module, BltMLP) or class_name == "MllamaTextMLP":
hidden_size = getattr(self.config, "hidden_size", None)
if hidden_size is None and hasattr(self.config, "decoder_config"):
hidden_size = getattr(self.config.decoder_config, "hidden_size", None)
if hidden_size is None and hasattr(self.config, "encoder_config"):
hidden_size = getattr(self.config.encoder_config, "hidden_size", None)

# Input-side std
in_std = None
if hidden_size is not None:
in_std = hidden_size**-0.5

gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None))
up_proj = getattr(module, "up_proj", None)
down_proj = getattr(module, "down_proj", getattr(module, "fc2", None))

# gate / input projections
for proj in (gate_proj, up_proj):
if proj is not None and hasattr(proj, "weight"):
std = in_std or (proj.weight.shape[1] ** -0.5)
nn.init.trunc_normal_(
proj.weight,
mean=0.0,
std=std,
a=-3 * std,
b=3 * std,
)
if getattr(proj, "bias", None) is not None:
nn.init.zeros_(proj.bias)

# output/ down projections
if down_proj is not None and hasattr(down_proj, "weight"):
hidden_dim = down_proj.weight.shape[1]
out_std = hidden_dim**-0.5
nn.init.trunc_normal_(
down_proj.weight,
mean=0.0,
std=out_std,
a=-3 * out_std,
b=3 * out_std,
)
if getattr(down_proj, "bias", None) is not None:
nn.init.zeros_(down_proj.bias)
return

# Generic Linear layers (projections, lm_head, etc.)
if isinstance(module, nn.Linear):
fan_in = module.in_features
std = fan_in**-0.5
nn.init.trunc_normal_(
module.weight,
mean=0.0,
std=std,
a=-3 * std,
b=3 * std,
)
if module.bias is not None:
nn.init.zeros_(module.bias)
return

return

def _update_causal_mask(self, module):
raise AttributeError("No need to inherit it!")


+ 4
- 4
src/transformers/models/dac/modeling_dac.py View File

@@ -16,7 +16,7 @@

import math
from dataclasses import dataclass
from typing import Optional
from typing import Optional, Union

import numpy as np
import torch
@@ -583,7 +583,7 @@ class DacModel(DacPreTrainedModel):
input_values: torch.Tensor,
n_quantizers: Optional[int] = None,
return_dict: Optional[bool] = None,
):
) -> Union[tuple, DacEncoderOutput]:
r"""
input_values (`torch.Tensor of shape `(batch_size, 1, time_steps)`):
Input audio data to encode,
@@ -610,7 +610,7 @@ class DacModel(DacPreTrainedModel):
quantized_representation: Optional[torch.Tensor] = None,
audio_codes: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None,
):
) -> Union[tuple, DacDecoderOutput]:
r"""
quantized_representation (torch.Tensor of shape `(batch_size, dimension, time_steps)`, *optional*):
Quantized continuous representation of input.
@@ -643,7 +643,7 @@ class DacModel(DacPreTrainedModel):
input_values: torch.Tensor,
n_quantizers: Optional[int] = None,
return_dict: Optional[bool] = None,
):
) -> Union[tuple, DacOutput]:
r"""
input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`):
Audio data to encode.


+ 2
- 2
src/transformers/models/deepseek_vl/modeling_deepseek_vl.py View File

@@ -196,7 +196,7 @@ class DeepseekVLModel(DeepseekVLPreTrainedModel):
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs,
):
) -> DeepseekVLBaseModelOutputWithPast:
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -268,7 +268,7 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMi
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: Unpack[TransformersKwargs],
):
) -> DeepseekVLCausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,


+ 2
- 2
src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py View File

@@ -314,7 +314,7 @@ class DeepseekVLHybridModel(DeepseekVLHybridPreTrainedModel):
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs,
):
) -> DeepseekVLHybridBaseModelOutputWithPast:
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -424,7 +424,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel,
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: Unpack[TransformersKwargs],
):
) -> DeepseekVLHybridCausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,


+ 2
- 2
src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py View File

@@ -297,7 +297,7 @@ class DeepseekVLHybridModel(DeepseekVLModel):
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs,
):
) -> DeepseekVLHybridBaseModelOutputWithPast:
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -361,7 +361,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: Unpack[TransformersKwargs],
):
) -> DeepseekVLHybridCausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,


+ 8
- 8
src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py View File

@@ -373,14 +373,14 @@ class Ernie4_5_MoeTopKRouter(nn.Module):

with maybe_autocast(device_type=device_type, enabled=False): # Force float32
router_logits = F.linear(hidden_states.float(), self.weight)
router_logits = F.softmax(router_logits, dim=1, dtype=torch.float)
router_top_value, router_indices = torch.topk(self.moe_statics(router_logits), self.top_k, dim=-1)
router_top_value = router_top_value / torch.clamp(
router_top_value.sum(dim=-1, keepdim=True), min=self.norm_min
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
_, selected_experts = torch.topk(self.moe_statics(routing_weights), self.top_k, dim=-1)
routing_weights = torch.gather(routing_weights, dim=-1, index=selected_experts)
routing_weights = routing_weights / torch.clamp(
routing_weights.sum(dim=-1, keepdim=True), min=self.norm_min
)
router_scores = router_top_value
router_scores = router_scores.to(hidden_states.dtype)
return router_logits, router_scores, router_indices
routing_weights = routing_weights.to(hidden_states.dtype)
return router_logits, selected_experts, routing_weights


class Ernie4_5_MoeSparseMoeBlock(nn.Module):
@@ -403,7 +403,7 @@ class Ernie4_5_MoeSparseMoeBlock(nn.Module):
if self.shared_experts is not None:
shared_output = self.shared_experts(hidden_states)

_, top_k_weights, top_k_index = self.gate(hidden_states)
_, top_k_index, top_k_weights = self.gate(hidden_states)
final_hidden_states = self.experts(hidden_states, top_k_index, top_k_weights)

if self.shared_experts is not None:


+ 8
- 8
src/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py View File

@@ -148,14 +148,14 @@ class Ernie4_5_MoeTopKRouter(nn.Module):

with maybe_autocast(device_type=device_type, enabled=False): # Force float32
router_logits = F.linear(hidden_states.float(), self.weight)
router_logits = F.softmax(router_logits, dim=1, dtype=torch.float)
router_top_value, router_indices = torch.topk(self.moe_statics(router_logits), self.top_k, dim=-1)
router_top_value = router_top_value / torch.clamp(
router_top_value.sum(dim=-1, keepdim=True), min=self.norm_min
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
_, selected_experts = torch.topk(self.moe_statics(routing_weights), self.top_k, dim=-1)
routing_weights = torch.gather(routing_weights, dim=-1, index=selected_experts)
routing_weights = routing_weights / torch.clamp(
routing_weights.sum(dim=-1, keepdim=True), min=self.norm_min
)
router_scores = router_top_value
router_scores = router_scores.to(hidden_states.dtype)
return router_logits, router_scores, router_indices
routing_weights = routing_weights.to(hidden_states.dtype)
return router_logits, selected_experts, routing_weights


class Ernie4_5_MoeSparseMoeBlock(nn.Module):
@@ -178,7 +178,7 @@ class Ernie4_5_MoeSparseMoeBlock(nn.Module):
if self.shared_experts is not None:
shared_output = self.shared_experts(hidden_states)

_, top_k_weights, top_k_index = self.gate(hidden_states)
_, top_k_index, top_k_weights = self.gate(hidden_states)
final_hidden_states = self.experts(hidden_states, top_k_index, top_k_weights)

if self.shared_experts is not None:


+ 1
- 0
src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py View File

@@ -79,6 +79,7 @@ class FastSpeech2ConformerTokenizer(PreTrainedTokenizer):
unk_token=unk_token,
pad_token=pad_token,
should_strip_spaces=should_strip_spaces,
special_tokens_pattern="none",
**kwargs,
)



+ 1
- 1
src/transformers/models/flava/modeling_flava.py View File

@@ -1107,7 +1107,7 @@ class FlavaModel(FlavaPreTrainedModel):
output_hidden_states: bool = True,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[tuple, FlavaOutput]:
) -> Union[tuple, FlavaModelOutput]:
r"""
input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See


+ 4
- 0
src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py View File

@@ -92,6 +92,8 @@ class GraniteMoeHybridConfig(PreTrainedConfig):
allow the model to output the auxiliary loss.
router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxiliary loss coefficient
shared_intermediate_size (`int`, *optional*, defaults to 1024): intermediate size for shared experts.
position_embedding_type (`str`, *optional*):
Positional embedding type to be used; defaults to None. Allowed options: `[None, "rope"]`
layer_types (`List`, *optional*): list of strings to be used as layer types.
Allowed choices: "mamba", "attention".
mamba_n_heads (`int`, *optional*, defaults to 128):
@@ -159,6 +161,7 @@ class GraniteMoeHybridConfig(PreTrainedConfig):
output_router_logits: Optional[bool] = False,
router_aux_loss_coef: Optional[float] = 0.001,
shared_intermediate_size: Optional[int] = 1024,
position_embedding_type: Optional[str] = None,
layer_types: Optional[list[str]] = None,
mamba_n_heads: Optional[int] = 128,
mamba_n_groups: Optional[int] = 1,
@@ -198,6 +201,7 @@ class GraniteMoeHybridConfig(PreTrainedConfig):
self.output_router_logits = output_router_logits
self.router_aux_loss_coef = router_aux_loss_coef
self.shared_intermediate_size = shared_intermediate_size
self.position_embedding_type = position_embedding_type
self.rope_parameters = rope_parameters

mamba_intermediate = mamba_expand * hidden_size


+ 24
- 18
src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py View File

@@ -32,6 +32,7 @@ from ... import initialization as init
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
from ...integrations.hub_kernels import lazy_load_kernel
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
@@ -40,22 +41,9 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import check_model_inputs, maybe_autocast
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
from .configuration_granitemoehybrid import GraniteMoeHybridConfig


if is_mamba_2_ssm_available():
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
else:
selective_state_update = None

if is_causal_conv1d_available():
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
causal_conv1d_update, causal_conv1d_fn = None, None


logger = logging.get_logger(__name__)


@@ -165,6 +153,7 @@ class GraniteMoeHybridAttention(nn.Module):
attention_mask: Optional[torch.Tensor],
past_key_values: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # None or rope embeddings
**kwargs: Unpack[TransformersKwargs],
) -> tuple[torch.Tensor, torch.Tensor]:
input_shape = hidden_states.shape[:-1]
@@ -174,6 +163,10 @@ class GraniteMoeHybridAttention(nn.Module):
key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

if position_embeddings is not None:
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

if past_key_values is not None:
cache_kwargs = {"cache_position": cache_position}
key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
@@ -371,9 +364,6 @@ def apply_mask_to_padding_states(hidden_states, attention_mask):
return hidden_states


is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))


# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
class GraniteMoeHybridMambaLayer(nn.Module):
"""
@@ -445,6 +435,20 @@ class GraniteMoeHybridMambaLayer(nn.Module):

self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)

global causal_conv1d_update, causal_conv1d_fn
causal_conv1d = lazy_load_kernel("causal-conv1d")
causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)

global selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
mamba_ssm = lazy_load_kernel("mamba-ssm")
selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
mamba_chunk_scan_combined = getattr(mamba_ssm, "mamba_chunk_scan_combined", None)
mamba_split_conv1d_scan_combined = getattr(mamba_ssm, "mamba_split_conv1d_scan_combined", None)

global is_fast_path_available
is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))

if not is_fast_path_available:
logger.warning_once(
"The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
@@ -1265,7 +1269,7 @@ class GraniteMoeHybridModel(GraniteMoeHybridPreTrainedModel):
[GraniteMoeHybridDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = GraniteMoeHybridRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = GraniteMoeHybridRotaryEmbedding(config=config)
self.rotary_emb = GraniteMoeHybridRotaryEmbedding(config) if config.position_embedding_type == "rope" else None
self.gradient_checkpointing = False
self.embedding_multiplier = config.embedding_multiplier

@@ -1313,7 +1317,9 @@ class GraniteMoeHybridModel(GraniteMoeHybridPreTrainedModel):

# embed positions
hidden_states = inputs_embeds
position_embeddings = self.rotary_emb(hidden_states, position_ids)
position_embeddings = None
if self.rotary_emb is not None:
position_embeddings = self.rotary_emb(hidden_states, position_ids)

for decoder_layer in self.layers:
# Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention)


+ 10
- 1
src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py View File

@@ -39,6 +39,7 @@ from ..granitemoeshared.modeling_granitemoeshared import (
GraniteMoeSharedModel,
GraniteMoeSharedMoE,
GraniteMoeSharedPreTrainedModel,
apply_rotary_pos_emb,
eager_attention_forward,
)
from .configuration_granitemoehybrid import GraniteMoeHybridConfig
@@ -57,6 +58,7 @@ class GraniteMoeHybridAttention(GraniteMoeSharedAttention):
attention_mask: Optional[torch.Tensor],
past_key_values: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # None or rope embeddings
**kwargs: Unpack[TransformersKwargs],
) -> tuple[torch.Tensor, torch.Tensor]:
input_shape = hidden_states.shape[:-1]
@@ -66,6 +68,10 @@ class GraniteMoeHybridAttention(GraniteMoeSharedAttention):
key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

if position_embeddings is not None:
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

if past_key_values is not None:
cache_kwargs = {"cache_position": cache_position}
key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
@@ -203,6 +209,7 @@ class GraniteMoeHybridModel(GraniteMoeSharedModel):
[GraniteMoeHybridDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.embedding_multiplier = config.embedding_multiplier
self.rotary_emb = GraniteMoeHybridRotaryEmbedding(config) if config.position_embedding_type == "rope" else None

@auto_docstring
@check_model_inputs
@@ -245,7 +252,9 @@ class GraniteMoeHybridModel(GraniteMoeSharedModel):

# embed positions
hidden_states = inputs_embeds
position_embeddings = self.rotary_emb(hidden_states, position_ids)
position_embeddings = None
if self.rotary_emb is not None:
position_embeddings = self.rotary_emb(hidden_states, position_ids)

for decoder_layer in self.layers:
# Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention)


+ 3
- 3
src/transformers/models/grounding_dino/modeling_grounding_dino.py View File

@@ -1511,7 +1511,7 @@ class GroundingDinoEncoder(GroundingDinoPreTrainedModel):
output_hidden_states=None,
return_dict=None,
**kwargs,
):
) -> Union[tuple, GroundingDinoEncoderOutput]:
r"""
Args:
vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -1666,7 +1666,7 @@ class GroundingDinoDecoder(GroundingDinoPreTrainedModel):
output_hidden_states=None,
return_dict=None,
**kwargs,
):
) -> Union[tuple, GroundingDinoDecoderOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -2059,7 +2059,7 @@ class GroundingDinoModel(GroundingDinoPreTrainedModel):
output_hidden_states=None,
return_dict=None,
**kwargs,
):
) -> Union[tuple, GroundingDinoModelOutput]:
r"""
input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide


+ 17
- 18
src/transformers/models/jamba/modeling_jamba.py View File

@@ -33,6 +33,7 @@ from ... import initialization as init
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
from ...integrations.hub_kernels import lazy_load_kernel
from ...masking_utils import create_causal_mask
from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
@@ -40,22 +41,9 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import OutputRecorder, check_model_inputs
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_jamba import JambaConfig


if is_mamba_ssm_available():
from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None

if is_causal_conv1d_available():
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
causal_conv1d_update, causal_conv1d_fn = None, None


logger = logging.get_logger(__name__)


@@ -306,11 +294,6 @@ class JambaAttention(nn.Module):
return attn_output, attn_weights


is_fast_path_available = all(
(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)


class JambaMambaMixer(nn.Module):
"""
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
@@ -364,6 +347,22 @@ class JambaMambaMixer(nn.Module):
self.b_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)
self.c_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)

global causal_conv1d_update, causal_conv1d_fn
causal_conv1d = lazy_load_kernel("causal-conv1d")
causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)

global selective_state_update, mamba_inner_fn, selective_scan_fn
mamba_ssm = lazy_load_kernel("mamba-ssm")
selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
mamba_inner_fn = getattr(mamba_ssm, "mamba_inner_fn", None)
selective_scan_fn = getattr(mamba_ssm, "selective_scan_fn", None)

global is_fast_path_available
is_fast_path_available = all(
(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)

if not is_fast_path_available:
logger.warning_once(
"The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"


+ 17
- 17
src/transformers/models/jamba/modular_jamba.py View File

@@ -25,6 +25,7 @@ from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...integrations.hub_kernels import lazy_load_kernel
from ...masking_utils import create_causal_mask
from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
@@ -32,29 +33,12 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logging
from ...utils.generic import OutputRecorder, check_model_inputs
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from ..llama.modeling_llama import LlamaAttention, LlamaRMSNorm, eager_attention_forward
from ..mistral.modeling_mistral import MistralMLP
from ..mixtral.modeling_mixtral import MixtralExperts, MixtralForCausalLM
from .configuration_jamba import JambaConfig


if is_mamba_ssm_available():
from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None

if is_causal_conv1d_available():
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all(
(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)


logger = logging.get_logger(__name__)


@@ -258,6 +242,22 @@ class JambaMambaMixer(nn.Module):
self.b_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)
self.c_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)

global causal_conv1d_update, causal_conv1d_fn
causal_conv1d = lazy_load_kernel("causal-conv1d")
causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)

global selective_state_update, mamba_inner_fn, selective_scan_fn
mamba_ssm = lazy_load_kernel("mamba-ssm")
selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
mamba_inner_fn = getattr(mamba_ssm, "mamba_inner_fn", None)
selective_scan_fn = getattr(mamba_ssm, "selective_scan_fn", None)

global is_fast_path_available
is_fast_path_available = all(
(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)

if not is_fast_path_available:
logger.warning_once(
"The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"


+ 2
- 2
src/transformers/models/janus/modeling_janus.py View File

@@ -1124,7 +1124,7 @@ class JanusModel(JanusPreTrainedModel):
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs,
):
) -> JanusBaseModelOutputWithPast:
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -1201,7 +1201,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: Unpack[TransformersKwargs],
):
) -> JanusCausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,


+ 2
- 2
src/transformers/models/janus/modular_janus.py View File

@@ -942,7 +942,7 @@ class JanusModel(JanusPreTrainedModel):
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs,
):
) -> JanusBaseModelOutputWithPast:
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -1019,7 +1019,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
use_cache: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: Unpack[TransformersKwargs],
):
) -> JanusCausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,


+ 23
- 23
src/transformers/models/mamba2/modeling_mamba2.py View File

@@ -24,6 +24,7 @@ from torch import nn
from ... import initialization as init
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations.hub_kernels import lazy_load_kernel
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_utils import PreTrainedModel
from ...utils import (
@@ -31,35 +32,12 @@ from ...utils import (
auto_docstring,
logging,
)
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
from .configuration_mamba2 import Mamba2Config


logger = logging.get_logger(__name__)


if is_mamba_2_ssm_available():
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
else:
mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined, selective_state_update = None, None, None

if is_causal_conv1d_available():
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all(
(
selective_state_update,
mamba_chunk_scan_combined,
mamba_split_conv1d_scan_combined,
causal_conv1d_fn,
causal_conv1d_update,
)
)


# Helper methods for segment sum computation


@@ -286,6 +264,28 @@ class Mamba2Mixer(nn.Module):
self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
self.use_bias = config.use_bias

global causal_conv1d_update, causal_conv1d_fn
causal_conv1d = lazy_load_kernel("causal-conv1d")
causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)

global selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
mamba_ssm = lazy_load_kernel("mamba-ssm")
selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
mamba_chunk_scan_combined = getattr(mamba_ssm, "mamba_chunk_scan_combined", None)
mamba_split_conv1d_scan_combined = getattr(mamba_ssm, "mamba_split_conv1d_scan_combined", None)

global is_fast_path_available
is_fast_path_available = all(
(
selective_state_update,
mamba_chunk_scan_combined,
mamba_split_conv1d_scan_combined,
causal_conv1d_fn,
causal_conv1d_update,
)
)

if not is_fast_path_available:
logger.warning_once(
"The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"


+ 3
- 3
src/transformers/models/maskformer/modeling_maskformer_swin.py View File

@@ -19,7 +19,7 @@ states before downsampling, which is different from the default Swin Transformer
import collections.abc
import math
from dataclasses import dataclass
from typing import Optional
from typing import Optional, Union

import torch
from torch import Tensor, nn
@@ -656,7 +656,7 @@ class MaskFormerSwinEncoder(nn.Module):
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
) -> Union[tuple, MaskFormerSwinBaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_input_dimensions = ()
all_self_attentions = () if output_attentions else None
@@ -739,7 +739,7 @@ class MaskFormerSwinModel(MaskFormerSwinPreTrainedModel):
interpolate_pos_encoding=False,
return_dict=None,
**kwargs,
):
) -> Union[tuple, MaskFormerSwinModelOutputWithPooling]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states


+ 74
- 61
src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py View File

@@ -35,46 +35,48 @@ from transformers.quantizers.auto import AutoQuantizationConfig


# fmt: off
STATE_DICT_MAPPING = {
# Text model keys
r"^output.weight": r"lm_head.weight",
r"^norm.weight": r"model.language_model.norm.weight",
r"^tok_embeddings.weight": r"model.language_model.embed_tokens.weight",
r"^layers.(\d+).attention_norm.weight": r"model.language_model.layers.\1.input_layernorm.weight",
r"^layers.(\d+).ffn_norm.weight": r"model.language_model.layers.\1.post_attention_layernorm.weight",
r"^layers.(\d+).attention.w(q|k|v|o).weight": r"model.language_model.layers.\1.self_attn.\2_proj.weight",
r"^layers.(\d+).feed_forward.w1.weight": r"model.language_model.layers.\1.mlp.gate_proj.weight",
r"^layers.(\d+).feed_forward.w2.weight": r"model.language_model.layers.\1.mlp.down_proj.weight",
r"^layers.(\d+).feed_forward.w3.weight": r"model.language_model.layers.\1.mlp.up_proj.weight",
r"^layers.(\d+).attention.w(q|k|v|o).qscale_act": r"model.language_model.layers.\1.self_attn.\2_proj.activation_scale",
r"^layers.(\d+).feed_forward.w1.qscale_act": r"model.language_model.layers.\1.mlp.gate_proj.activation_scale",
r"^layers.(\d+).feed_forward.w2.qscale_act": r"model.language_model.layers.\1.mlp.down_proj.activation_scale",
r"^layers.(\d+).feed_forward.w3.qscale_act": r"model.language_model.layers.\1.mlp.up_proj.activation_scale",
r"^layers.(\d+).attention.w(q|k|v|o).qscale_weight": r"model.language_model.layers.\1.self_attn.\2_proj.weight_scale_inv",
r"^layers.(\d+).feed_forward.w1.qscale_weight": r"model.language_model.layers.\1.mlp.gate_proj.weight_scale_inv",
r"^layers.(\d+).feed_forward.w2.qscale_weight": r"model.language_model.layers.\1.mlp.down_proj.weight_scale_inv",
r"^layers.(\d+).feed_forward.w3.qscale_weight": r"model.language_model.layers.\1.mlp.up_proj.weight_scale_inv",

# Vision model keys
r"vision_encoder.transformer.layers.(\d+).attention_norm.weight": r"model.vision_tower.transformer.layers.\1.attention_norm.weight",
r"^vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"model.vision_tower.transformer.layers.\1.ffn_norm.weight",
r"^vision_encoder.transformer.layers.(\d+).attention.w(q|k|v|o).weight": r"model.vision_tower.transformer.layers.\1.attention.\2_proj.weight",
r"^vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight",
r"^vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.down_proj.weight",
r"^vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.up_proj.weight",
r"^vision_language_adapter.w_in": r"model.multi_modal_projector.linear_1",
r"^vision_language_adapter.w_out": r"model.multi_modal_projector.linear_2",
r"^vision_encoder.ln_pre.weight": r"model.vision_tower.ln_pre.weight",
r"^vision_encoder.patch_conv.weight": r"model.vision_tower.patch_conv.weight",
r"^patch_merger.merging_layer.weight": r"model.multi_modal_projector.patch_merger.merging_layer.weight",
r"^pre_mm_projector_norm.weight": r"model.multi_modal_projector.norm.weight",
}
def get_sd_mapping(has_vision: bool) -> dict:
model_key = "model.language_model" if has_vision else "model"
return {
# Text model keys
r"^output.weight": r"lm_head.weight",
r"^norm.weight": rf"{model_key}.norm.weight",
r"^tok_embeddings.weight": rf"{model_key}.embed_tokens.weight",
r"^layers.(\d+).attention_norm.weight": rf"{model_key}.layers.\1.input_layernorm.weight",
r"^layers.(\d+).ffn_norm.weight": rf"{model_key}.layers.\1.post_attention_layernorm.weight",
r"^layers.(\d+).attention.w(q|k|v|o).weight": rf"{model_key}.layers.\1.self_attn.\2_proj.weight",
r"^layers.(\d+).feed_forward.w1.weight": rf"{model_key}.layers.\1.mlp.gate_proj.weight",
r"^layers.(\d+).feed_forward.w2.weight": rf"{model_key}.layers.\1.mlp.down_proj.weight",
r"^layers.(\d+).feed_forward.w3.weight": rf"{model_key}.layers.\1.mlp.up_proj.weight",
r"^layers.(\d+).attention.w(q|k|v|o).qscale_act": rf"{model_key}.layers.\1.self_attn.\2_proj.activation_scale",
r"^layers.(\d+).feed_forward.w1.qscale_act": rf"{model_key}.layers.\1.mlp.gate_proj.activation_scale",
r"^layers.(\d+).feed_forward.w2.qscale_act": rf"{model_key}.layers.\1.mlp.down_proj.activation_scale",
r"^layers.(\d+).feed_forward.w3.qscale_act": rf"{model_key}.layers.\1.mlp.up_proj.activation_scale",
r"^layers.(\d+).attention.w(q|k|v|o).qscale_weight": rf"{model_key}.layers.\1.self_attn.\2_proj.weight_scale_inv",
r"^layers.(\d+).feed_forward.w1.qscale_weight": rf"{model_key}.layers.\1.mlp.gate_proj.weight_scale_inv",
r"^layers.(\d+).feed_forward.w2.qscale_weight": rf"{model_key}.layers.\1.mlp.down_proj.weight_scale_inv",
r"^layers.(\d+).feed_forward.w3.qscale_weight": rf"{model_key}.layers.\1.mlp.up_proj.weight_scale_inv",

# Vision model keys
r"vision_encoder.transformer.layers.(\d+).attention_norm.weight": r"model.vision_tower.transformer.layers.\1.attention_norm.weight",
r"^vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"model.vision_tower.transformer.layers.\1.ffn_norm.weight",
r"^vision_encoder.transformer.layers.(\d+).attention.w(q|k|v|o).weight": r"model.vision_tower.transformer.layers.\1.attention.\2_proj.weight",
r"^vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight",
r"^vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.down_proj.weight",
r"^vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.up_proj.weight",
r"^vision_language_adapter.w_in": r"model.multi_modal_projector.linear_1",
r"^vision_language_adapter.w_out": r"model.multi_modal_projector.linear_2",
r"^vision_encoder.ln_pre.weight": r"model.vision_tower.ln_pre.weight",
r"^vision_encoder.patch_conv.weight": r"model.vision_tower.patch_conv.weight",
r"^patch_merger.merging_layer.weight": r"model.multi_modal_projector.patch_merger.merging_layer.weight",
r"^pre_mm_projector_norm.weight": r"model.multi_modal_projector.norm.weight",
}
# fmt: on


def map_old_key_to_new(old_key):
def map_old_key_to_new(old_key, mapping):
"""Map of a key of the original state dict to the equivalent key in HF format"""
for pattern, replacement in STATE_DICT_MAPPING.items():
for pattern, replacement in mapping.items():
new_key, n_replace = re.subn(pattern, replacement, old_key)
# Early exit of the loop
if n_replace > 0:
@@ -100,11 +102,13 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config):
"""Convert a state dict file, when a single `nn.Module` is never sharded in different files (usual case)."""
new_dict = {}

is_vision = isinstance(config, Mistral3Config)
mapping = get_sd_mapping(is_vision)
for old_key, tensor in original_state_dict.items():
if "fake_quantizer" in old_key:
continue

new_key = map_old_key_to_new(old_key)
new_key = map_old_key_to_new(old_key, mapping)

if "vision" in old_key:
num_attention_heads = config.vision_config.num_attention_heads
@@ -114,10 +118,11 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config):
key_value_dim = head_dim * num_attention_heads
query_dim = head_dim * num_attention_heads
else:
num_attention_heads = config.text_config.num_attention_heads
hidden_size = config.text_config.hidden_size
head_dim = config.text_config.head_dim
num_key_value_heads = config.text_config.num_key_value_heads
text_config = config.text_config if is_vision else config
num_attention_heads = text_config.num_attention_heads
hidden_size = text_config.hidden_size
head_dim = text_config.head_dim
num_key_value_heads = text_config.num_key_value_heads
key_value_dim = head_dim * num_key_value_heads
query_dim = head_dim * num_attention_heads

@@ -130,8 +135,11 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config):
return new_dict


def convert_config(original_config: dict, max_position_embeddings: int = 262144):
def convert_config(original_config: dict, max_position_embeddings: int = 262144, is_vision: bool = True):
original_vision_config = original_config.pop("vision_encoder", None)
assert is_vision == (original_vision_config is not None), (
f"is_vision={is_vision} but original_vision_config={original_vision_config}"
)
original_text_config = original_config

# Text config
@@ -159,9 +167,9 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144)
"original_max_position_embeddings": original_config["yarn"]["original_max_position_embeddings"],
"beta_fast": float(original_config["yarn"]["beta"]),
"beta_slow": float(original_config["yarn"]["alpha"]),
"mscale_all_dim": 1.0,
"mscale_all_dim": 1.0 if is_vision else 0.0,
"mscale": 1.0,
"llama_4_scaling_beta": original_config["llama_4_scaling"]["beta"],
"llama_4_scaling_beta": original_config.get("llama_4_scaling", {}).get("beta", 0),
}

# These are not always defined depending on `params.json`
@@ -173,11 +181,25 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144)
if new_text_config_kwargs["sliding_window"] is not None:
new_text_config_kwargs["sliding_window"] = int(new_text_config_kwargs["sliding_window"])

new_text_config = Ministral3Config(**new_text_config_kwargs)
def get_maybe_quant_config() -> dict:
kwargs = {}
if original_config.get("quantization", {}).get("qformat_weight") == "fp8_e4m3":
assert original_config["quantization"]["qscheme_act"] == "TENSOR"
quantization_config = {
"activation_scheme": "static",
"modules_to_not_convert": ["model.vision_tower", "model.multi_modal_projector", "lm_head"],
"quant_method": "fp8",
"weight_block_size": None,
}
kwargs["quantization_config"] = AutoQuantizationConfig.from_dict(quantization_config)
return kwargs

# No vision
if original_vision_config is None:
new_text_config = Ministral3Config(**new_text_config_kwargs, **get_maybe_quant_config())
return new_text_config
else:
new_text_config = Ministral3Config(**new_text_config_kwargs)

# Vision config
new_vision_config = original_vision_config
@@ -191,17 +213,6 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144)
_ = new_vision_config.pop("max_image_size")
new_vision_config = PixtralVisionConfig(hidden_act="silu", **new_vision_config)

kwargs = {}
if original_config.get("quantization", {}).get("qformat_weight") == "fp8_e4m3":
assert original_config["quantization"]["qscheme_act"] == "TENSOR"
quantization_config = {
"activation_scheme": "static",
"modules_to_not_convert": ["model.vision_tower", "model.multi_modal_projector"],
"quant_method": "fp8",
"weight_block_size": None,
}
kwargs["quantization_config"] = AutoQuantizationConfig.from_dict(quantization_config)

new_config = Mistral3Config(
vision_config=new_vision_config,
text_config=new_text_config,
@@ -209,7 +220,7 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144)
image_token_id=image_token_id,
spatial_merge_size=spatial_merge_size,
vision_feature_layer=-1,
**kwargs,
**get_maybe_quant_config(),
)
return new_config

@@ -218,7 +229,8 @@ def convert_and_write_model(input_dir: str, output_dir: str, max_position_embedd
"""Convert the model and save it (this implicitly save the config as well)."""
params = read_json(os.path.join(input_dir, "params.json"))

config = convert_config(params, max_position_embeddings)
is_vision = params.get("vision_encoder") is not None
config = convert_config(params, max_position_embeddings, is_vision)

full_state_dict = {}
# The model may be split between different files, but a single nn.Module is always fully present in a single file
@@ -228,8 +240,10 @@ def convert_and_write_model(input_dir: str, output_dir: str, max_position_embedd
new_dict = convert_state_dict(original_state_dict, config)
full_state_dict.update(new_dict)

if config.text_config.tie_word_embeddings:
full_state_dict["lm_head.weight"] = full_state_dict["model.language_model.embed_tokens.weight"]
text_config = config.text_config if is_vision else config
if text_config.tie_word_embeddings:
model_key = "model.language_model" if is_vision else "model"
full_state_dict["lm_head.weight"] = full_state_dict[f"{model_key}.embed_tokens.weight"]

# Load weights into model and resave them
with torch.device("meta"):
@@ -259,7 +273,6 @@ def convert_and_write_processor_and_tokenizer(

tokenizer_file = os.path.join(input_dir, "tekken.json")
tokenizer = convert_tekken_tokenizer(tokenizer_file)
tokenizer.add_special_tokens({"pad_token": "<pad>"})

# No vision
if isinstance(model_config, Ministral3Config):


+ 1
- 1
src/transformers/models/mistral3/convert_mistral3_weights_to_hf.py View File

@@ -192,7 +192,7 @@ def convert_and_write_processor(input_dir: str, output_dir: str):
"""Convert the tokenizer and save it."""
tokenizer_file = os.path.join(input_dir, "tekken.json")
tokenizer = convert_tekken_tokenizer(tokenizer_file)
tokenizer.add_special_tokens({"pad_token": "<pad>"})
chat_template = '{%- if messages[0]["role"] == "system" %}{%- set system_message = messages[0]["content"] %}{%- set loop_messages = messages[1:] %}\n{%- else %}{%- set loop_messages = messages %}{%- endif %}{{- bos_token }}{%- for message in loop_messages %}{%- if (message[\'role\'] == \'user\') != (loop.index0 % 2 == 0) %}{{- raise_exception(\'After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\') }}{%- endif %}{%- if message["role"] == "user" %}{%- if loop.last and system_message is defined %}{{- "[INST]" + system_message + "\n\n" }}{%- else %}{{ "[INST]" }}{%- endif %}{%- endif %}{%- if message["content"] is not string %}{%- for chunk in message["content"] %}{%- if chunk["type"] == "text" %}{%- if "content" in chunk %}{{- chunk["content"] }}{%- elif "text" in chunk %}{{- chunk["text"] }}{%- endif %}{%- elif chunk["type"] == "image" %}{{- "[IMG]" }}{%- else %}{{- raise_exception("Unrecognized content type!") }}{%- endif %}{%- endfor %}{%- else %}{{- message["content"] }}{%- endif %}{%- if message["role"] == "user" %}{{- "[/INST]" }}{%- elif message["role"] == "assistant" %}{{- eos_token}}{%- else %}{{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}{%- endif %}{%- endfor %}'

config = read_json(os.path.join(input_dir, "params.json"))


+ 3
- 3
src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py View File

@@ -1181,7 +1181,7 @@ class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel):
output_hidden_states=None,
return_dict=None,
**kwargs,
):
) -> Union[tuple, MMGroundingDinoEncoderOutput]:
r"""
Args:
vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -1478,7 +1478,7 @@ class MMGroundingDinoDecoder(MMGroundingDinoPreTrainedModel):
output_hidden_states=None,
return_dict=None,
**kwargs,
):
) -> Union[tuple, MMGroundingDinoDecoderOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1954,7 +1954,7 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
output_hidden_states=None,
return_dict=None,
**kwargs,
):
) -> Union[tuple, MMGroundingDinoModelOutput]:
r"""
input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide


+ 1
- 4
src/transformers/models/qwen3_next/modeling_qwen3_next.py View File

@@ -45,10 +45,7 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
from ...utils.import_utils import (
is_causal_conv1d_available,
is_flash_linear_attention_available,
)
from ...utils.import_utils import is_causal_conv1d_available, is_flash_linear_attention_available
from .configuration_qwen3_next import Qwen3NextConfig




+ 4
- 4
src/transformers/models/tvp/modeling_tvp.py View File

@@ -16,7 +16,7 @@

import math
from dataclasses import dataclass
from typing import Optional
from typing import Optional, Union

import torch
from torch import nn
@@ -462,7 +462,7 @@ class TvpEncoder(nn.Module):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
) -> Union[tuple, BaseModelOutput]:
return_dict = return_dict if return_dict is not None else self.config.return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -722,7 +722,7 @@ class TvpModel(TvpPreTrainedModel):
return_dict: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
**kwargs,
):
) -> Union[tuple, BaseModelOutputWithPooling]:
r"""
Examples:
```python
@@ -824,7 +824,7 @@ class TvpForVideoGrounding(TvpPreTrainedModel):
return_dict: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
**kwargs,
):
) -> Union[tuple, TvpVideoGroundingOutput]:
r"""
labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
The labels contains duration, start time, and end time of the video corresponding to the text.


+ 3
- 3
src/transformers/models/udop/modeling_udop.py View File

@@ -1106,7 +1106,7 @@ class UdopStack(UdopPreTrainedModel):
return_dict=None,
cache_position=None,
**kwargs,
):
) -> Union[tuple, BaseModelOutputWithAttentionMask]:
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1476,7 +1476,7 @@ class UdopModel(UdopPreTrainedModel):
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> tuple[Tensor, ...]:
) -> Union[tuple, Seq2SeqModelOutput]:
r"""
bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
Bounding boxes of each input sequence tokens. Selected in the range `[0,
@@ -1655,7 +1655,7 @@ class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin):
labels: Optional[Tensor] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> tuple[Tensor, ...]:
) -> Union[tuple, Seq2SeqLMOutput]:
r"""
bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
Bounding boxes of each input sequence tokens. Selected in the range `[0,


+ 2
- 14
src/transformers/processing_utils.py View File

@@ -702,17 +702,6 @@ class ProcessorMixin(PushToHubMixin):
if "chat_template" in output:
del output["chat_template"]

def save_public_processor_class(dictionary):
# make sure private name "_processor_class" is correctly
# saved as "processor_class"
_processor_class = dictionary.pop("_processor_class", None)
if _processor_class is not None:
dictionary["processor_class"] = _processor_class
for value in dictionary.values():
if isinstance(value, dict):
save_public_processor_class(value)
return dictionary

def cast_array_to_list(dictionary):
"""
Numpy arrays are not serialiazable but can be in pre-processing dicts.
@@ -743,7 +732,6 @@ class ProcessorMixin(PushToHubMixin):
)
}
output = cast_array_to_list(output)
output = save_public_processor_class(output)
output["processor_class"] = self.__class__.__name__

return output
@@ -816,15 +804,15 @@ class ProcessorMixin(PushToHubMixin):

for attribute_name in self.get_attributes():
attribute = getattr(self, attribute_name)
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)

# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if attribute_name == "tokenizer":
attribute._set_processor_class(self.__class__.__name__)
attribute.save_pretrained(save_directory)
# if a model has multiple tokenizers, save the additional tokenizers in their own folders.
# Note that the additional tokenizers must have "tokenizer" in their attribute name.
elif "tokenizer" in attribute_name:
attribute._set_processor_class(self.__class__.__name__)
attribute.save_pretrained(os.path.join(save_directory, attribute_name))
elif attribute._auto_class is not None:
custom_object_save(attribute, save_directory, config=attribute)


+ 11
- 8
src/transformers/testing_utils.py View File

@@ -1091,17 +1091,20 @@ def require_torch_large_gpu(test_case, memory: float = 20):
)(test_case)


def require_torch_large_accelerator(test_case, memory: float = 20):
def require_torch_large_accelerator(test_case=None, *, memory: float = 20):
"""Decorator marking a test that requires an accelerator with more than `memory` GiB of memory."""
if torch_device != "cuda" and torch_device != "xpu":
return unittest.skip(reason=f"test requires a GPU or XPU with more than {memory} GiB of memory")(test_case)

torch_accelerator_module = getattr(torch, torch_device)
def memory_decorator(tc):
if torch_device not in ("cuda", "xpu"):
return unittest.skip(f"test requires a GPU or XPU with more than {memory} GiB of memory")(tc)

return unittest.skipUnless(
torch_accelerator_module.get_device_properties(0).total_memory / 1024**3 > memory,
f"test requires a GPU or XPU with more than {memory} GiB of memory",
)(test_case)
torch_accel = getattr(torch, torch_device)
return unittest.skipUnless(
torch_accel.get_device_properties(0).total_memory / 1024**3 > memory,
f"test requires a GPU or XPU with more than {memory} GiB of memory",
)(tc)

return memory_decorator if test_case is None else memory_decorator(test_case)


def require_torch_accelerator(test_case):


+ 13
- 1
src/transformers/trainer.py View File

@@ -4021,7 +4021,16 @@ class Trainer:
self._save(output_dir, state_dict=state_dict)
elif self.is_deepspeed_enabled:
try:
state_dict = self.accelerator.get_state_dict(self.deepspeed)
accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set(
inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys()
)
zero3_sharding = self.deepspeed.config.get("zero_optimization", {}).get("stage", None) == 3
if accept_exclude_frozen_parameters and _is_peft_model(self.model) and zero3_sharding:
# When using PEFT with DeepSpeed ZeRO Stage 3,
# we do not need to load the frozen parameters
state_dict = self.deepspeed._zero3_consolidated_16bit_state_dict(exclude_frozen_parameters=True)
else:
state_dict = self.accelerator.get_state_dict(self.deepspeed)
if self.args.should_save:
self._save(output_dir, state_dict=state_dict)
except ValueError:
@@ -4827,6 +4836,7 @@ class Trainer:
if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done():
return

self.callback_handler.on_push_begin(self.args, self.state, self.control)
output_dir = self.args.output_dir
# To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
modeling_files = [CONFIG_NAME, GENERATION_CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME]
@@ -4921,6 +4931,8 @@ class Trainer:
The URL of the repository where the model was pushed if `blocking=False`, or a `Future` object tracking the
progress of the commit if `blocking=True`.
"""
self.callback_handler.on_push_begin(self.args, self.state, self.control)

model_name = kwargs.pop("model_name", None)
if model_name is None and self.args.should_save:
if self.args.hub_model_id is None:


+ 8
- 0
src/transformers/trainer_callback.py View File

@@ -420,6 +420,11 @@ class TrainerCallback:
Event called after a prediction step.
"""

def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
"""
Event called before pushing the model to the hub, at the beginning of Trainer.push_to_hub and Trainer._push_from_checkpoint.
"""


class CallbackHandler(TrainerCallback):
"""Internal class that just calls the list of callbacks in order."""
@@ -532,6 +537,9 @@ class CallbackHandler(TrainerCallback):
def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
return self.call_event("on_prediction_step", args, state, control)

def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
return self.call_event("on_push_begin", args, state, control, **kwargs)

def call_event(self, event, args, state, control, **kwargs):
for callback in self.callbacks:
result = getattr(callback, event)(


+ 71
- 18
src/transformers/utils/kernel_config.py View File

@@ -71,14 +71,36 @@ def add_to_mapping(layer_name, device, repo_name, mode, compatible_mapping):
}


def add_to_mapping_local(layer_name, device, repo_name, mode, compatible_mapping):
from pathlib import Path

from kernels import LocalLayerRepository

if device not in ["cuda", "rocm", "xpu", "npu"]:
raise ValueError(f"Only cuda, rocm, xpu and npu devices supported, got: {device}")
repo_layer_name = repo_name.split(":")[1]
repo_path = repo_name.split(":")[0]
repo_package_name = repo_path.split("/")[-1]
compatible_mapping[layer_name] = {
device: {
mode: LocalLayerRepository(
repo_path=Path(repo_path),
package_name=repo_package_name,
layer_name=repo_layer_name,
)
}
}


class KernelConfig(PushToHubMixin):
"""
Kernel configuration class. This class is used to configure the kernel mapping for a model.
"""

def __init__(self, kernel_mapping={}):
def __init__(self, kernel_mapping={}, use_local_kernel=False):
self.kernel_mapping = kernel_mapping
self.registered_layer_names = {}
self.use_local_kernel = use_local_kernel

def update_kernel(self, repo_id, registered_name, layer_name, device, mode, revision=None):
from kernels import LayerRepository
@@ -105,6 +127,7 @@ class KernelConfig(PushToHubMixin):
2. Each kernel value is either a string of the form 'org/repo:layer_name' or a dict mapping device types ("cuda", "rocm", "xpu", "npu") to such strings.
3. Each device key in a dict is one of "cuda", "rocm", "xpu", or "npu".
4. Each repo_name is a valid repository and layer name in the format 'org/repo:layer_name' (i.e., a string containing both a slash and a colon).
5. If a local path is detected, it should be in the format '/abs/path:layer_name'. The absolute path must include the `package_name`, like "/home/user/layer_norm".

Args:
model: The model instance whose modules are checked for registered kernel_layer_name attributes.
@@ -114,14 +137,13 @@ class KernelConfig(PushToHubMixin):
or if a repo_name is not a valid 'org/repo:layer_name' string.
"""
MAPPING_FORMAT = """
For single device form remote
{
"RMSNorm":
"kernels-community/layer_norm:LlamaRMSNorm",
...
},

or

For multiple devices form remote
{
"RMSNorm": {
"cuda":
@@ -132,6 +154,23 @@ class KernelConfig(PushToHubMixin):
},
...
}
For single device form local
{
"RMSNorm":
"/abs/path:LlamaRMSNorm",
...
},
For multiple devices form local
{
"RMSNorm": {
"cuda":
"/abs/path:LlamaRMSNorm",
"rocm":
"/abs/path:LlamaRMSNorm",
...
},
...
}
"""
self.store_registered_layer_names(model)
# Validate that the kernel mapping is a dict
@@ -149,7 +188,7 @@ class KernelConfig(PushToHubMixin):
if isinstance(kernel, str):
if "/" not in kernel or ":" not in kernel:
raise ValueError(
f"Kernel mapping for '{layer_name}' must be a valid repo name with a layer name (e.g., 'org/repo:layer_name'), got: {kernel}"
f"Kernel mapping for '{layer_name}' must be a valid repo name with a layer name (e.g., 'org/repo:layer_name' or '/abs/path:layer_name'), got: {kernel}"
)

elif isinstance(kernel, dict):
@@ -159,9 +198,8 @@ class KernelConfig(PushToHubMixin):

if not isinstance(repo_name, str) or "/" not in repo_name or ":" not in repo_name:
raise ValueError(
f"Kernel mapping for '{layer_name}' must be a valid repo name with a layer name (e.g., 'org/repo:layer_name'), got: {repo_name}"
f"Kernel mapping for '{layer_name}' must be a valid repo name with a layer name (e.g., 'org/repo:layer_name' or '/abs/path:layer_name'), got: {repo_name}"
)

else:
raise ValueError(f"Kernel mapping must follow the format: {MAPPING_FORMAT}, got: {kernel}")

@@ -174,18 +212,13 @@ class KernelConfig(PushToHubMixin):
...
},

or
or for local path:

{
"RMSNorm": {
"cuda":
"kernels-community/layer_norm:LlamaRMSNorm",
"rocm":
"kernels-community/layer_norm:LlamaRMSNorm",
...
},
"RMSNorm":
"/home/user/liger_kernels:LigerRMSNorm",
...
}
},

into a nested mapping:

@@ -200,6 +233,20 @@ class KernelConfig(PushToHubMixin):
}
}

or for local path:

{
"RMSNorm": {
"cuda": {
Mode.INFERENCE: LocalLayerRepository(
repo_path=Path("/home/user/liger_kernels"),
package_name="liger_kernels",
layer_name="LigerRMSNorm",
)
}
}
}

that's compatible with the kernels library.

The device is inferred from the model's parameters if not provided.
@@ -217,11 +264,17 @@ class KernelConfig(PushToHubMixin):

if isinstance(kernel, str):
repo_name = kernel
add_to_mapping(layer_name, current_device, repo_name, mode, compatible_mapping)
if not self.use_local_kernel:
add_to_mapping(layer_name, current_device, repo_name, mode, compatible_mapping)
else:
add_to_mapping_local(layer_name, current_device, repo_name, mode, compatible_mapping)
elif isinstance(kernel, dict):
for device, repo_name in kernel.items():
if device != current_device:
continue
add_to_mapping(layer_name, device, repo_name, mode, compatible_mapping)
if not self.use_local_kernel:
add_to_mapping(layer_name, device, repo_name, mode, compatible_mapping)
else:
add_to_mapping_local(layer_name, device, repo_name, mode, compatible_mapping)

self.kernel_mapping = compatible_mapping

+ 1
- 7
src/transformers/video_processing_utils.py View File

@@ -175,7 +175,7 @@ class BaseVideoProcessor(BaseImageProcessorFast):
def __init__(self, **kwargs: Unpack[VideosKwargs]) -> None:
super().__init__()

self._processor_class = kwargs.pop("processor_class", None)
kwargs.pop("processor_class", None)

# Additional attributes without default values
for key, value in kwargs.items():
@@ -799,12 +799,6 @@ class BaseVideoProcessor(BaseImageProcessorFast):
if isinstance(value, np.ndarray):
dictionary[key] = value.tolist()

# make sure private name "_processor_class" is correctly
# saved as "processor_class"
_processor_class = dictionary.pop("_processor_class", None)
if _processor_class is not None:
dictionary["processor_class"] = _processor_class

return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"

def to_json_file(self, json_file_path: Union[str, os.PathLike]):


+ 0
- 4
tests/models/blt/test_modeling_blt.py View File

@@ -177,10 +177,6 @@ class BltModelTest(CausalLMModelTest, unittest.TestCase):
# used in `test_torch_compile_for_training`
_torch_compile_train_cls = BltForCausalLM if is_torch_available() else None

@unittest.skip("BLT model requires special handling for training overfit test")
def test_training_overfit(self):
pass

@pytest.mark.generate
@parameterized.expand([("greedy", 1), ("beam search", 2)])
@unittest.skip(


+ 43
- 15
tests/models/ernie4_5_moe/test_modeling_ernie4_5_moe.py View File

@@ -27,7 +27,6 @@ from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torch_large_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
@@ -130,9 +129,7 @@ class Ernie4_5_MoeModelTest(CausalLMModelTest, unittest.TestCase):
self.assertNotAlmostEqual(include_padding_result.aux_loss.item(), result.aux_loss.item())


# Run on runners with larger accelerators (for example A10 instead of T4) with a lot of CPU RAM (e.g. g5-12xlarge)
@require_torch_multi_accelerator
@require_torch_large_accelerator
@slow
@require_torch
class Ernie4_5_MoeIntegrationTest(unittest.TestCase):
@classmethod
@@ -144,27 +141,58 @@ class Ernie4_5_MoeIntegrationTest(unittest.TestCase):
del cls.model
cleanup(torch_device, gc_collect=True)

def setup(self):
cleanup(torch_device, gc_collect=True)

def tearDown(self):
cleanup(torch_device, gc_collect=True)

@classmethod
def get_model(cls):
if cls.model is None:
cls.model = Ernie4_5_MoeForCausalLM.from_pretrained(
"baidu/ERNIE-4.5-21B-A3B-PT",
device_map="auto",
quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
def get_large_model(cls):
cls.model = Ernie4_5_MoeForCausalLM.from_pretrained(
"baidu/ERNIE-4.5-21B-A3B-PT",
device_map="auto",
quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

return cls.model

@classmethod
def get_small_model(cls):
cls.model = Ernie4_5_MoeForCausalLM.from_pretrained(
"hf-internal-testing/ERNIE-4.5-Small-Moe",
device_map="auto",
dtype="auto",
)

return cls.model

@require_torch_large_accelerator(memory=48) # Tested on A100 but requires around 48GiB
@require_bitsandbytes
@slow
def test_model_21b_a3b_generation(self):
EXPECTED_TEXT_COMPLETION = "User: Hey, are you conscious? Can you talk to me?\nAssistant: I don't have consciousness in the way humans do. I'm a text-based AI created to process and generate responses based on patterns in data." # fmt: skip
EXPECTED_TEXT_COMPLETION = "User: Hey, are you conscious? Can you talk to me?\nAssistant: \nI don't have consciousness in the way humans do. I don't feel emotions, have thoughts, or experience awareness. However, I'm" # fmt: skip

model = self.get_large_model()
tokenizer = AutoTokenizer.from_pretrained("baidu/ERNIE-4.5-21B-A3B-PT")
prompt = "Hey, are you conscious? Can you talk to me?"
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)

generated_ids = model.generate(
model_inputs.input_ids,
max_new_tokens=32,
do_sample=False,
)
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip("\n")
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)

def test_shortened_model_generation(self):
# This is gibberish which is expected as the model are the first x layers of the original 28B model
EXPECTED_TEXT_COMPLETION = 'User: Hey, are you conscious? Can you talk to me?\nAssistant: 不了的 tongues说话 dagat绵席裹着头phones<mask:11>odikèkèk<mask:11><mask:11>bun褶席席地说起来这么说的话的话retti upside upsideolate疡疡疡' # fmt: skip

model = self.get_model()
tokenizer = AutoTokenizer.from_pretrained("baidu/ERNIE-4.5-21B-A3B-PT", revision="refs/pr/11")
model = self.get_small_model()
tokenizer = AutoTokenizer.from_pretrained("baidu/ERNIE-4.5-21B-A3B-PT")
prompt = "Hey, are you conscious? Can you talk to me?"
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


+ 8
- 0
tests/test_processing_common.py View File

@@ -428,6 +428,14 @@ class ProcessorTesterMixin:

# tokenizer repr contains model-path from where we loaded
if "tokenizer" not in attribute:
# We don't store/load `_processor_class` for subprocessors.
# The `_processor_class` is saved once per config, at general level
self.assertFalse(hasattr(attribute_second, "_processor_class"))
self.assertFalse(hasattr(attribute_first, "_processor_class"))

self.assertFalse(hasattr(attribute_second, "processor_class"))
self.assertFalse(hasattr(attribute_first, "processor_class"))

self.assertEqual(repr(attribute_first), repr(attribute_second))

def test_processor_from_and_save_pretrained_as_nested_dict(self):


+ 6
- 0
tests/test_training_mixin.py View File

@@ -347,6 +347,11 @@ class TrainingTesterMixin(ABC):

logger.info(f"Prompt: {self._decode_text_tokens([expected_tokens[0]])}")

model_type = getattr(config, "model_type", "")
use_cache = model_type == "recurrent_gemma"
if use_cache:
logger.info("Only RecurrentGemmaModel is using use_cache=True. Other models run with use_cache=False")

with torch.no_grad():
generated_ids = model.generate(
prompt_ids,
@@ -354,6 +359,7 @@ class TrainingTesterMixin(ABC):
do_sample=False,
pad_token_id=config.pad_token_id if hasattr(config, "pad_token_id") else 0,
eos_token_id=0,
use_cache=use_cache,
)

generated_tokens = generated_ids[0].tolist()


+ 0
- 3
tests/trainer/test_trainer.py View File

@@ -4592,9 +4592,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):

image_processor_dict = image_processor.to_dict()
reloaded_image_processor_dict = reloaded_image_processor.to_dict()
# When the processor is saved in the trainer, the _processor_class gets set in the reload_image_processor dict
image_processor_dict.pop("_processor_class")
reloaded_image_processor_dict.pop("_processor_class")
self.assertDictEqual(image_processor_dict, reloaded_image_processor_dict)

# For tokenizers, there isn't a direct to_dict method and the properties stored in the configs e.g.


+ 14
- 0
tests/trainer/test_trainer_callback.py View File

@@ -102,6 +102,9 @@ class MyTestTrainerCallback(TrainerCallback):
def on_prediction_step(self, args, state, control, **kwargs):
self.events.append("on_prediction_step")

def on_push_begin(self, args, state, control, **kwargs):
self.events.append("on_push_begin")


@require_torch
class TrainerCallbackTest(unittest.TestCase):
@@ -443,3 +446,14 @@ class TrainerCallbackTest(unittest.TestCase):
trainer = self.get_trainer(max_steps=2, save_strategy="epoch", callbacks=[OnEndCallback])
trainer.train()
assert times_saved == 1

def test_on_push_begin(self):
trainer = self.get_trainer(callbacks=[MyTestTrainerCallback], max_steps=1)
trainer.train()
callback = [cb for cb in trainer.callback_handler.callbacks if isinstance(cb, MyTestTrainerCallback)][0]
initial_event_count = len(callback.events)

trainer.callback_handler.on_push_begin(trainer.args, trainer.state, trainer.control)
assert "on_push_begin" in callback.events
assert callback.events.count("on_push_begin") == 1
assert len(callback.events) == initial_event_count + 1

+ 6
- 1
utils/modular_model_converter.py View File

@@ -1478,10 +1478,15 @@ class ModularFileMapper(ModuleMapper):
suffix = common_partial_suffix(class_name, modeling_bases[0])
if len(suffix) > 0 and suffix[0].isupper():
cased_model_name = class_name.replace(suffix, "")
# If both the old model and new model share the last part of their name, is detected as a common
# If both the old model and new model share the last part of their name, it is detected as a common
# suffix, but it should not be the case -> use the full name in this case
if len(cased_model_name) < len(cased_default_name) and cased_default_name in class_name:
cased_model_name = cased_default_name
# If the new class name is of the form ` class NewNameOldNameClass(OldNameClass):`, i.e. it contains both names,
# add the OldName as suffix (see `examples/modular-transformers/modular_test_suffix.py`)
elif class_name.replace(cased_default_name, "") == modeling_bases[0]:
file_model_name = filename.split(".")[-2]
cased_model_name = cased_default_name + get_cased_name(file_model_name)
prefix_model_name_mapping[filename].update([cased_model_name])

# Check if we found multiple prefixes for some modeling files


Loading…
Cancel
Save
Baidu
map