* simplify using custom resolution for sam3 and sam3_video inference
* revert auto format
* use setters and properties
* Fix docstring
* update dict to correctly save image_size to file for backward compatibility
* Fix Apertus model crash on float16 hardware
Initialize the XIELU activation with the correct dtype from the config (config.dtype instead of the default bfloat16) to prevent promotion to float32 and subsequent crashes on Turing/float16 GPUs.
* refactor: Move `ACT2CLS` import to top-level in Apertus models.
* remove null values from saved preprocessor file for fast image processor
* preserve explicit None values != class default
* Fix flava test
* extend to video processor
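The two commits above ("remove null values" but "preserve explicit None values != class default") describe one serialization rule. A hedged sketch, with an invented `CLASS_DEFAULTS` table standing in for the processor class attributes:

```python
# Illustrative defaults table; the real values live on the processor class.
CLASS_DEFAULTS = {
    "size": {"height": 224, "width": 224},  # non-None default
    "crop_size": None,                      # None default
    "resample": 2,
}


def to_serializable(attrs):
    """Drop keys whose value is None *and* whose class default is also None.

    An explicit None that differs from a non-None class default carries
    information and must survive in the saved file.
    """
    out = {}
    for key, value in attrs.items():
        if value is None and CLASS_DEFAULTS.get(key) is None:
            continue  # redundant null, omit from the saved file
        out[key] = value
    return out


saved = to_serializable({"size": None, "crop_size": None, "resample": 3})
```

Here `"size": None` survives because the class default is non-None, while `"crop_size"` is dropped as a redundant null.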
* CB example: more args
* Remove useless sync
* Better new tokens, and no more BS1 on outputs
* Add dynamic to compile to avoid many graphs
* Sort prefix to maximize cache hits
* More robust ways to retrieve results in test
* Style
* Update src/transformers/generation/continuous_batching/continuous_api.py
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
---------
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
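The "sort prefix to maximize cache hits" idea from the continuous-batching PR above can be illustrated with plain lists. This is a hedged sketch of the scheduling intuition, not the CB scheduler code: sorting pending requests by their token prefix places prompts sharing a prefix next to each other, so a prefix cache gets maximal reuse.

```python
# (request_id, prompt token ids) pairs; ids and tokens are invented.
requests = [
    ("r1", [1, 2, 9]),
    ("r2", [7, 7]),
    ("r3", [1, 2, 3]),
]

# Lexicographic sort on the token lists groups shared prefixes together.
ordered = sorted(requests, key=lambda r: r[1])
```

After sorting, `r3` and `r1` (shared prefix `[1, 2]`) are adjacent, so the cached prefix computed for one can be reused for the other.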
* stack lists of tensors in BatchFeature, improve error messages, add tests
* remove unnecessary stack in fast image processors and video processors
* make style
* fix tests
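The BatchFeature change above (stack lists of tensors, improve error messages) can be sketched like this. The function name and error text are illustrative, not the library's actual implementation:

```python
import torch


def stack_values(values):
    """Stack a list of same-shape tensors into one batched tensor.

    Raises a descriptive error instead of torch's generic one when
    the shapes do not match.
    """
    if isinstance(values, list) and values and all(isinstance(v, torch.Tensor) for v in values):
        shapes = {tuple(v.shape) for v in values}
        if len(shapes) > 1:
            raise ValueError(f"Cannot stack tensors with mismatched shapes: {sorted(shapes)}")
        return torch.stack(values)
    return values  # non-tensor inputs pass through unchanged


batch = stack_values([torch.zeros(3, 2), torch.zeros(3, 2)])
```

Doing the stacking once in `BatchFeature` is what lets the per-processor stacking calls mentioned in the next commit be removed.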
* parallelize and cleanup
* simplify offloading
* fix
* oops
* add env variable to deactivate
* revert threading -> safetensors does not release the GIL
* comment
* create helper
* move it to accelerate integration
* Make sam3 tests pass on XPU
* Update flm2 tests GT for XPU
* Remove the skip tests of local mask for XPU
* Pass position_ids to varlen FA2
* Change modular also
* Skip FA2 bwd tests
* Make style
* Increase rtol
* Adapt to the main branch
* fix cuda 1
---------
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
* Automatic release
* Install transformers from within the build
* setuptools
* Check build doesn't need to exist anymore
* Check build doesn't need to exist anymore
* -y
* torch install for pipeline
* TestPypi upload
* Fine tune
* Fine tune
* Update release instructions
* Update .github/workflows/release.yml
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
---------
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* raise
* comment
* fix
* add test
* fix
* add back return
* small
* raise after report
* typos
* fix
* patch
* switch name
* doc
* oops, that was commented out
* add mask generation fine-tuning docs
* initial commit
* update video text to text
* fix autoprocessor
* bump model, update API
* add torch.compile
* Add results
* Update docs/source/en/tasks/image_text_to_text.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/image_text_to_text.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/image_text_to_text.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/mask_generation.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/mask_generation.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/mask_generation.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/mask_generation.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/video_text_to_text.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/mask_generation.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/mask_generation.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/mask_generation.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/video_text_to_text.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/video_text_to_text.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/image_text_to_text.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update docs/source/en/tasks/mask_generation.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update image_text_to_text.md
* Update docs/source/en/tasks/video_text_to_text.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---------
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* No more size 0 cuda graph
* Better tests for CB
* compile fix for CB test
* style
* More cleanup and cuda exclusive
* Returned to slow tests
* Change decorator order
* Restore XPU change
* Rebase fixes
* enable xpu in fp8_gemm
Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
* refine the code
Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
* updated
Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
* fix
* style
* small fix
---------
Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
Co-authored-by: Wang, Yi <yi.a.wang@intel.com>
* feat: Remove logits upcast when computing loss
When the CausalLM loss is used, the upcast is done in the loss function
utils, so this is redundant.
https://github.com/huggingface/transformers/issues/42709
Branch: GraniteOptionalUpcast-42709
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* chore: make fix-copies
https://github.com/huggingface/transformers/issues/42709
Branch: GraniteOptionalUpcast-42709
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---------
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
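The redundancy described above can be shown in a few lines. This is a hedged sketch of the pattern, assuming a loss helper shaped like the CausalLM loss utilities; it is not the Granite code itself:

```python
import torch


def causal_lm_loss(logits, labels):
    # The single upcast lives in the loss utility, so the model forward
    # does not need to upcast logits itself.
    logits = logits.float()
    return torch.nn.functional.cross_entropy(
        logits.view(-1, logits.size(-1)), labels.view(-1)
    )


logits = torch.randn(2, 4, 10, dtype=torch.float16)
labels = torch.randint(0, 10, (2, 4))
loss = causal_lm_loss(logits, labels)
```

Keeping logits in half precision until the loss is computed saves one full-vocabulary float32 copy per forward pass.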
* Override Transformers defaults by GGUF defaults
In some models, GGUF uses default or fixed values different from this
library. To integrate GGUF-based models without additional configuration,
we need some kind of compatibility layer.
This commit provides additional mapping to provide GGUF-specific default
values to initialize parameters in this library.
Currently, only fixed "norm_topk_prob" value of Qwen3 MoE (True) is
defined because (a) it differs from the default value of this library
(False) and (b) if this parameter is incorrectly set, it results in
almost completely garbled output.
Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
* Apply suggestions from code review
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
---------
Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
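The GGUF compatibility layer above can be sketched as a small override table. The table and function names here are illustrative, not the actual integration code; only the `norm_topk_prob` fact for Qwen3 MoE comes from the commit message:

```python
# Hypothetical GGUF -> Transformers default-override mapping.
GGUF_CONFIG_DEFAULT_OVERRIDES = {
    # Qwen3 MoE GGUF files fix norm_topk_prob to True, while the library
    # default is False; getting this wrong garbles the output.
    "qwen3moe": {"norm_topk_prob": True},
}


def apply_gguf_defaults(model_type: str, config_dict: dict) -> dict:
    """Fill in GGUF-specific defaults without clobbering explicit values."""
    overrides = GGUF_CONFIG_DEFAULT_OVERRIDES.get(model_type, {})
    return {**overrides, **config_dict}


cfg = apply_gguf_defaults("qwen3moe", {"num_experts": 128})
```

Merging the overrides first means an explicit value parsed from the GGUF metadata still wins over the compatibility default.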
* fix: support tensor labels in DataCollatorWithFlattening
- Add tensor to list conversion in DataCollatorWithFlattening
- Convert input_ids and labels to list if they are tensors
- Add tests for both tensor and list labels
- Fixes #42599
* style: fix whitespace linting errors
* style: apply ruff format to test file
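The tensor-to-list normalization above can be sketched as follows. This is an illustrative simplification of a flattening collator, assuming the usual `-100` label masking at packed-example boundaries; it is not the real `DataCollatorWithFlattening`:

```python
import torch


def _to_list(x):
    # Accept either torch tensors or plain lists for input_ids / labels.
    return x.tolist() if isinstance(x, torch.Tensor) else x


def flatten_features(features):
    """Concatenate per-example sequences into one flat packed batch."""
    flat_ids, flat_labels = [], []
    for f in features:
        flat_ids += _to_list(f["input_ids"])
        labels = _to_list(f.get("labels", f["input_ids"]))
        # Mask the first token of each packed example so loss does not
        # bleed across example boundaries.
        flat_labels += [-100] + labels[1:]
    return {"input_ids": flat_ids, "labels": flat_labels}


batch = flatten_features([
    {"input_ids": torch.tensor([1, 2, 3])},  # tensor input now works
    {"input_ids": [4, 5]},                   # list input still works
])
```

Converting tensors up front keeps the rest of the flattening logic purely list-based, which is what the fix relies on.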
* Only call `torch.autocast` if it will have an effect
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
* whitespace
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
* fixup
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
* fix copies
---------
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Arthur <arthur.zucker@gmail.com>
* Only default `rope_parameters` to empty `dict` if there is something to put in it
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
* Add warning
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
* Also catch explicit `{}`
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
* Handle model with layer types and rope theta but not rope parameters
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---------
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
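The rope_parameters commits above describe one resolution rule. A hedged sketch under assumed config keys (`rope_parameters`, `rope_theta`); the function name is invented for illustration:

```python
def resolve_rope_parameters(config_dict):
    """Only create a rope_parameters dict when there is something to put in it."""
    rope = config_dict.get("rope_parameters")
    theta = config_dict.get("rope_theta")
    # Catch both missing and explicit `{}` values: with no theta either,
    # there is nothing to record, so keep the attribute as None.
    if rope in (None, {}) and theta is None:
        return None
    rope = dict(rope or {})
    if theta is not None:
        # Model has rope theta but no rope parameters: fold theta in.
        rope.setdefault("rope_theta", theta)
    return rope
```

This keeps configs without any RoPE settings from silently growing an empty dict attribute.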
* Add an alternative scenario in case patch_offsets is None
* Fixup
* Fix an error
* Simplified the function
---------
Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
* switch
* remove now useless save_function
* a bit more involved than i thought
* all converters
* fix
* pretty print
* fix
* trainer
* update musicgen.md docs
* marc comments
* doc and last missed instances
* CI
---------
Co-authored-by: Wauplin <lucainp@gmail.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* On commit to bind them all
* nits
* small update
* elif
* super small nit
* BPE!
* fix
* up up up
* fix?
* one typo
* per model updates
* more model specific updates
* more per model updates
* more model specific updates
* simplify default merges
* fixup
* update
* update
* style
* fix
* fix colpali
* nits
* simpler regex + big bird fix
* fixup and fix
* fix codellama
* up
* fix pop on none
* fix parakeet
* fix llama
* big fixup
* fix markuplm
* update common
* fix mbart
* fix seamlessm4T
* fix comment
* torch tests
* nits and revert UNK idx change
* oh only one deberta
* torch tests
* add convert from spm per model!
* fix last 2 for pegasus
* fix torch tests
* fixes
* fix tests
* check versioned files
* fix processor auto test
* fix custom tok clip
* try this fix
* modeling rag
* fix rag
* roformer the Tokenizers way
* up
* update
* fix unk
* update
* fix roberta
* if there is no mapped class and no tokenizer.json it breaks -> just have the mapped class ready!
* fix the rest
* fix copies
* fix doc and copies
* fix mbart50
* fix deberta_v2 test
* fix and simplify whisper :)
* fix big bird default, it was wrong
* fix final
* fixup
* small nit
* a weird way to fix fuyu?
* default xlm roberta to fix kosmos behaviour!
* remove small errors
* last fix?
* fix pixtral
* style
* fix
* quality
* fix?
* remove something
* remove code that should not have been there!
* fix ?
* fixup
* update
* fix for custom code
* add a custom model path to make sure custom stuff is registered
* fix trust remote code
* exceeded
* don't
* oops for cohere
* why is this one also affected
* fixup
* fixup
* nits
* fix idefics3 tests
* okay read the processor
* fix the layout.... models
* nits
* codellama needs the bos passed
* fix dpr
* fix?
* fixup
* distilbert defaults
* fix
* clvp update to PythonTokenizer
* bloom
* style
* layoutxlm
* style
* olmo
* only pop when we don't convert from tokenizer.json
* fixup
* hub issue
* id
* fix
---------
Co-authored-by: itazap <ita.zaporozhets@huggingface.co>
Co-authored-by: ita.zaporozhets@huggingface.co <ita_zaporozhets@ip-26-0-164-45.ec2.internal>
* Fixed UT and kernel loading logic.
* Revision based on comments
* Simplify code
* make style
* simplify CB part
* retrigger ci
---------
Co-authored-by: vasqu <antonprogamer@gmail.com>
Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>
* FIX Error when trying to load non-LoRA PEFT
This PR fixes a bug that prevented non-LoRA PEFT adapters to be loaded
into a transformers model. A test for this has been added.
Additionally, also testing if a non-LoRA adapter can be added to a
transformers model. This was not broken but still lacked test coverage.
* Reviewer feedback: Remove check completely
* Let transformers know when a model is being traced via jax.jit
Move check earlier
* Add docstring
* add docstring
---------
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
* nit on dac!
* first commit
* fixes
* passing all API tests
* nit
* update
* test update
* make style
* make
* fixes
* fix
* test update
* return attention mask by default
* doc update
* make
* make
* fix
* do not load everything in advance
* fix
* fix
* fix
* fix
* fix memory leaks during conversion
* oops
* fix device_map
* add doc
* fix
* doc
* make it a method
* doc
* first shot at test
* fix test
* fix
* revert test: cpu mem too hard to track correctly
* fix
* begin test ci training
* add better logging
* add better logging + training loop
* fix sentence + grad_norm assert
* create circleci config for training
* fix ci to detect training_ci job
* add -s for pytest CI
* add generate assert as well
* make training ci trigger for every model change instead
* set eos_token_id to 0 otherwise it will stop generating too soon
* refactor
* moving logging and metrics to proper files
* update marker in pyproject.toml
* linting
* linting again
* reduce pytest worker
* fix deadlock in test
* add license
* dont show logs during test
* loosen threshold a bit
* Initialize cache state in RecurrentGemmaRecurrentBlock based on batch… (#42627)
* Initialize cache state in RecurrentGemmaRecurrentBlock based on batch size
* loosen a bit the threshold
* Revert "loosen a bit the threshold"
This reverts commit d3d42e1e3b.
* skipping BLT for now (until it gets fixed)
* docs: clarify recommended usage of max_new_tokens in generate()
* Move max_new_tokens recommendation into GenerationConfig docstring
* Remove Note about max_new_tokens and max_length
* Update src/transformers/generation/configuration_utils.py
Apply reviewer suggestion for clearer token explanation
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Update src/transformers/generation/configuration_utils.py
Apply reviewer suggestion for clearer token explanation
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
* Remove Note about max_new_tokens and max_length
* Docs: clean up empty line in text_generation.md
* Docs: simplify max_length explanation in generation config
* style: fix trailing whitespace in configuration_utils.py
---------
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
* Just-in-time (JIT) asynchronous checkpointing using SIGTERM signals and cuda streams.
* Fix failing ci tests
* Update JIT checkpoint code to remove CUDA streams and async checkpointing. Introduce sentinel file to identify incomplete checkpoints. Update trainer arg doc and tests.
* Fix sentinel file save path to checkpoint folder, update checkpoint related envs with HF_ prefix.
* Refactor JIT checkpoint logic: rename methods and variables for clarity, improve SIGTERM handling, and update related tests.
* Remove support for environment variable overrides in `TrainingArguments` and update related documentation.
* Apply style fixes
---------
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* Adapt some test case on npu
* Adapt some test case on npu
---------
Co-authored-by: mamba-chen <chenhao388@huawei.com>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
* Flip the default return type for `apply_chat_template` to match the underlying tokenizer
* Remove test_tokenization_for_chat tests, which no longer do anything useful
* Remove test_tokenization_for_chat tests, which no longer do anything useful
* Fix test_encode_message tests
* Fix test_encode_message tests
* nit fix
* Trigger tests
* Remove test_tokenization_for_chat
* make fixup
* Add a little test to make sure that doesn't happen again
* make fixup
* Update modeling_sam3_tracker.py
* Apply docstring to modular_sam3_tracker.py for consistency
Added custom introduction to auto_docstring for Sam3TrackerPreTrainedModel to match the typo fix in modeling_sam3_tracker.py
* Fix SAM3 tracker docstrings in modular and regenerate modeling
* fix(Qwen3VLCausalLMOutputWithPast): missing `hidden_states` and `attentions` kwargs
* revert kwargs for the base class
* make regenerated model files
* symmetrical change for `modular_qwen3_vl_moe.py`
* regenerated `modeling_qwen3_vl_moe.py`
* Add **kwargs into every Model.forward()
* Add the test back in
* And the others I missed
* Fix udop test
* Fix fastspeech2conformer test
* make fixup
* extend FA2 and other cases to XPU; we expect all model cases except CUDAGraph-specific,
CUDA compute capability-specific, and FA3-specific ones to run on XPU.
For FA3, support is still in development.
Signed-off-by: Yao, Matrix <matrix.yao@intel.com>
* fix style
Signed-off-by: Yao, Matrix <matrix.yao@intel.com>
* Update modeling_mimi.py
---------
Signed-off-by: Yao, Matrix <matrix.yao@intel.com>
* Create cache when training in case generate needs being called
* Align modular
* fixes
* cohere
* fix modular
* fix
* review
---------
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
* draft
* draft
* draft
* draft
* draft
* draft
* draft
* draft
* draft
* draft
* draft
* fail to see the check
* fail to see the check
* fail to see the check
* fail to see the check
* fail to see the check
* Apply style fixes
* fail to see the check
* fail to see the check
* fail to see the check
* Apply repo. consistency fixes
* fail to see the check
* Apply repo. consistency fixes
* fail to see the check
* delete
* Apply repo. consistency fixes
* comment
* Apply repo. consistency fixes
* comment
* Apply repo. consistency fixes
* comment
* Apply repo. consistency fixes
* comment
* Apply repo. consistency fixes
* back
* check
* check
* check
* check
* check
---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* Add SDPA and Flash Attention support for PatchTST model
- Add _supports_sdpa = True and _supports_flash_attn = True to PatchTSTPreTrainedModel
- The existing PatchTSTAttention class already uses ALL_ATTENTION_FUNCTIONS
to select the attention implementation based on config._attn_implementation
- Fix test_modeling_patchtst.py _prepare_for_class for dynamic batch sizes
* Guard PatchTST positional init under ZeRO-3
* Force SDPA in PatchTST regression integration test
* Use sdpa attn in PatchTST regression test
* fixups re tests
---------
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
Co-authored-by: vasqu <antonprogamer@gmail.com>
Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>
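The PatchTST commit above relies on the `ALL_ATTENTION_FUNCTIONS` dispatch pattern: the attention layer looks up its implementation from a registry keyed by `config._attn_implementation`. A hedged, self-contained sketch of that pattern with dummy implementations (the class and return values are invented for illustration):

```python
# Dummy registry standing in for transformers' ALL_ATTENTION_FUNCTIONS.
ATTENTION_REGISTRY = {
    "eager": lambda q, k, v: "eager result",
    "sdpa": lambda q, k, v: "sdpa result",
    "flash_attention_2": lambda q, k, v: "fa2 result",
}


class PatchTSTAttentionSketch:
    """Looks up its attention function from the registry at init time."""

    def __init__(self, attn_implementation="eager"):
        self.attn_fn = ATTENTION_REGISTRY[attn_implementation]

    def forward(self, q, k, v):
        # All backends compute the same thing; only the implementation differs.
        return self.attn_fn(q, k, v)


attn = PatchTSTAttentionSketch("sdpa")
```

Because the existing `PatchTSTAttention` already dispatched this way, enabling SDPA and FlashAttention reduced to flipping the `_supports_sdpa` / `_supports_flash_attn` class flags.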
* transfer commit
* Allow fullgraph and little fixes
* fix when no measurements
* CB is better at handling compile. Also can be benched.
* Style
* Add summarized by default
* Doc and better CG logic
* CG logic rollback
* Update to Fa3 thx to Anton
* style
* Fix mixed torch.Tensor and DTensor error by registering fsdp forward for trainer.mode.generate
* Apply fsdp forward register when fsdp is enabled
---------
Co-authored-by: yiminzme <yiminzme@gmail.com>
Co-authored-by: Ferdinand Mom <47445085+3outeille@users.noreply.github.com>
Fix three typos found in code comments:
- 'avaoid' → 'avoid' in modeling_utils.py
- 'weigth' → 'weight' in trainer_utils.py
- 'Templace' → 'Template' in convert_slow_tokenizer.py
These typos appeared in TODO comments and inline documentation.
* fix resume from epoch >= 1
* add test checking order of sampled data points
* add require_torch_non_multi_accelerator decorator to test method
* move the epoch setting of epoch_dataloader before iterating over it
* make fixup
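The epoch-resume fix above hinges on calling the sampler's `set_epoch` before iterating, so the shuffle order for a resumed epoch matches the original run. A minimal sketch with a toy sampler (not the accelerate/trainer code):

```python
import random


class ShuffledSampler:
    """Toy epoch-seeded sampler mimicking DistributedSampler's set_epoch contract."""

    def __init__(self, n):
        self.n = n
        self.epoch = 0

    def set_epoch(self, epoch):
        # Must be called BEFORE iterating; the epoch seeds the shuffle.
        self.epoch = epoch

    def __iter__(self):
        order = list(range(self.n))
        random.Random(self.epoch).shuffle(order)
        return iter(order)


s = ShuffledSampler(5)
s.set_epoch(1)
resumed = list(s)  # same order any run with epoch == 1 would produce
```

If `set_epoch` were called after iteration started (the bug), the resumed run would replay epoch 0's order and visit the wrong data points.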
echo "Memory monitor started with PID $(cat memory_monitor.pid)"
# Give it a moment to start
sleep 2
# Verify it's running
ps aux | grep memory_monitor | grep -v grep || echo "Warning: memory monitor may not be running"
- name: Install utilities
run: |
apt-get install -y nano
- name: Store Slack infos
# Because SSH can be enabled dynamically if the workflow failed, store the Slack infos so they can be retrieved during the wait-for-ssh step
All new models should use the modular architecture pattern. Create a `modular_<model_name>.py` file using the modular model converter:
- Use the CLI, [`transformers add-new-model-like`](https://github.com/huggingface/transformers/blob/main/src/transformers/cli/add_new_model_like.py), to generate a modular skeleton and get started
- All code should be in the modular file if possible. Modeling must be in it, and it's better if the configuration is in it as well. The [Modular guide](https://huggingface.co/docs/transformers/modular_transformers#implementing-a-modular-file) shows a quick way to set up a modular file.
- Reuse existing patterns from similar models as much as possible
- You can make the model compatible with inference engines such as vLLM or SGLang, and enable zero-effort integration. See specific requirements for model implementation in ["Transformers modeling backend"](https://huggingface.co/docs/transformers/transformers_as_backend#multimodal-models)
class Llama5Tokenizer(TokenizersBackend):
        self._vocab = vocab
        if merges is not None:
            self._merges = merges or []
        else:
            self._merges = generate_merges(filtered_vocab)
### Remote code incompatibility

A lot of paths were removed and reworked; paths like `transformers.tokenization_utils` and `transformers.tokenization_utils_fast` no longer exist. These now redirect to `transformers.tokenization_utils_sentencepiece` and `transformers.tokenization_utils_tokenizers` respectively; please update imports accordingly.

_We aim for this to be fixed and released in a following release candidate in the week that follows RC0._
- related to 1., it is not possible to set proxies from your script. To handle proxies, you must set the `HTTP_PROXY` / `HTTPS_PROXY` environment variables
- `hf_transfer` and therefore `HF_HUB_ENABLE_HF_TRANSFER` have been completely dropped in favor of `hf_xet`. This should be transparent for most users. Please let us know if you notice any downside!
`typer-slim` has been added as a required dependency, used to implement both the `hf` and `transformers` CLIs.
4-bit quantization allows the model to run on GPUs such as the RTX3090, V100, and T4, which are easily accessible to most people.
For more information on quantization, and to see how models can be quantized to require even less than 4 bits of GPU VRAM, we recommend looking at the [`GPT-QModel`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#gptqmodel) implementation.
> In conclusion, it is important to remember that model quantization trades improved memory efficiency against accuracy and, in some cases, inference time.
@@ -13,103 +13,145 @@ rendered properly in your Markdown viewer.
-->
# Attention Interface
# Attention backends
This page describes how to use the `AttentionInterface` in order to register custom attention functions to use with
supported models.
All attention implementations perform the same computation. Every token is compared to every other token. The difference is *how* the computation is performed. Basic attention scales poorly because it materializes the full attention matrix in memory, creating bottlenecks that slow down inference. Optimized implementations rearrange the math to reduce memory traffic for faster, more affordable inference.
## Customizing attention function
The [`AttentionInterface`] provides optimized attention implementations. It decouples the attention implementation from the model implementation to simplify experimentation with different functions. Add new backends easily with this consistent interface.
Most recent models can now switch from one attention function used in the Attention layer to the other, thanks to a simple mapping.
By default, we provide the implementation for [`sdpa`](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html),
[`flash_attention_2`](https://github.com/Dao-AILab/flash-attention) and [`flex_attention`](https://pytorch.org/docs/stable/nn.attention.flex_attention.html#module-torch.nn.attention.flex_attention)
as well as `eager`, which is a simple matrix multiplication without any optimization on top.
This is the setting you can usually choose when instantiating a model:
| attention backend | description |
|---|---|
| `"flash_attention_3"` | improves FlashAttention-2 by also overlapping operations and fusing forward and backward passes more tightly |
| `"flash_attention_2"` | tiles computations into smaller blocks and uses fast on-chip memory |
| `"flex_attention"` | framework for specifying custom attention patterns (sparse, block-local, sliding window) without writing low-level kernels by hand |
But what if you wanted to create your own attention function? Or simply play around with existing ones, adding
a few statements here and there? You can now do so with the `AttentionInterface`! Here is an example:
Switch between attention backends at runtime without reloading the model using [`~PreTrainedModel.set_attn_implementation`].
```python
from transformers import AutoModelForCausalLM, AttentionInterface
from transformers.integrations.sdpa_attention import sdpa_attention_forward
import torch
```py
model.set_attn_implementation("sdpa")
```
model_id = "meta-llama/Llama-3.2-1B"
### Kernels
def my_new_sdpa(*args, **kwargs):
print("I just entered the attention computation")
return sdpa_attention_forward(*args, **kwargs)
Download and load compiled compute kernels directly from the [Hub](https://huggingface.co/models?other=kernels) at runtime with the [Kernels](https://huggingface.co/docs/kernels/index) library. This avoids packaging issues from mismatched PyTorch or CUDA versions.
You will see it prints "I just entered the attention computation" as many times as there are layers in the model (with this example, 16 times).
### SDPA context manager
## Dynamically switching attention function
PyTorch's scaled dot product attention (SDPA) selects the fastest attention function for CUDA backends automatically. It defaults to the PyTorch C++ implementation for other backends.
You could dynamically change the model's attention function as well:
Force SDPA to use a specific implementation with the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager.
```python
# Back to use original sdpa implementation
model.set_attn_implementation("sdpa")
```py
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
and it will stop printing the statements, as it now uses the `sdpa` attention.
This allows to quickly change an attention function, without needing to reload the model!
## Backbone-specific attention
## Different attention per backbone in multimodal models
Multimodal models use different backbones for each modality. Optimize performance by assigning specific attention functions to each backbone. Some vision backbones perform better in fp32, for example, which FlashAttention does not support.
For multimodal models different attention functions may work better for each backbone module. For example, some vision backbones perform better in fp32, but are incompatible with FlashAttention. To continue using FlashAttention while keeping the vision encoder in fp32, create a dict and map each config to an attention implementation as shown below.
Map vision backbones to different attention functions with a dict while the text backbone continues to use FlashAttention. Keys in the attention implementation must match sub-config names.
```python
```py
from transformers import AutoModelForImageTextToText
## What about new args needed in my custom attention function?
## Create a new attention function
Customize or create new attention functions by adding them to the attention registry with [`AttentionInterface.register`]. Models use these functions through the `attn_implementation` argument.
But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
`AttentionInterface` propagate kwargs all the way to the Attention layers, and to the used attention function. That way,
you can simply pass the arg (as a kwargs, i.e. you need to qualify the name of the arg) in the model's forward, and it will be correctly used in the attention. However, custom attention functions have some limitations. In particular, it must follow the signature and return format of other attention functions, i.e.
This example customizes the attention function to print a statement for each layer.
```python
import torch
from transformers import AutoModelForCausalLM, AttentionInterface
from transformers.integrations.sdpa_attention import sdpa_attention_forward
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="my_new_sdpa")
model(torch.ones(1, 5, dtype=int))
```
You can also add new arguments to the attention function. Models supporting [`AttentionInterface`] propagate kwargs to attention layers and the attention function. Pass arguments as kwargs in the model's forward function. Custom attention functions must follow this signature and return format.
```python
import torch
from transformers import AutoModelForCausalLM, AttentionInterface
from transformers.integrations.sdpa_attention import sdpa_attention_forward
If in doubt about what args/kwargs a given model sends to the attention function, simply check that model's modeling code on [GitHub](https://github.com/huggingface/transformers/tree/main/src/transformers/models)!
Check a model's [modeling code](https://github.com/huggingface/transformers/tree/main/src/transformers/models) to confirm what arguments and kwargs it sends to the attention function.
## Accessing current available implementations
### AttentionMaskInterface
Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
would expect from a usual Python dictionary:
```python
>>> from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
Having a new attention function may mean that you need a new format of attention mask to decide what key and value tokens
the query tokens should attend to. This is now possible with the `AttentionMaskInterface`! It works in the same way as
the `AttentionInterface`:
Configure which key and value tokens queries attend to with [`AttentionMaskInterface`]. Some attention functions require this configuration. Customize the attention mask function and add it to the registry with [`AttentionMaskInterface.register`].
```python
import torch
from transformers import AttentionMaskInterface
from transformers.masking_utils import sdpa_mask
import torch
def my_new_sdpa_mask(*args, **kwargs):
print("I just entered the attention mask computation")
The reason you have to register it is because we need to automatically correct your mask format based on the attention implementation (for example, flex attention uses a BlockMask format, while sdpa uses a 4D tensor).
By default, if you do not register an attention mask function along with your attention function, mask creation will be skipped
and `attention_mask=None` will be passed along to the Attention layers.
Registered attention masks automatically correct the mask format for the attention implementation. For example, FlexAttention uses a [BlockMask](https://docs.pytorch.org/docs/stable/nn.attention.flex_attention.html?utm_source=chatgpt.com#torch.nn.attention.flex_attention.BlockMask) format, while SDPA uses a 4D tensor. Without a registered attention mask function, mask creation is skipped and `attention_mask=None` passes to the model's attention layers.
This is the default signature for an attention mask function:
```python
def custom_attention_mask(
    batch_size: int,
    cache_position: torch.Tensor,
    kv_length: int,
    kv_offset: int = 0,
    mask_function: Optional[Callable] = None,
    attention_mask: Optional[torch.Tensor] = None,
    **kwargs,
) -> Optional[torch.Tensor]:
```
The `mask_function` argument is a `Callable` in the style of [torch's mask_mod functions](https://pytorch.org/blog/flexattention/): it takes 4 indices as input and returns a boolean indicating whether the position should take part in the attention computation.
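For illustration, a causal mask written in this style is a minimal sketch; the four indices are the batch index, head index, query position, and key/value position:

```python
# mask_mod-style callable: returns True when the query position may attend to the key/value position
def causal_mask_function(batch_idx, head_idx, q_idx, kv_idx):
    return kv_idx <= q_idx
```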
If you cannot use `mask_function` to create your mask for some reason, you can try to work around it by doing something similar to our [torch export workaround](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/executorch.py).
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Chat message patterns
Chat models expect conversations as a list of dictionaries. Each dictionary uses `role` and `content` keys. The `content` key holds the message passed to the model. Large language models accept text and tools, and multimodal models combine text with images, videos, and audio.
Transformers uses a unified format where each modality type is specified explicitly, making it straightforward to mix and match inputs in a single message.
This guide covers message formatting patterns for each modality, tools, batch inference, and multi-turn conversations.
## Text
Text is the most basic content type. It's the foundation for all other patterns. Pass your message to `"content"` as a string.
```py
message = [
{
"role": "user",
"content": "Explain the French Bread Law."
}
]
```
You could also use the explicit `"type": "text"` format to keep your code consistent when you add images, videos, or audio later.
```py
message = [
{
"role": "user",
"content": [{"type": "text", "text": "Explain the French Bread Law."}]
}
]
```
## Tools
[Tools](./chat_extras) are functions a chat model can call, like getting real-time weather data, instead of generating it on its own.
The `assistant` role handles the tool request. Set `"type": "function"` in the `"tool_calls"` key and provide your tool to the `"function"` key. Append the assistant's tool request to your message.
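As a self-contained sketch (the `get_current_temperature` tool name and its arguments are hypothetical), the assistant's tool request looks like this:

```python
# hypothetical conversation: the model requests a temperature lookup
message = [{"role": "user", "content": "What is the temperature in Paris?"}]
message.append({
    "role": "assistant",
    "tool_calls": [
        {
            "type": "function",
            "function": {"name": "get_current_temperature", "arguments": {"city": "Paris"}},
        }
    ],
})
```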
The `tool` role handles the result. Append it in `"content"`. This value should always be a string.
```py
message.append({"role": "tool", "content": "22"})
```
## Multimodal
Multimodal models extend this format to handle images, videos, and audio. Each input specifies its `"type"` and provides the media with `"url"` or `"path"`.
### Image
Set `"type": "image"` and use `"url"` for links or `"path"` for local files.
```py
message = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://example.com/pastries.jpg"},  # replace with your image URL or use "path" for a local file
            {"type": "text", "text": "What type of pastries are these?"},
        ],
    }
]
```
## Batched
Batched inference processes multiple conversations in a single forward pass to improve throughput and efficiency. Wrap each conversation in its own list, then pass them together as a list of lists.
```py
messages = [
    [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": "https://example.com/pastry.jpg"},  # placeholder URL
                {"type": "text", "text": "What type of pastry is this?"},
            ],
        },
    ],
    [
        {
            "role": "user",
            "content": [{"type": "text", "text": "Explain the French Bread Law."}],
        },
    ],
]
```
## Multi-turn
Conversations span multiple exchanges, alternating between `"user"` and `"assistant"` roles. Each turn adds a new message to the list, giving the model access to the full conversation history. This context helps the model generate more appropriate responses.
```py
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://example.com/pastry.jpg"},  # placeholder URL
            {"type": "text", "text": "What pastry is shown in the image?"},
        ],
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": "This is kouign amann, a laminated dough pastry (i.e., dough folded with layers of butter) that also incorporates sugar between layers so that during baking the sugar caramelizes."}],
    },
]
```
# Continuous batching
Continuous batching (CB) is an advanced technique that optimizes the inference of transformer models by dynamically grouping multiple requests into batches. It maximizes GPU utilization, increases throughput, and reduces latency by using dynamic scheduling to rearrange the batch at each step: completed requests are removed and new requests join immediately so the GPU never idles. Chunked prefill prevents expensive prefill work from stalling the batch while still allowing new requests to join.
We are particularly interested in having Continuous Batching in transformers for the following use cases:
- Evaluation of models on large datasets with variable-length inputs
- Generating outputs for multiple sequences for GRPO policies
Continuous batching works with [transformers serve](./serving), a server for deploying local models, and [`~ContinuousMixin.generate_batch`].
CB is what makes inference engines like vLLM or SGLang efficient. That said, Transformers does not aim to be a production-ready inference engine but a complete framework for model development, which is why CB is also available in `transformers serve`.
## generate_batch
If you are not familiar with some of the core concepts CB is built upon, we invite you to read the associated blog post: [Continuous Batching: Efficient Inference for Large Language Models](https://huggingface.co/blog/continuous_batching).
The main way to use CB in transformers is via the `generate_batch` method.
Unlike `generate`, CB takes already tokenized inputs, known as input IDs. Each sequence of input IDs is represented as a list of integers (`list[int]`). For a more detailed example, please refer to: [examples/continuous_batching](./path/to/example)

### `generate_batch` example

All autoregressive text models support CB through a `ContinuousMixin` that `GenerationMixin` inherits from, which adds the `generate_batch` method.
The [`~ContinuousMixin.generate_batch`] method works with all autoregressive text models. It accepts a list of tokenized inputs and a [`GenerationConfig`] to configure generation settings.
```py
import datasets
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

# the model and dataset are illustrative; any autoregressive text model works
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B-Instruct-2507", dtype=torch.bfloat16, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")

dataset = datasets.load_dataset("openai/gsm8k", "main", split="test[:8]")
simple_batch_inputs = [tokenizer(item["question"])["input_ids"] for item in dataset]

batch_outputs = model.generate_batch(
    inputs=simple_batch_inputs,
    generation_config=GenerationConfig(max_new_tokens=32, do_sample=False),
)
# batch_outputs maps request ids to their generation outputs
for request_id, output in batch_outputs.items():
    print(request_id, tokenizer.decode(output.generated_tokens, skip_special_tokens=True))
```
If you want more control over how requests are scheduled, you can use the `ContinuousBatchingManager` class directly. This is what `transformers serve` uses, because requests arrive asynchronously and the asynchronous nature of the CB process can be leveraged for efficiency.
## ContinuousBatchingManager
Under the hood, the [`ContinuousBatchingManager`] creates a background thread that receives requests from a Python `queue.Queue` and batches them into each forward pass, filling the GPU to capacity. Every iteration checks for finished requests and schedules new ones to join the batch. The manager is thread-safe, so use it to customize request scheduling.
Call [`~ContinuousMixin.init_continuous_batching`] to initialize the manager with a [`GenerationConfig`] and [`~ContinuousBatchingManager.start`] the background thread.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from transformers.generation.continuous_batching import RequestStatus

# the model and prompts are illustrative
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B-Instruct-2507", dtype=torch.bfloat16, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
simple_batch_inputs = [tokenizer(text)["input_ids"] for text in ["Hello, how are you?", "What is the capital of France?"]]

manager = model.init_continuous_batching(generation_config=GenerationConfig(max_new_tokens=32))
manager.start()
```
Use [`~ContinuousBatchingManager.add_request`] to asynchronously submit individual requests. Provide a specific request id or the manager generates one automatically.

```py
# this is for demonstration purposes only, in practice this is most useful to do concurrently
for i, input_ids in enumerate(simple_batch_inputs):
    # if you do not specify a request_id, one will be generated for you
    manager.add_request(input_ids=input_ids, request_id=f"request_{i}")

# retrieving results can be done in another thread
for id, request in manager.get_result():
    print(id, request)
```
Use the `request_id` of a specific request to get its results. This is a blocking operation that waits until the result is ready.
```py
result = manager.get_result(request_id="request_5")
```
Stream partial results for a specific request with [`~ContinuousBatchingManager.request_id_iter`].
```py
manager.add_request(
    input_ids=input_ids,
    request_id="streaming_request",
    stream=True,
)

for chunk in manager.request_id_iter(request_id="streaming_request"):
    # FIXME: stop iteration in `request_id_iter` when finished instead of doing it externally
    if chunk.status == RequestStatus.FINISHED:
        break
```
Call [`~ContinuousBatchingManager.stop`] to terminate the manager.
```py
# stop the background thread before exiting the process
manager.stop()
```
## PagedAttention

PagedAttention breaks large key-value caches into smaller, non-contiguous fixed-size pages to avoid GPU memory fragmentation and support variable-length requests. Transformers automatically enables PagedAttention when using continuous batching.

You can also explicitly enable PagedAttention when instantiating a model rather than waiting for [`~ContinuousMixin.generate_batch`] to enable it dynamically.

```py
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-4B-Instruct-2507",
    attn_implementation="paged|flash_attention_2",
    device_map="cuda",
    dtype=torch.bfloat16,
)
```

## Supported and unsupported features

Continuous batching currently supports:

- Dynamic scheduling of variable-length requests
- Chunked prefill
- Paged attention cache
- Sliding window attention
- Chat templates

The following features are not yet supported, but we plan to add them:

- Prefix caching
- Beam search
- Tool calling

Other features are unplanned, but we may consider adding them depending on community requests.

## Sliding window attention

Sliding window attention limits a token's backward context to a fixed window to save compute. Generation cost stays proportional to the window size, which reduces compute per step and simplifies continuous batching.

Transformers models like Mistral and Gemma 2 natively support sliding window attention. Manually enable it in the model config if the architecture supports it, which helps with fine-tuning or running custom experiments. Usage remains the same with [`~ContinuousMixin.generate_batch`].
You can use CB in `transformers serve` by passing the `--continuous-batching` flag when starting the server.
## How it works

The [`ContinuousMixin`] class serves as the main interface for continuous batching through [`~ContinuousMixin.generate_batch`]. This method internally creates a [`ContinuousBatchingManager`].

[`ContinuousBatchingManager`] manages requests by creating a background thread for the generation loop and adding requests to the queue. The manager is thread-safe, allowing asynchronous request additions while the model generates.

The [`Scheduler`] selects requests for processing at each step based on the token budget. [`FIFOScheduler`] is the default scheduler. It prioritizes decoding requests over prefilling requests and assigns them to specific memory blocks. [`PrefillFirstScheduler`] prioritizes prefill requests instead.

[`ContinuousBatchingManager`] runs the model forward pass for the scheduled requests. It then collects and returns the results.

## Monitoring

We have added `opentelemetry` support to continuous batching to help you monitor its performance in production. To enable it, install the `opentelemetry` extra when installing `transformers`:

```sh
# this installs `opentelemetry-api`, `opentelemetry-sdk` and `opentelemetry-exporter-otlp`
pip install transformers[open-telemetry]
```

This enables trace and metric collection in CB. You then need to set up a backend to collect and visualize the traces and metrics.
## Resources
The [Continuous batching](https://huggingface.co/blog/continuous_batching) blog post explains KV caching, chunked prefill, and ragged batching with dynamic scheduling in more detail.
Quantization reduces the size of model weights by storing them in a lower precision.
If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can increase latency slightly (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights.
> [!TIP]
> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, VPTQ, AWQ, and GPT-QModel. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post for a comparison of different approaches.
Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating the memory required to load [Mistral-7B-v0.1](https://hf.co/mistralai/Mistral-7B-v0.1).
Overall, we saw that running OctoCoder in 8-bit precision substantially reduced the required GPU VRAM.
4-bit quantization allows the model to be run on GPUs such as RTX3090, V100, and T4 which are quite accessible for most people.
For more information on quantization and to see how one can quantize models to require even less GPU VRAM memory than 4-bit, we recommend looking into the [`GPT-QModel`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#gptqmodel) implementation.
> In conclusion, it is important to remember that model quantization trades improved memory efficiency against accuracy and, in some cases, inference time.
*This model was released on 2025-05-06 and added to Hugging Face Transformers on 2025-12-02.*
# FastVLM
Setting it for the entire model will result in an error.
### Formatting Prompts with Chat Templates
Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor’s `apply_chat_template` method.
**Important:**
- You must construct a conversation history — passing a plain string won't work.
- Each message should be a dictionary with `"role"` and `"content"` keys.
- The `"content"` should be a list of dictionaries for different modalities like `"text"` and `"image"`.
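As a minimal sketch of that structure (the image path is a placeholder):

```python
# conversation structure for apply_chat_template; "document.png" is a placeholder path
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "path": "document.png"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
```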
*This model was released on 2025-12-01 and added to Hugging Face Transformers on 2025-12-01.*
**Official Website**: [Baidu AI Studio](https://aistudio.baidu.com/paddleocr) | **arXiv**: [Technical Report](https://arxiv.org/pdf/2510.14528)
**PaddleOCR-VL** is a SOTA and resource-efficient model tailored for document parsing. Its core component is PaddleOCR-VL-0.9B, a compact yet powerful vision-language model (VLM) that integrates a NaViT-style dynamic resolution visual encoder with the ERNIE-4.5-0.3B language model to enable accurate element recognition. This innovative model efficiently supports 109 languages and excels in recognizing complex elements (e.g., text, tables, formulas, and charts), while maintaining minimal resource consumption. Through comprehensive evaluations on widely used public benchmarks and in-house benchmarks, PaddleOCR-VL achieves SOTA performance in both page-level document parsing and element-level recognition. It significantly outperforms existing solutions, exhibits strong competitiveness against top-tier VLMs, and delivers fast inference speeds. These strengths make it highly suitable for practical deployment in real-world scenarios.
1. **Compact yet Powerful VLM Architecture:** We present a novel vision-language model that is specifically designed for resource-efficient inference, achieving outstanding performance in element recognition. By integrating a NaViT-style dynamic high-resolution visual encoder with the lightweight ERNIE-4.5-0.3B language model, we significantly enhance the model’s recognition capabilities and decoding efficiency. This integration maintains high accuracy while reducing computational demands, making it well-suited for efficient and practical document processing applications.
2. **SOTA Performance on Document Parsing:** PaddleOCR-VL achieves state-of-the-art performance in both page-level document parsing and element-level recognition. It significantly outperforms existing pipeline-based solutions and exhibits strong competitiveness against leading vision-language models (VLMs) in document parsing. Moreover, it excels in recognizing complex document elements, such as text, tables, formulas, and charts, making it suitable for a wide range of challenging content types, including handwritten text and historical documents.
3. **Multilingual Support:** PaddleOCR-VL supports 109 languages, covering major global languages, including but not limited to Chinese, English, Japanese, Latin, and Korean, as well as languages with different scripts and structures, such as Russian (Cyrillic script), Arabic, Hindi (Devanagari script), and Thai. This broad language coverage substantially enhances the applicability of our system to multilingual and globalized document processing scenarios.
> We currently recommend using the [PaddleOCR official method for inference](https://www.paddleocr.ai/latest/en/version3.x/pipeline_usage/PaddleOCR-VL.html), as it is faster and supports page-level document parsing.
> The example code below only supports element-level recognition.
We have four types of element-level recognition:
- Text recognition, indicated by the prompt `OCR:`.
- Formula recognition, indicated by the prompt `Formula Recognition:`.
- Table recognition, indicated by the prompt `Table Recognition:`.
- Chart recognition, indicated by the prompt `Chart Recognition:`.
The following examples are all based on text recognition, with the prompt `OCR:`.
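For example, a text-recognition message pairs an image with the `OCR:` prompt (the image URL is a placeholder):

```python
# element-level text recognition request; the URL is a placeholder
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://example.com/receipt.png"},
            {"type": "text", "text": "OCR:"},
        ],
    }
]
```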
### Single input inference
The example below demonstrates how to generate text with PaddleOCRVL using [`Pipeline`] or the [`AutoModel`].
```py
# `inputs` and `outputs` come from the preceding processor and `model.generate` calls (not shown)
result = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:-1])
print(result)
```
</hfoption>
</hfoptions>
### Batched inference
PaddleOCRVL also supports batched inference. We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Here is how you can do it with PaddleOCRVL using [`Pipeline`] or the [`AutoModel`]:
```py
# `inputs` and `generated_ids` come from the preceding processor and `model.generate` calls (not shown)
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
result = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(result)
```
</hfoption>
</hfoptions>
### Using Flash Attention 2
Flash Attention 2 is a faster, optimized attention implementation. Refer to the [FlashAttention](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention) section for details.
For example:
```shell
pip install flash-attn --no-build-isolation
```
```python
from transformers import AutoModelForImageTextToText
model = AutoModelForImageTextToText.from_pretrained("PaddlePaddle/PaddleOCR-VL", dtype="bfloat16", attn_implementation="flash_attention_2")
```
# Overview
Transformers provides multiple inference optimization techniques to make models fast, affordable, and accessible. Options include alternative attention mechanisms for reduced memory traffic, code compilation for faster execution, and optimized kernels for throughput. Stack these techniques for maximum performance.
> [!NOTE]
> Memory and speed are closely related but not the same. Shrinking your memory footprint makes a model "faster" because there is less data to move around. Pure speed optimizations don't always reduce memory and sometimes increase usage. Choose the appropriate optimization based on your use case and hardware.
This guide gives you a quick start on Transformers optimizations. Use the table below to pick an optimization technique.
## Compilation
[torch.compile](./perf_torch_compile) reduces Python overhead, fuses operations, and creates kernels tuned for your shapes and hardware. The first run warms it up and subsequent runs use the faster compiled path.
Pass a [fixed size cache](./kv_cache#fixed-size-cache) to [`~GenerationMixin.generate`] to trigger `torch.compile` automatically.
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# the model and prompt are illustrative
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", dtype=torch.bfloat16, device_map="cuda")
inputs = tokenizer("Plants generate energy through", return_tensors="pt").to(model.device)
# passing a fixed-size ("static") cache triggers torch.compile automatically
outputs = model.generate(**inputs, cache_implementation="static", max_new_tokens=20)
```
> Avoid calling `torch.compile(model)` outside of [`~GenerationMixin.generate`] to prevent the model from recompiling every step.
## Attention backends
Alternative [attention backends](./attention_interface) lower memory traffic. For example, FlashAttention tiles attention computations and avoids large intermediate tensors to reduce memory footprint.
Set `attn_implementation` in [`~PreTrainedModel.from_pretrained`] to load an optimized attention backend.
```py
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", attn_implementation="flash_attention_2")
```
## Kernels
Kernels fuse operations to boost throughput and reduce memory usage. The [Kernels](https://huggingface.co/docs/kernels/en/index) library loads optimized compute kernels from the [Hub](https://huggingface.co/kernels-community) in a flexible and version-safe way.
The example below loads an optimized FlashAttention-2 kernel without installing the package.
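As a sketch (the model id is illustrative), pass the kernel's Hub repo id to `attn_implementation`:

```python
from transformers import AutoModelForCausalLM

# "kernels-community/flash-attn" is fetched from the Hub; flash-attn itself is not pip-installed
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",
    attn_implementation="kernels-community/flash-attn",
)
```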
## Quantization

[Quantization](./quantization/overview) shrinks the size of every parameter, which lowers memory footprint and can increase speed because there is less data to move.
Pass a quantization config to the `quantization_config` argument in [`~PreTrainedModel.from_pretrained`]. Each quantization backend has a different config with different arguments. The example below quantizes a model to 4-bits and configures the computation dtype with the [bitsandbytes](./quantization/bitsandbytes) backend.
```py
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# the model id is illustrative
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", quantization_config=quantization_config)
```
## Caching

[Caching](./kv_cache) speeds up generation by reusing past keys and values instead of recomputing them for every token. To offset the memory cost of storing past keys and values, Transformers supports offloading the cache to the CPU so that only the current layer's cache remains on the GPU.
Use the `cache_implementation` argument in [`~GenerationMixin.generate`] to set a cache strategy.
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# the model and prompt are illustrative
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", dtype=torch.bfloat16, device_map="cuda")
inputs = tokenizer("Explain KV caching", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, cache_implementation="offloaded", max_new_tokens=20)
```
## Parallelism

[Parallelism](./perf_infer_gpu_multi) distributes a model across devices so that models too big for one device run fast. This approach uses more total memory due to sharding overhead and communication to sync results.
[Tensor parallelism](./perf_infer_gpu_multi) splits a model layer across devices. Set `tp_plan="auto"` in [`~PreTrainedModel.from_pretrained`] to enable it.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", tp_plan="auto")
print(model._tp_plan)
```
## Continuous batching
[Continuous batching](./continuous_batching) maximizes throughput by keeping the GPU busy with dynamic scheduling and chunked prefill. [Serving](./serving) applications use it to process multiple incoming requests concurrently.
Use [`~ContinuousMixin.generate_batch`] to enable continuous batching.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

# the model and prompts are illustrative
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", dtype=torch.bfloat16, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
inputs = [tokenizer(text)["input_ids"] for text in ["Hello, how are you?", "What is the capital of France?"]]
outputs = model.generate_batch(inputs=inputs, generation_config=GenerationConfig(max_new_tokens=16))
```
The most important part of DTensor is the `placement` attribute because it tells PyTorch how a tensor is distributed across devices.
- `Partial()` - Indicates a tensor is pending a reduction operation (not typically relevant for usage in Transformers).
## Resources
Read the [Tensor Parallelism (TP) in Transformers: 5 Minutes to Understand](https://huggingface.co/blog/qgallouedec/tp) blog post for a quick overview of tensor parallelism and learn how column and row parallel setups differ.
Transformers is a PyTorch-first library. It provides models that are faithful to their original implementations.
A longer, in-depth article with examples, visualizations and timelines is available [here](https://huggingface.co/spaces/transformers-community/Transformers-tenets) as our canonical reference.
> [!NOTE]
> Our philosophy evolves through practice. What follows are our current, stable principles.
# Contribute
Transformers supports many quantization methods such as QLoRA, GPTQ, LLM.int8, and AWQ. However, there are still many more quantization approaches that haven't been integrated yet. To make adding and using these quantization methods with Transformers easier, use the [`~quantizers.HfQuantizer`] class. [`~quantizers.HfQuantizer`] is designed to be an internal helper class for adding a quantization method instead of something applied to every PyTorch module.
This guide will show you how to integrate a new quantization method with [`~quantizers.HfQuantizer`].
Before integrating a new quantization method into Transformers, ensure the method meets the following requirements:
- The method can run on commonly-used hardware (CPU, GPU, etc.).
- The method is wrapped in a [nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) ([`~bitsandbytes.nn.Linear8bitLt`], [`~bitsandbytes.nn.Linear4bit`]), and the quantized linear layer should have the following definition.
```py
class Linear4bit(nn.Module):
def __init__(self, ...):
...
def forward(self, x):
return my_4bit_kernel(x, self.weight, self.bias)
```
This way, Transformers models are easily quantized by replacing instances of [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) with a target class.
- The quantization method should be serializable. You can save the quantized weights locally or push them to the Hub.
- Make sure the package containing the quantization kernels/primitive is stable (no frequent breaking changes).
Some quantization methods may require "pre-quantizing" the model through data calibration.
## Create new HFQuantizer class
0. The best starting point would be to have a look at another quantization method such as Finegrained Fp8. You will have to update or create three files in total: the [config file](https://github.com/huggingface/transformers/blob/main/src/transformers/utils/quantization_config.py), the [integration file](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/finegrained_fp8.py) and the [quantizer file](https://github.com/huggingface/transformers/blob/main/src/transformers/quantizers/quantizer_finegrained_fp8.py).
1. Create a new quantization config class inside [src/transformers/utils/quantization_config.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/utils/quantization_config.py). Add the new quantization config to the [\_import_structure](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/__init__.py#L1088) inside Transformers' [src/transformers/\_\_init\_\_.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/__init__.py) file.
2. Create a new file inside [src/transformers/quantizers/](https://github.com/huggingface/transformers/tree/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers) named `quantizer_your_method.py`, and make it inherit from [`~quantizers.HfQuantizer`]. Make sure to add the new quantizer and quantization config in the quantization auto-mapping in [src/transformers/quantizers/auto.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers/auto.py).
3. Define the following class attributes and property methods for your quantization method:
- `requires_calibration`: Whether the quantization method requires a data calibration process. If set to `True`, you can only support inference (with quantized weights) and not inference and quantization.
- `required_packages`: A list of strings of the required packages to use the quantized weights. You might need to define some new utility methods such as `is_auto_awq_available` in [transformers/src/utils/import_utils.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/utils/import_utils.py).
- `requires_parameters_quantization`: Only required if your quantization method requires extra attention to the underlying [nn.Parameter](https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html) object. For example, bitsandbytes uses [`~bitsandbytes.nn.Params4bit`] and [`~bitsandbytes.nn.Int8Params`], which require some extra attention when quantizing the model. Most recent quantization methods pack int2 and int4 weights inside [torch.uint8](https://pytorch.org/docs/stable/tensors.html) weights, so this flag should not really be required (set to `False` by default).
- `is_serializable`: A property method to determine whether the method is serializable or not.
- `is_trainable`: A property method to determine whether you can fine-tune models on top of the quantization method (with or without PEFT approaches).
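The int4-packed-into-uint8 layout mentioned above can be sketched as follows (a minimal illustration of the idea, not any particular backend's storage format):

```python
import torch

# Pack two int4 values (0..15) into each uint8 byte, as many quantization
# backends do before storing weights; the layout here is purely illustrative.
def pack_int4(x):
    x = x.reshape(-1, 2)
    return (x[:, 0] | (x[:, 1] << 4)).to(torch.uint8)

def unpack_int4(packed):
    low = packed & 0x0F
    high = (packed >> 4) & 0x0F
    return torch.stack([low, high], dim=1).reshape(-1)

vals = torch.tensor([1, 7, 15, 0], dtype=torch.uint8)
packed = pack_int4(vals)       # 2 bytes instead of 4
restored = unpack_int4(packed)
```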
4. Write the `validate_environment` and `update_dtype` methods. These methods are called before creating the quantized model to ensure users use the right configuration. Refer to other quantizers for an example of how it is implemented.
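A hedged sketch of what these two methods typically do (the package name is hypothetical; real quantizers usually also check accelerator availability and library versions):

```python
import importlib.util

import torch

def validate_environment():
    # fail early if the required kernel package is missing ("my_quant_kernels" is hypothetical)
    if importlib.util.find_spec("my_quant_kernels") is None:
        raise ImportError("Using this quantization method requires the `my_quant_kernels` package")

def update_dtype(dtype):
    # fall back to a half-precision dtype when the user did not request one
    return dtype if dtype is not None else torch.float16
```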
5. Write the `_process_model_before_weight_loading` method. In Transformers, the quantized models are initialized first on the `"meta"` device before loading the weights. This means the `_process_model_before_weight_loading` method takes care of manipulating the model skeleton to replace some modules ([nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)) with the target modules (quantization modules).
You can define module replacement logic or any other utility method by creating a new file in [transformers/src/integrations/](https://github.com/huggingface/transformers/tree/abbffc4525566a48a9733639797c812301218b83/src/transformers/integrations) and exposing the relevant methods in that folder's `__init__.py` file. The best starting point would be to have a look at another quantization method such as [quantizer_awq.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers/quantizer_awq.py).
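As a rough sketch of what such a skeleton manipulation can look like (the quantized module below is a hypothetical stand-in, not a real integration):

```python
import torch
from torch import nn

class QuantLinearStub(nn.Module):
    """Hypothetical quantized replacement for nn.Linear (skeleton only)."""
    def __init__(self, in_features, out_features, bias=True, device=None):
        super().__init__()
        # int-packed weight buffer and scales instead of a floating-point parameter
        self.register_buffer("qweight", torch.empty(out_features, in_features, dtype=torch.uint8, device=device))
        self.register_buffer("scale", torch.empty(out_features, 1, device=device))
        self.bias = nn.Parameter(torch.empty(out_features, device=device)) if bias else None

def replace_linears(module, modules_to_not_convert=()):
    # recursively swap nn.Linear modules while everything still lives on "meta"
    for name, child in module.named_children():
        if isinstance(child, nn.Linear) and name not in modules_to_not_convert:
            setattr(module, name, QuantLinearStub(child.in_features, child.out_features,
                                                  child.bias is not None, device="meta"))
        else:
            replace_linears(child, modules_to_not_convert)
    return module

with torch.device("meta"):
    model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
model = replace_linears(model)
```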
6. Add the `get_quantize_ops` method to the quantizer class if the quantization method supports quantizing on the fly. In Transformers, we materialize each tensor and apply a sequence of operations on it; in this case, the quantization operation happens at the end. You need to create an `XXXQuantize` subclass of `ConversionOps` and add a `convert` method. In the `convert` method, you quantize the weights and return a dictionary of quantized params.
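A minimal sketch of such an op (the class name, method signature, and the rowwise int8 scheme are illustrative assumptions, not the actual `ConversionOps` API):

```python
import torch

class Int8RowwiseQuantize:
    """Illustrative quantize op: returns the quantized weight plus its scales."""
    def convert(self, name, tensor):
        # per-row absmax scale, then round to int8
        scale = tensor.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0
        qweight = torch.round(tensor / scale).to(torch.int8)
        return {name: qweight, f"{name}_scale": scale}

op = Int8RowwiseQuantize()
out = op.convert("model.layers.0.mlp.weight", torch.randn(4, 8))
```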
7. Add the `get_weight_conversions` method to the quantizer class if the quantization method supports loading pre-quantized weights. In Transformers, we can collect multiple tensors and apply operations on them. This is particularly useful when tensors in the checkpoint need to be regrouped to re-create the quantized tensors.
8. Write the `_process_model_after_weight_loading` method if needed. This method enables implementing additional features that require manipulating the model after loading the weights.
9. Document everything! Make sure your quantization method is documented by adding a new file under `docs/source/en/quantization`.
10. You should add tests by adding the package in our nightly Dockerfile inside `docker/transformers-quantization-latest-gpu` and then adding a new test file in `tests/quantization/xxx`. Feel free to check out existing quantization methods to see how it is implemented.
Use this when loading a **pre-quantized checkpoint** where the quantized weights are saved as several separate components (such as data, scale, and zero point), and these need to be combined into one tensor during loading. Not all quantization methods require this reconstruction step: for example, some methods like FP8 simply load weights and scales as-is, without combining them. Others, such as torchao, do require reassembling the quantized tensor from its multiple saved components.
The `WeightConverter` collects related tensors based on `source_patterns`, then passes them to your `convert` method:
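For example (a sketch with illustrative names and a simple data/scale/zero-point layout; the real signature depends on the quantizer):

```python
import torch

class RebuildQuantizedWeight:
    """Illustrative convert: reassemble a weight saved as separate components."""
    def convert(self, tensors, target_key):
        # the ".data"/".scale"/".zero_point" suffixes are assumptions for this sketch
        data = tensors[f"{target_key}.data"].float()
        scale = tensors[f"{target_key}.scale"]
        zero_point = tensors[f"{target_key}.zero_point"].float()
        return {target_key: (data - zero_point) * scale}

op = RebuildQuantizedWeight()
collected = {
    "w.data": torch.tensor([[10, 20]], dtype=torch.int8),
    "w.scale": torch.tensor(0.5),
    "w.zero_point": torch.tensor(0),
}
out = op.convert(collected, "w")  # a single reassembled tensor under "w"
```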
# GPTQ
The [GPT-QModel](https://github.com/ModelCloud/GPTQModel) project (Python package `gptqmodel`) implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save memory usage by 4x because the int4 weights are dequantized in a fused kernel rather than in a GPU's global memory. Inference is also faster because a lower bitwidth takes less time to communicate.
> [!WARNING]
> AutoGPTQ is no longer supported in Transformers. Install [GPT-QModel](https://github.com/ModelCloud/GPTQModel) instead.
Install Accelerate, Transformers and Optimum first.
Then run the command below to install GPT-QModel.

```bash
pip install gptqmodel --no-build-isolation
```
Create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calibrate the weights for quantization, and a tokenizer to prepare the dataset.
```py
from transformers import AutoModelForCausalLM, GPTQConfig

model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=GPTQConfig(bits=4, backend="marlin"))
```
## ExLlama
> [!WARNING]
> Only 4-bit models are supported, and we recommend deactivating the ExLlama kernels if you're finetuning a quantized model with PEFT.
[ExLlama](https://github.com/turboderp/exllama) is a Python/C++/CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object.
To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter in [`GPTQConfig`].
```py
import torch
from transformers import AutoModelForCausalLM, GPTQConfig

# enable the ExLlamaV2 kernels
gptq_config = GPTQConfig(bits=4, exllama_config={"version": 2})
model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config)
```
The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ 0.4.2+, disable the ExLlama kernel in [`GPTQConfig`]. This overwrites the attributes related to the ExLlama kernels in the quantization config of the `config.json` file.
```py
import torch
from transformers import AutoModelForCausalLM, GPTQConfig

# disable the ExLlama kernels for CPU inference
gptq_config = GPTQConfig(bits=4, use_exllama=False)
model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config)
```
GPT-QModel is the actively maintained backend for GPTQ in Transformers. It was originally forked from AutoGPTQ, but has since diverged with significant improvements such as faster quantization, lower memory usage, and more accurate defaults.
GPT-QModel provides asymmetric quantization, which can potentially lower quantization errors compared to symmetric quantization. It is not backward compatible with legacy AutoGPTQ checkpoints, and not all kernels (Marlin) support asymmetric quantization.
GPT-QModel also has broader support for the latest LLM models, multimodal models (Qwen2-VL and Ovis1.6-VL), platforms (Linux, macOS, Windows 11), and hardware (AMD ROCm, Apple Silicon, Intel/AMD CPUs, and Intel Datacenter Max/Arc GPUs, etc.).
The Marlin kernels are also updated for A100 GPUs and other kernels are updated to include auto-padding for legacy models and models with non-uniform in/out-features.
## Resources
Run the GPTQ quantization with PEFT [notebook](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb?usp=sharing) for a hands-on experience.
torchao implements [torch.Tensor subclasses](https://pytorch.org/docs/stable/notes/extending.html#subclassing-torch-tensor) for maximum flexibility in supporting new quantized torch.Tensor formats. [Safetensors](https://huggingface.co/docs/safetensors/en/index) serialization and deserialization does not work with torchao.
To avoid arbitrary user code execution, torchao sets `weights_only=True` in [torch.load](https://pytorch.org/docs/stable/generated/torch.load.html) to ensure only tensors are loaded. Any known user functions can be whitelisted with [add_safe_globals](https://pytorch.org/docs/stable/notes/serialization.html#torch.serialization.add_safe_globals).
Saving the quantized model with `save_pretrained` (in [safetensors](https://huggingface.co/docs/safetensors/en/index) format) is only supported for torchao >= v0.15. For any version below, it is only possible to manually save as unsafe `.bin` checkpoints with [torch.save](https://docs.pytorch.org/docs/stable/generated/torch.save.html).
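A minimal sketch of the `weights_only` behavior described above (a stdlib-only round-trip, not torchao-specific):

```python
import io

import torch

# Serialize a plain tensor state dict and load it back with weights_only=True,
# which restricts unpickling to tensors and other allowlisted types.
buffer = io.BytesIO()
torch.save({"weight": torch.ones(2, 2)}, buffer)
buffer.seek(0)
state = torch.load(buffer, weights_only=True)
```

Anything outside the allowlist would raise during loading unless registered with `add_safe_globals`.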
This model has a [chat template](./chat_templating) that helps users parse chat outputs. The model can also accept multiple images as input in a single conversation or message. We will now prepare the inputs.
The image inputs look like the following.
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg" alt="A bee on a pink flower"/>
```
## ['In this image we can see flowers, plants and insect.']
```
## Pipeline
VLMs are often large and need to be optimized to fit on smaller hardware.
First, install dependencies.
```bash
pip install -U optimum-quanto bitsandbytes
```
To quantize a model during loading, first create a [`QuantoConfig`], then load the model as usual but pass `quantization_config` during model initialization.
```python
from transformers import AutoModelForImageTextToText, QuantoConfig

# model_id is the checkpoint id used earlier in this guide (assumption)
quanto_config = QuantoConfig(weights="int8")
model = AutoModelForImageTextToText.from_pretrained(model_id, device_map="auto", quantization_config=quanto_config)

# running the same generation as before now prints:
## ['In this image, we see two tabby cats resting on a large, tangled pile of fishing nets. The nets are a mix of brown, orange, and red colors, with some blue and green ropes visible in the background. The cats appear relaxed and comfortable, nestled into the fibers of the nets. One cat is in the foreground, looking slightly to the side, while the other is positioned further back, looking directly at the camera. The scene suggests a coastal or fishing-related setting, possibly near']
```
And that's it, we can use the model the same way with no changes.
Here are some more resources for the image-text-to-text task.
- [Image-text-to-text task page](https://huggingface.co/tasks/image-text-to-text) covers model types, use cases, datasets, and more.
- [Vision Language Models Explained](https://huggingface.co/blog/vlms) is a blog post that covers everything about vision language models and supervised fine-tuning using [TRL](https://huggingface.co/docs/trl/en/index).
- [Learn how to fine-tune vision language models using TRL](https://huggingface.co/blog/trl-vlm-alignment)
Mask generation models are trained on large amounts of data and operate in two modes.
- Prompting mode: In this mode, the model takes in an image and a prompt, where a prompt can be a 2D point location (XY coordinates) in the image within an object or a bounding box surrounding an object. In prompting mode, the model only returns the mask over the object
that the prompt is pointing out.
- Segment Everything mode: In segment everything, given an image, the model generates every mask in the image. To do so, a grid of points is generated and overlaid on the image for inference.
- Video Inference: The model accepts a video, and a point or box prompt in a video frame, which is tracked throughout the video. You can get more information on how to do video inference by following [SAM 2 docs](../model_doc/sam2).
The mask generation task is supported by [Segment Anything Model (SAM)](../model_doc/sam) and [Segment Anything Model 2 (SAM 2)](../model_doc/sam2), while video inference is supported by [SAM 2](../model_doc/sam2). SAM is a powerful model that consists of a Vision Transformer-based image encoder, a prompt encoder, and a two-way transformer mask decoder. Images and prompts are encoded, and the decoder takes these embeddings and generates valid masks. SAM 2 extends SAM by adding a memory module to track masks across frames.
We will fine-tune SAM2.1 on a small part of the MicroMat dataset for image matting. We need to install the [monai](https://github.com/Project-MONAI/MONAI) library to use the DICE loss, and [trackio](https://huggingface.co/docs/trackio/index) to log the masks during training.
Now we can define our dataset for loading the data. SAMDataset wraps our dataset and formats each sample the way the SAM processor expects. So instead of raw images and masks, you get processed images, bounding boxes, and ground-truth masks ready for training.
By default, the processor resizes images, so in addition to the images and masks it also returns their original sizes. We also need to binarize the mask, since it has values in [0, 255].
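The binarization step can be as simple as thresholding (a sketch; the midpoint threshold of 127 is an assumption):

```python
import torch

# Map a {0, 255}-valued mask to a {0, 1}-valued one.
mask = torch.tensor([[0, 255], [255, 0]])
binary_mask = (mask > 127).long()
```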
We need to define a data collator that turns ground-truth masks of varying sizes into batches of masks reshaped to the same shape, using nearest-neighbor interpolation. We also build batched tensors for the rest of the elements in the batch. If your masks are all the same size, feel free to skip this step.
```python
import torch
import torch.nn.functional as F

def collate_fn(batch, target_hw=(256, 256)):
    pixel_values = torch.cat([item["pixel_values"] for item in batch], dim=0)
    original_sizes = torch.stack([item["original_sizes"] for item in batch])
    input_boxes = torch.cat([item["input_boxes"] for item in batch], dim=0)
    # reshape the varying-size ground-truth masks to one shape (nearest neighbor);
    # the "ground_truth_mask" key is assumed from the SAMDataset defined above
    masks = torch.stack(
        [F.interpolate(item["ground_truth_mask"][None, None].float(), size=target_hw, mode="nearest")[0, 0] for item in batch]
    )
    return {"pixel_values": pixel_values, "original_sizes": original_sizes, "input_boxes": input_boxes, "ground_truth_mask": masks}
```
We need to log our predictions to trackio so we can monitor how the model improves during training.
Great improvement after only training for 20 epochs on a small dataset!

Text-to-speech (TTS) is the task of creating natural-sounding speech from text, where the speech can be generated in multiple
languages and for multiple speakers. Several text-to-speech models are currently available in 🤗 Transformers, such as [Dia](../model_doc/dia), [CSM](../model_doc/csm),
[Bark](../model_doc/bark), [MMS](../model_doc/mms), [VITS](../model_doc/vits) and [SpeechT5](../model_doc/speecht5).
You can easily generate audio using the `"text-to-audio"` pipeline (or its alias - `"text-to-speech"`).

Here's an example of how you would use the `"text-to-speech"` pipeline with [CSM](https://huggingface.co/sesame/csm-1b):
```py
>>> conversation = [
...     {"role": "0", "content": [{"type": "text", "text": "How much money can you spend?"}]},
... ]
>>> output = pipe(conversation)
```
Some models, like [Dia](https://huggingface.co/nari-labs/Dia-1.6B-0626), can also be conditioned to generate non-verbal communications such as laughing, sighing and crying, or even add music. Below is such an example:
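As a sketch of what such conditioning text looks like, speaker tags and parenthesized non-verbal cues are embedded directly in the input string (the exact tag syntax is an assumption based on Dia's format; the full pipeline call is omitted here):

```python
# Dia-style input: [S1]/[S2] mark speakers, parenthesized cues mark non-verbals.
text = "[S1] Welcome back to the show. (laughs) [S2] Glad to be here. (clears throat)"
speaker_tags = [tok for tok in text.split() if tok.startswith("[S")]
```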
[[open-in-colab]]
Video-text-to-text models, also known as video language models, are models that can process video and output text. These models can tackle various tasks, from video question answering to video captioning.
These models have nearly the same architecture as [image-text-to-text](../image_text_to_text) models except for some changes to accept video data, since video data is essentially image frames with temporal dependencies. Some image-text-to-text models take in multiple images, but this alone is inadequate for a model to accept videos.
Moreover, video-text-to-text models are often trained with all vision modalities. Each example might have videos, multiple videos, images and multiple images. Some of these models can also take interleaved inputs. For example, you can refer to a specific video inside a string of text by adding a video token in text like "What is happening in this video? `<video>`".
Note that these models process videos with no audio. [Any-to-any](../any-to-any) models on the other hand can process videos with audio in them.
In this guide, we provide a brief overview of video LMs and show how to use them with Transformers for inference.
To begin with, there are multiple types of video LMs:
- chat fine-tuned models for conversation
- instruction fine-tuned models
This guide focuses on inference with an instruction-tuned model, [llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf), which can take in interleaved data.
```python
import torch
from transformers import LlavaForConditionalGeneration

model = LlavaForConditionalGeneration.from_pretrained(model_id, device_map="auto", dtype=torch.float16)
```
Videos are a series of image frames. Depending on hardware limitations, downsampling is required; if the number of downsampled frames is too small, predictions will be low quality.
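A common downsampling strategy is uniform sampling over the frame indices, sketched below:

```python
# Uniformly pick num_frames indices from a clip with total_frames frames,
# taking the middle of each equally sized window.
def sample_frame_indices(total_frames, num_frames):
    step = total_frames / num_frames
    return [int(step * i + step / 2) for i in range(num_frames)]

indices = sample_frame_indices(100, 10)  # [5, 15, 25, ..., 95]
```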
Video-text-to-text models have processors with a video processor abstracted into them. You can pass video-inference-related arguments to the [`~ProcessorMixin.apply_chat_template`] function.
> [!WARNING]
> You can learn more about video processors [here](../main_classes/video_processor).
We can define our chat history, passing in video with a URL like below.
```python
# `video_url` is a hypothetical variable pointing to the video you want to describe
messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "url": video_url},
            {"type": "text", "text": "Describe what is happening in this video."},
        ],
    }
]
```
We can now call [`~GenerationMixin.generate`] for inference. The model output includes our input prompt, so we only take the text after the prompt and the `assistant` part from the model output.
You can preprocess the videos by passing in messages, setting `do_sample_frames` to `True`, and passing in `num_frames`. Here we sample 10 frames.
The inputs contain `input_ids` for the tokenized text, `pixel_values_videos` for the 10 frames, and `attention_mask` indicating which tokens the model should attend to.
# The first cat is shown in a relaxed state, with its eyes closed and a content expression, while the second cat is shown in a more active state, with its mouth open wide, possibly in a yawn or a vocalization.
We can now infer with our preprocessed inputs and decode them.
```py
#"The video features a fluffy, long-haired cat with a mix of brown and white fur, lying on a beige carpeted floor. The cat's eyes are wide open, and its whiskers are prominently visible. The cat appears to be in a relaxed state, with its head slightly"
```
You can also interleave multiple videos with text directly in the chat template like below.
```python
# `video_url_1` and `video_url_2` are hypothetical variables pointing to the videos
messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "url": video_url_1},
            {"type": "video", "url": video_url_2},
            {"type": "text", "text": "Describe similarities in these videos."},
        ],
    }
]
```
The inference remains the same as the previous example.
To learn more about chat templates and token streaming for video-text-to-text models, refer to the [image-text-to-text](../tasks/image_text_to_text) task guide because these models work similarly.
```py
#['Both videos feature a cat with a similar appearance, characterized by a fluffy white coat with black markings, a pink nose, and a pink tongue. The cat\'s eyes are wide open, and it appears to be in a state of alertness or excitement. ']
```
Quantization stores LLM weights in a lower precision to reduce their size. This lowers memory usage and makes it easier to load an LLM for inference when GPU memory is constrained. If you have enough GPU memory, you don't need to quantize the model, since the extra quantization and dequantization steps can add a small latency (except with AWQ and fused AWQ modules).
> [!TIP]
> There are a variety of quantization libraries (see the [Quantization](./quantization) guide for more details), including Quanto, AQLM, VPTQ, AWQ, and GPT-QModel. Try the library that best fits your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post, which compares gptqmodel and bitsandbytes.
Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating how much memory it takes to load [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1).
4-bit quantization lets you run the model on GPUs like the RTX 3090, V100, and T4, which are accessible to most people.
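The arithmetic behind such estimates is simple bytes-per-parameter math, sketched here for a 7B-parameter model (weights only, ignoring activation and overhead memory):

```python
# Approximate load-time weight memory at different precisions.
params = 7_000_000_000
bytes_per_param = {"float32": 4, "float16": 2, "int8": 1, "int4": 0.5}
memory_gib = {name: params * b / 2**30 for name, b in bytes_per_param.items()}
```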
To learn more about quantization, to quantize models to use even less GPU VRAM than 4-bit, or for more quantization-related information, we recommend referring to the [`GPT-QModel`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#gptqmodel) implementation.
> In conclusion, model quantization is a trade-off between improved memory efficiency and model accuracy, and in some cases it can also affect inference time.
Hugging Face official and community resources to help you get started with LLaMA2.
- A [notebook](https://colab.research.google.com/drive/1SYpgFpcmtIUzdE7pxqknrM4ArCASfkFQ?usp=sharing) on how to fine-tune a Llama 2 model on a personal computer using QLoRA and TRL. 🌎
⚡️ Inference
- A [notebook](https://colab.research.google.com/drive/1TC56ArKerXUpbgRy5vM3woRsbTEVNq7h?usp=sharing) on how to quantize a Llama 2 model using GPTQ from the GPT-QModel library. 🌎
- A [notebook](https://colab.research.google.com/drive/1X1z9Q6domMKl2CnEM0QGHNwidLfR4dW2?usp=sharing) on how to run a Llama 2 chat model with 4-bit quantization on a local computer or Google Colab. 🌎
To try GPTQ quantization with PEFT, check out this [notebook](https:
</Tip>
The [GPT-QModel](https://github.com/ModelCloud/GPTQModel) library implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find the version of the weights that minimizes error. The weights are quantized to int4 but restored to fp16 on the fly during inference. This saves memory usage by 4x because the int4 weights are dequantized in a fused kernel rather than in the GPU's global memory, and inference is expected to be faster because the lower bitwidth takes less time to communicate.
```py
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto")
```
## ExLlama [[exllama]]
[ExLlama](https://github.com/turboderp/exllama) is a Python/C++/CUDA implementation of the [Llama](model_doc/llama) model, designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, you can use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter:
```py
import torch
from transformers import AutoModelForCausalLM, GPTQConfig

gptq_config = GPTQConfig(bits=4, exllama_config={"version": 2})
model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config)
```
<Tip warning={true}>

Only 4-bit models are supported, and we recommend deactivating the ExLlama kernels if you're fine-tuning a quantized model with PEFT.

</Tip>
The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ 0.4.2 or later, you need to disable the ExLlama kernel. To do so, overwrite the attributes related to the ExLlama kernels in the quantization config of the `config.json` file.
```py
import torch
from transformers import AutoModelForCausalLM, GPTQConfig

gptq_config = GPTQConfig(bits=4, use_exllama=False)
model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config)
```
Now, when you start the training with `trainer.train()`, your metadata will be logged in Neptune.
**Note:** Although you can pass your **Neptune API token** and **project name** as arguments when creating the callback, the recommended way is to save them as environment variables:
| Environment variable | Description |
| --- | --- |
| `NEPTUNE_API_TOKEN` | Your Neptune API token. To find and copy it, click your Neptune avatar and select **Get your API token**. |
| `NEPTUNE_PROJECT` | The full name of your Neptune project (`workspace-name/project-name`). To find and copy it, head to **project settings** → **Properties**. |
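For example, set them in your shell before launching training (the values below are placeholders):

```shell
export NEPTUNE_API_TOKEN="your-api-token"
export NEPTUNE_PROJECT="workspace-name/project-name"
```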
For detailed instructions and examples, see the [Neptune docs](https://docs.neptune.ai/integrations/transformers/).
By default, this returns the vocab and merges with respect to their order; by sending `vocab_scores`, we order the merges with respect to the piece scores instead:

```python
sp = self.sp
vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}
merges = generate_merges(vocab, vocab_scores)
return vocab, merges
```

The tokenizer model is then built from the extracted vocab and merges:

```python
from tokenizers.models import BPE, Unigram

vocab = [(piece.piece, piece.score) for piece in self.proto.pieces]
if model_type is None:
    model_type = Unigram if self.proto.trainer_spec.model_type == 2 else BPE
if model_type.__name__ != "BPE":
    kwargs["unk_id"] = self.proto.trainer_spec.unk_id
    kwargs["vocab"] = vocab
else:
    from .tokenization_utils_base import generate_merges

    vocab = {word: i for i, (word, score) in enumerate(vocab)}
    merges = generate_merges(vocab)
    kwargs["vocab"] = vocab
    kwargs["merges"] = merges

# control tokens are special, user defined symbols are not,
# but both user and control tokens are AddedTokens
# Add user defined symbols (type == 4) from sentencepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33)
spm_added_tokens = [(id, p.piece, p.type == 3) for id, p in enumerate(self.proto.pieces) if p.type in [3, 4]]
```