Bump dev version

ROCm: Fix int32 overflow for blocksize quantization (#1796)
Drop Python 3.9 support (#1795)
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.11.2
    rev: v0.14.3
    hooks:
      - id: ruff
        args:
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
 ## System Requirements
 bitsandbytes has the following minimum requirements for all platforms:

 * Python 3.9+
 * Python 3.10+
 * [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
  * _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._

--- a/benchmarking/matmul_benchmark.py
+++ b/benchmarking/matmul_benchmark.py
@@ -35,8 +35,8 @@ def test_bench_matmul(batch, seq, model, hidden):
    B = torch.empty(hidden, model, dtype=torch.float16, device="cuda")
    torch.nn.init.xavier_uniform_(B)

    B_fp4, state = F.quantize_fp4(B)
    B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True)
    _B_fp4, _state = F.quantize_fp4(B)
    _B_fp4_c, _state_c = F.quantize_fp4(B, compress_statistics=True)

    B_nf4, state_nf4 = F.quantize_nf4(B)
    B_nf4_c, state_nf4_c = F.quantize_nf4(B, compress_statistics=True)
@@ -117,8 +117,8 @@ def test_bench_matmul(batch, seq, model, hidden):
        f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
    )

    CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
    CB, SCB, _ = F.int8_vectorwise_quant(B)
    CA, _SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
    CB, _SCB, _ = F.int8_vectorwise_quant(B)
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(iters):
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -54,10 +54,7 @@ def _import_backends():
    """
    from importlib.metadata import entry_points

    if sys.version_info < (3, 10):
        extensions = entry_points().get("bitsandbytes.backends", [])
    else:
        extensions = entry_points(group="bitsandbytes.backends")
    extensions = entry_points(group="bitsandbytes.backends")

    for ext in extensions:
        try:
@@ -75,4 +72,4 @@ __pdoc__ = {
    "optim.optimizer.MockArgs": False,
 }

 __version__ = "0.48.3.dev0"
 __version__ = "0.49.0.dev0"
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -1,6 +1,7 @@
 from collections.abc import Callable
 from dataclasses import dataclass
 from math import prod
 from typing import Callable, Optional
 from typing import Optional
 import warnings
 from warnings import warn

@@ -257,7 +258,7 @@ class MatMul8bitLt(torch.autograd.Function):
            return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None

        req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
        CAt, subA, A = ctx.tensors
        CAt, subA, _A = ctx.tensors
        SCAt, idx = ctx.tensor_states
        state: MatmulLtState = ctx.state
        grad_A = grad_B = grad_bias = None
--- a/bitsandbytes/backends/utils.py
+++ b/bitsandbytes/backends/utils.py
@@ -4,9 +4,10 @@ from packaging import version
 import torch

 try:
    import triton  # noqa: F401
    import triton.language as tl  # noqa: F401

    import triton  # noqa: F401

    triton_available = True
 except ImportError:
    triton_available = False
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -6,7 +6,7 @@ from collections.abc import Iterable
 import ctypes as ct
 import itertools
 from math import prod
 from typing import Any, Optional, Union
 from typing import Any, Optional

 import numpy as np
 import torch
@@ -1413,7 +1413,7 @@ def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile:
            raise ValueError(f"Gradient type {grad.dtype} not supported!")

    current_gnorm = torch.sqrt(gnorm_vec[step % 100])
    vals, idx = torch.sort(gnorm_vec)
    vals, _ = torch.sort(gnorm_vec)
    clip_value = torch.sqrt(vals[percentile])
    gnorm_scale = 1.0

@@ -2059,7 +2059,7 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):


 def spmm_coo(
    cooA: Union[COOSparseTensor, torch.Tensor],
    cooA: COOSparseTensor | torch.Tensor,
    B: torch.Tensor,
    out: Optional[torch.Tensor] = None,
 ):
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -310,28 +310,28 @@ class Params4bit(torch.nn.Parameter):
    def cpu(self):
        return self.to(device="cpu")

    def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
    def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
        return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)

    def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
    def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
        return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)

    @overload
    def to(
        self: T,
        device: Optional[Union[int, device]] = ...,
        dtype: Optional[Union[dtype, str]] = ...,
        device: Optional[int | device] = ...,
        dtype: Optional[dtype | str] = ...,
        non_blocking: bool = ...,
    ) -> T: ...

    @overload
    def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
    def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...

    @overload
    def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...

    def to(self, *args, **kwargs):
        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
        device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)

        if device is not None and device.type != "meta" and not self.bnb_quantized:
            return self._quantize(device)
@@ -644,10 +644,10 @@ class Int8Params(torch.nn.Parameter):
    def cpu(self):
        return self.to(device="cpu")

    def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
    def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
        return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)

    def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
    def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
        return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)

    def __deepcopy__(self, memo):
@@ -665,19 +665,19 @@ class Int8Params(torch.nn.Parameter):
    @overload
    def to(
        self: T,
        device: Optional[Union[int, device]] = ...,
        dtype: Optional[Union[dtype, str]] = ...,
        device: Optional[int | device] = ...,
        dtype: Optional[dtype | str] = ...,
        non_blocking: bool = ...,
    ) -> T: ...

    @overload
    def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
    def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...

    @overload
    def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...

    def to(self, *args, **kwargs):
        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
        device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)

        is_quantized = self.data.dtype == torch.int8

@@ -1048,7 +1048,7 @@ class Linear8bitLt(nn.Linear):
        # Call the parent to() method to handle standard parameter/buffer movement
        result = super().to(*args, **kwargs)

        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
        device, _, _, _ = torch._C._nn._parse_to(*args, **kwargs)

        # Handle state tensors if needed.
        if device is not None:
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -507,7 +507,7 @@ class Optimizer2State(Optimizer8bit):
        step = state["step"]

        if config["percentile_clipping"] < 100:
            current_gnorm, clip_value, gnorm_scale = F.percentile_clipping(
            _current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping(
                grad,
                state["gnorm_vec"],
                step,
@@ -725,7 +725,7 @@ class Optimizer1State(Optimizer8bit):
        step = state["step"]

        if config["percentile_clipping"] < 100:
            current_gnorm, clip_value, gnorm_scale = F.percentile_clipping(
            _current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping(
                grad,
                state["gnorm_vec"],
                step,
--- a/bitsandbytes/research/autograd/_functions.py
+++ b/bitsandbytes/research/autograd/_functions.py
@@ -307,8 +307,8 @@ class SwitchBackBnb(torch.autograd.Function):
            return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None

        req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
        CAt, subA, A = ctx.tensors
        SCAt, idx = ctx.tensor_states
        _CAt, _subA, A = ctx.tensors
        _SCAt, _idx = ctx.tensor_states
        state = ctx.state
        grad_A = grad_B = grad_bias = None

@@ -320,7 +320,7 @@ class SwitchBackBnb(torch.autograd.Function):
        if len(grad_output.shape) == 3:
            grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()

        Cgrad, Cgradt, SCgrad, SCgradt, outlier_cols = F.int8_double_quant(grad_output.to(torch.float16))
        _Cgrad, _Cgradt, _SCgrad, _SCgradt, _outlier_cols = F.int8_double_quant(grad_output.to(torch.float16))

        if req_gradB:
            # print('back A shape', A.shape)
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -91,7 +91,7 @@ def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False)
    zstd = (std - stdm) / stdstd

    if topk is not None:
        val, idx = torch.topk(std.abs(), k=topk, dim=0)
        _, idx = torch.topk(std.abs(), k=topk, dim=0)
    else:
        idx = torch.where(zstd > zscore)[0]

--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -348,16 +348,17 @@ template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TY
 //__launch_bounds__(TH, 4)
 __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n)
 {
  const int n_full = gridDim.x * BLOCK_SIZE;
  int valid_items = 0;
  const int base_idx = (blockIdx.x * BLOCK_SIZE);
    // This can overflow, so we clamp to INT32_MAX. We won't have more elements than this.
    const int n_full = min(gridDim.x * BLOCK_SIZE, INT32_MAX);
    int valid_items = 0;
    const int base_idx = blockIdx.x * BLOCK_SIZE;

    T vals[NUM_PER_TH];
    float rand_vals[NUM_PER_TH];
    unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH / 2 : NUM_PER_TH];

  T vals[NUM_PER_TH];
  float rand_vals[NUM_PER_TH];
  unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH];
  //float local_abs_max = -FLT_MAX;
  float local_abs_max = 0.0f;
  int local_rand_idx = 0;
    float local_abs_max = 0.0f;
    int local_rand_idx = 0;

  typedef hipcub::BlockLoad<T, BLOCK_SIZE/NUM_PER_TH, NUM_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
  typedef hipcub::BlockStore<unsigned char, BLOCK_SIZE/NUM_PER_TH, (DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
@@ -375,9 +376,9 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
    for(int i = threadIdx.x; i < 256; i+=blockDim.x)
      smem_code[i] = code[i];

  for (int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
  {
    valid_items = n - i > BLOCK_SIZE ? BLOCK_SIZE : n - i;

  for (int64_t i = base_idx; i < n_full; i += gridDim.x * BLOCK_SIZE) {
    valid_items = min(BLOCK_SIZE, static_cast<int>(n - i));
    local_abs_max = -FLT_MAX;

    __syncthreads();
@@ -465,7 +466,8 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs
  {
    if (DATA_TYPE > 0)
    {
      valid_items_load = min(TILE_SIZE, (n + 1) / 2 - i);
      // Cast n to int64_t to avoid overflow for large n
      valid_items_load = min(TILE_SIZE, static_cast<int>((static_cast<int64_t>(n) + 1) / 2) - i);
      valid_items_store = min(TILE_SIZE * 2, n - i * 2);
    }
    else
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -34,7 +34,7 @@ void quantize(float *code, float *A, unsigned char *out, int n)
 {
  int num_blocks = n/1024;
  num_blocks = n % 1024 == 0 ? num_blocks : num_blocks + 1;
 hipLaunchKernelGGL(( kQuantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
  hipLaunchKernelGGL(( kQuantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
  CUDA_CHECK_RETURN(hipPeekAtLastError());
 }

@@ -72,21 +72,21 @@ template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(floa

 template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n, hipStream_t stream)
 {
  int num_blocks = n/blocksize;
  num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
  int tile_size = (DATA_TYPE > 0) ? 1024 : 512;

  // Upcast to int64 to avoid overflow for large n
  int grid_blocks = ((int64_t)n + tile_size - 1) / tile_size;

  if(DATA_TYPE > 0)
   hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize/2, n);
    hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3(grid_blocks), dim3(64), 0, stream, code, A, absmax, out, blocksize / 2, n);
  else
   hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize, n);
    hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3(grid_blocks), dim3(64), 0, stream, code, A, absmax, out, blocksize, n);

  CUDA_CHECK_RETURN(hipPeekAtLastError());
 }




 template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
                float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
                const float beta1, const float beta2, const float beta3, const float alpha,
@@ -102,10 +102,10 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
      if(max_unorm > 0.0f)
 			{
 				CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
       hipLaunchKernelGGL(( kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
        hipLaunchKernelGGL(( kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
        CUDA_CHECK_RETURN(hipPeekAtLastError());
      }
 		hipLaunchKernelGGL((	kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
 		  hipLaunchKernelGGL((	kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
      CUDA_CHECK_RETURN(hipPeekAtLastError());
 			break;
 		case MOMENTUM:
@@ -114,22 +114,22 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
      if(max_unorm > 0.0f)
 			{
 				CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
 			hipLaunchKernelGGL((	kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
 			  hipLaunchKernelGGL((	kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
        CUDA_CHECK_RETURN(hipPeekAtLastError());
 			}

 		hipLaunchKernelGGL((	kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
 		  hipLaunchKernelGGL((	kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
      CUDA_CHECK_RETURN(hipPeekAtLastError());
 			break;
    case LION:
      // in lion, the momentum update after the parameter update
     hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
      hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
      CUDA_CHECK_RETURN(hipPeekAtLastError());

      if(max_unorm > 0.0f)
      {
        CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
       hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
        hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
        CUDA_CHECK_RETURN(hipPeekAtLastError());
      }
      break;
@@ -156,9 +156,9 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
 		case ADAM:
 			CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
 			CUDA_CHECK_RETURN(hipMemset(new_max2, 0, 1*sizeof(float)));
 		hipLaunchKernelGGL((	kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
 		  hipLaunchKernelGGL((	kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
 			CUDA_CHECK_RETURN(hipPeekAtLastError());
 		hipLaunchKernelGGL((	kOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
 		  hipLaunchKernelGGL((	kOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
 																														quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
 			CUDA_CHECK_RETURN(hipPeekAtLastError());
 		break;
@@ -166,20 +166,20 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
    case RMSPROP:
    case ADAGRAD:
 			CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
 		hipLaunchKernelGGL((	kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
 		  hipLaunchKernelGGL((	kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
 			CUDA_CHECK_RETURN(hipPeekAtLastError());
 		hipLaunchKernelGGL((	kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
 		  hipLaunchKernelGGL((	kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
 																														quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
 			CUDA_CHECK_RETURN(hipPeekAtLastError());
 			break;
    case LION:
      // in lion, the momentum update happens after the parameter update
     hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
      hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
                                                            quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
      CUDA_CHECK_RETURN(hipPeekAtLastError());

      CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
     hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
      hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
      CUDA_CHECK_RETURN(hipPeekAtLastError());
      break;
 		default:
@@ -221,7 +221,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(
    case ADEMAMIX:
 			num_blocks = n/BLOCKSIZE_2STATE;
 			num_blocks = n % BLOCKSIZE_2STATE == 0 ? num_blocks : num_blocks + 1;
 		hipLaunchKernelGGL((	kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr,
 		  hipLaunchKernelGGL((	kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr,
 																														quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
 			CUDA_CHECK_RETURN(hipPeekAtLastError());
 		break;
@@ -231,7 +231,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(
    case LION:
 			num_blocks = n/BLOCKSIZE_1STATE;
 			num_blocks = n % BLOCKSIZE_1STATE == 0 ? num_blocks : num_blocks + 1;
 		hipLaunchKernelGGL((	kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3(num_blocks), dim3(BLOCKSIZE_1STATE/NUM_1STATE), 0, 0, p, g, state1, beta1, beta2, eps, step, lr,
 		  hipLaunchKernelGGL((	kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3(num_blocks), dim3(BLOCKSIZE_1STATE/NUM_1STATE), 0, 0, p, g, state1, beta1, beta2, eps, step, lr,
 																														quantiles1, absmax1, weight_decay, gnorm_scale, skip_zeros, n);
 			CUDA_CHECK_RETURN(hipPeekAtLastError());
 		break;
@@ -245,7 +245,7 @@ template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step,
  int num_blocks = n/2048;
  num_blocks = n % 2048 == 0 ? num_blocks : num_blocks + 1;
 	CUDA_CHECK_RETURN(hipMemset(&gnorm_vec[step % 100], 0, 1*sizeof(float)));
 hipLaunchKernelGGL(( kPercentileClipping<T, 2048, 4>), dim3(num_blocks), dim3(512), 0, 0, g, gnorm_vec, step, n);
  hipLaunchKernelGGL(( kPercentileClipping<T, 2048, 4>), dim3(num_blocks), dim3(512), 0, 0, g, gnorm_vec, step, n);
  CUDA_CHECK_RETURN(hipPeekAtLastError());
 }

@@ -669,7 +669,7 @@ void spmm_coo(hipsparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_va
 template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
 {

 hipLaunchKernelGGL(( kspmm_coo_very_sparse_naive<T, 8, BITS>), dim3(nnz_rows), dim3(256), 0, 0, max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
  hipLaunchKernelGGL(( kspmm_coo_very_sparse_naive<T, 8, BITS>), dim3(nnz_rows), dim3(256), 0, 0, max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
  CUDA_CHECK_RETURN(hipPeekAtLastError());
 }

@@ -679,9 +679,9 @@ template <typename T> void gemm_host(int m, int n, int k, T * A,  T* B,  T * out
 	int num_blocks = (m+31)/32;

  if(bits == 32)
   hipLaunchKernelGGL(( gemm_device<T, 32, 32>),  dim3(num_blocks), dim3(32), 0, 0, m,  n,  k, A,  B,  out, lda, ldb, ldc);
    hipLaunchKernelGGL(( gemm_device<T, 32, 32>),  dim3(num_blocks), dim3(32), 0, 0, m,  n,  k, A,  B,  out, lda, ldb, ldc);
  if(bits == 16)
   hipLaunchKernelGGL(( gemm_device<T, 16, 160>),  dim3(num_blocks), dim3(160), 0, 0, m,  n,  k, A,  B,  out, lda, ldb, ldc);
    hipLaunchKernelGGL(( gemm_device<T, 16, 160>),  dim3(num_blocks), dim3(160), 0, 0, m,  n,  k, A,  B,  out, lda, ldb, ldc);
 }

 template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize)
@@ -689,7 +689,7 @@ template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A,  unsi

 	int num_blocks = (m+31)/32;

 hipLaunchKernelGGL(( kgemm_4bit_inference<T, 96>),  dim3(num_blocks), dim3(96), 0, 0, m,  n,  k, A,  B, absmax, out, lda, ldb, ldc, blocksize);
  hipLaunchKernelGGL(( kgemm_4bit_inference<T, 96>),  dim3(num_blocks), dim3(96), 0, 0, m,  n,  k, A,  B, absmax, out, lda, ldb, ldc, blocksize);
 }

 template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize, hipStream_t stream)
@@ -712,7 +712,7 @@ template <typename T, int FUNC> void func(T *A, T *B, T value, long n)
  int blocks = n/threads;
  blocks = n % threads == 0 ? blocks : blocks + 1;
  blocks = blocks > 65535 ? 65535 : blocks;
 hipLaunchKernelGGL(( kfunc<T, FUNC>), dim3(blocks), dim3(512), 0, 0, A, B, value, n);
  hipLaunchKernelGGL(( kfunc<T, FUNC>), dim3(blocks), dim3(512), 0, 0, A, B, value, n);
  CUDA_CHECK_RETURN(hipPeekAtLastError());
 }

--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -25,7 +25,7 @@ additional platforms such as AMD ROCm.

 These are the minimum requirements for `bitsandbytes` across all platforms. Please be aware that some compute platforms may impose more strict requirements.

 * Python >= 3.9
 * Python >= 3.10
 * PyTorch >= 2.3

 ## NVIDIA CUDA[[cuda]]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ maintainers = [
    {name="Titus von Köller", email="titus@huggingface.co"},
    {name="Matthew Douglas", email="matthew.douglas@huggingface.co"}
 ]
 requires-python = ">=3.9"
 requires-python = ">=3.10"
 readme = "README.md"
 license = "MIT"
 license-files = ["LICENSE"]
@@ -35,11 +35,11 @@ classifiers = [
    "Operating System :: Microsoft :: Windows",
    "Programming Language :: C++",
    "Programming Language :: Python :: Implementation :: CPython",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
    "Topic :: Scientific/Engineering :: Artificial Intelligence"
 ]
 dependencies = [
@@ -60,7 +60,7 @@ docs = ["hf-doc-builder==0.5.0"]
 dev = [
    "bitsandbytes[test]",
    "build>=1.0.0,<2",
    "ruff==0.11.2",
    "ruff~=0.14.3",
    "pre-commit>=3.5.0,<4",
    "wheel>=0.42,<1"
 ]
@@ -108,7 +108,7 @@ src = [
    "tests",
    "benchmarking"
 ]
 target-version = "py39"
 target-version = "py310"
 line-length = 119

 [tool.ruff.lint]
@@ -125,13 +125,14 @@ select = [
 ignore = [
    "B007",  # Loop control variable not used within the loop body (TODO: enable)
    "B028",  # Warning without stacklevel (TODO: enable)
    "B905",  # zip without explicit `strict=` kwarg
    "E501",  # Suppress line-too-long warnings: trust yapf's judgement on this one.
    "E701",  # Multiple statements on one line (TODO: enable)
    "E712",  # Allow using if x == False, as it's not always equivalent to if x.
    "E731",  # Do not use lambda
    "RUF012",  # Mutable class attribute annotations
    "RUF034", # Useless if-else (TODO: enable)
    "ISC001",   # single-line-implicit-string-concatenation incompatible with formatter
    "RUF012",# Mutable class attribute annotations
    "RUF034",# Useless if-else (TODO: enable)
    "UP045", # Use `X | None` instead of `Optional[X]`
 ]

 [tool.ruff.lint.extend-per-file-ignores]
@@ -145,6 +146,9 @@ ignore = [
    "F841",
    "UP030",
 ]
 "bitsandbytes/**/triton/**/*.py" = [
    "I001",  # import order
 ]

 [tool.ruff.lint.isort]
 combine-as-imports = true
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,7 @@ class ExtBuildPy(build_py):


 setup(
    version="0.48.3.dev0",
    version="0.49.0.dev0",
    packages=find_packages(),
    distclass=BinaryDistribution,
    cmake_source_dir=".",
--- a/tests/test_deprecated.py
+++ b/tests/test_deprecated.py
@@ -52,7 +52,7 @@ def test_percentile_clipping(gtype):
        else:
            gnorm_vec1[step % 100] = gnorm2

        vals, idx = torch.sort(gnorm_vec1)
        vals, _ = torch.sort(gnorm_vec1)
        clip1 = vals[percentile]

        torch.testing.assert_close(gnorm_vec1, torch.sqrt(gnorm_vec2))
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -312,7 +312,7 @@ class Test8BitBlockwiseQuantizeFunctional:
    def test_bench_dequantization(self):
        a = torch.rand(1024, 1024, device="cuda").half()
        code = F.create_fp8_map(True, 3, 0, 4).cuda()
        qa, SA = F.quantize_blockwise(a, code=code)
        qa, _SA = F.quantize_blockwise(a, code=code)
        print(qa.max())

        max_theoretical_mu = 1024 * 1024 * 2 / 1024**3 / 672 * 1000 * 1000
@@ -321,7 +321,7 @@ class Test8BitBlockwiseQuantizeFunctional:
        torch.cuda.synchronize()
        t0 = time.time()
        for i in range(100):
            qa, SA = F.quantize_blockwise(a)
            qa, _SA = F.quantize_blockwise(a)
        torch.cuda.synchronize()
        # print((time.time()-t0)/1e6)

@@ -1004,7 +1004,7 @@ class TestSpMMFunctional:
        torch.nn.init.xavier_uniform_(B)
        Bt = B.t().contiguous()

        CB, CBt, statsB, statsBt, coo_tensor = F.int8_double_quant(B)
        _CB, CBt, _statsB, statsBt, _coo_tensor = F.int8_double_quant(B)

        rowidx = torch.randint(0, A.shape[-1], size=(15,))

@@ -1023,7 +1023,7 @@ class TestSpMMFunctional:

        values, counts = torch.unique(cooA.rowidx, return_counts=True)
        offset = counts.cumsum(0).int()
        max_count, max_idx = torch.sort(counts, descending=True)
        max_count, _ = torch.sort(counts, descending=True)
        print(torch.median(max_count.float()))

        torch.testing.assert_close(out2, out3, rtol=0.05, atol=0.001)
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -496,7 +496,7 @@ def test_adam_percentile_clipping(requires_cuda, dim1, dim2, gtype, optim_bits):
        g2 = g1.clone()
        p2.grad = g2

        current_gnorm, clip_val, gnorm_scale = F.percentile_clipping(g1, gnorm_vec, step, 5)
        _current_gnorm, _clip_val, gnorm_scale = F.percentile_clipping(g1, gnorm_vec, step, 5)
        g1 = (g1.float() * gnorm_scale).to(gtype)
        p1.grad = g1

--- a/tests/test_parametrize.py
+++ b/tests/test_parametrize.py
@@ -246,14 +246,14 @@ def test_error_conditions():
        replace_parameter_4bit(module, "nonexistent")

    # Test TypeError for non-Parameter attribute
    with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn.Parameter"):
    with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn\\.Parameter"):
        replace_parameter_4bit(module, "not_param")

    # Test same errors for prequantized version
    with pytest.raises(AttributeError, match="Module does not have parameter 'nonexistent'"):
        replace_parameter_4bit_prequantized(module, "nonexistent", {}, torch.device("cpu"))

    with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn.Parameter"):
    with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn\\.Parameter"):
        replace_parameter_4bit_prequantized(module, "not_param", {}, torch.device("cpu"))
Author	SHA1	Message	Date
Matthew Douglas	45553f7392	Bump dev version	1 month ago
Strahinja Stamenkovic	1920972130	ROCm: Fix int32 overflow for blocksize quantization (#1796)	1 month ago
Matthew Douglas	45dcd4d37a	Drop Python 3.9 support (#1795)	1 month ago