3 Commits

Author SHA1 Message Date
  Matthew Douglas 45553f7392 Bump dev version 1 month ago
  Strahinja Stamenkovic 1920972130
ROCm: Fix int32 overflow for blocksize quantization (#1796) 1 month ago
  Matthew Douglas 45dcd4d37a
Drop Python 3.9 support (#1795) 1 month ago
20 changed files with 97 additions and 92 deletions
Split View
  1. +1
    -1
      .pre-commit-config.yaml
  2. +1
    -1
      README.md
  3. +4
    -4
      benchmarking/matmul_benchmark.py
  4. +2
    -5
      bitsandbytes/__init__.py
  5. +3
    -2
      bitsandbytes/autograd/_functions.py
  6. +2
    -1
      bitsandbytes/backends/utils.py
  7. +3
    -3
      bitsandbytes/functional.py
  8. +13
    -13
      bitsandbytes/nn/modules.py
  9. +2
    -2
      bitsandbytes/optim/optimizer.py
  10. +3
    -3
      bitsandbytes/research/autograd/_functions.py
  11. +1
    -1
      bitsandbytes/utils.py
  12. +15
    -13
      csrc/kernels.hip
  13. +26
    -26
      csrc/ops.hip
  14. +1
    -1
      docs/source/installation.mdx
  15. +11
    -7
      pyproject.toml
  16. +1
    -1
      setup.py
  17. +1
    -1
      tests/test_deprecated.py
  18. +4
    -4
      tests/test_functional.py
  19. +1
    -1
      tests/test_optim.py
  20. +2
    -2
      tests/test_parametrize.py

+ 1
- 1
.pre-commit-config.yaml View File

@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.2
rev: v0.14.3
hooks:
- id: ruff
args:


+ 1
- 1
README.md View File

@@ -19,7 +19,7 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
## System Requirements
bitsandbytes has the following minimum requirements for all platforms:

* Python 3.9+
* Python 3.10+
* [PyTorch](https://pytorch.org/get-started/locally/) 2.3+
* _Note: While we aim to provide wide backwards compatibility, we recommend using the latest version of PyTorch for the best experience._



+ 4
- 4
benchmarking/matmul_benchmark.py View File

@@ -35,8 +35,8 @@ def test_bench_matmul(batch, seq, model, hidden):
B = torch.empty(hidden, model, dtype=torch.float16, device="cuda")
torch.nn.init.xavier_uniform_(B)

B_fp4, state = F.quantize_fp4(B)
B_fp4_c, state_c = F.quantize_fp4(B, compress_statistics=True)
_B_fp4, _state = F.quantize_fp4(B)
_B_fp4_c, _state_c = F.quantize_fp4(B, compress_statistics=True)

B_nf4, state_nf4 = F.quantize_nf4(B)
B_nf4_c, state_nf4_c = F.quantize_nf4(B, compress_statistics=True)
@@ -117,8 +117,8 @@ def test_bench_matmul(batch, seq, model, hidden):
f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
)

CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
CB, SCB, _ = F.int8_vectorwise_quant(B)
CA, _SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
CB, _SCB, _ = F.int8_vectorwise_quant(B)
torch.cuda.synchronize()
t0 = time.time()
for i in range(iters):


+ 2
- 5
bitsandbytes/__init__.py View File

@@ -54,10 +54,7 @@ def _import_backends():
"""
from importlib.metadata import entry_points

if sys.version_info < (3, 10):
extensions = entry_points().get("bitsandbytes.backends", [])
else:
extensions = entry_points(group="bitsandbytes.backends")
extensions = entry_points(group="bitsandbytes.backends")

for ext in extensions:
try:
@@ -75,4 +72,4 @@ __pdoc__ = {
"optim.optimizer.MockArgs": False,
}

__version__ = "0.48.3.dev0"
__version__ = "0.49.0.dev0"

+ 3
- 2
bitsandbytes/autograd/_functions.py View File

@@ -1,6 +1,7 @@
from collections.abc import Callable
from dataclasses import dataclass
from math import prod
from typing import Callable, Optional
from typing import Optional
import warnings
from warnings import warn

@@ -257,7 +258,7 @@ class MatMul8bitLt(torch.autograd.Function):
return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None

req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
CAt, subA, A = ctx.tensors
CAt, subA, _A = ctx.tensors
SCAt, idx = ctx.tensor_states
state: MatmulLtState = ctx.state
grad_A = grad_B = grad_bias = None


+ 2
- 1
bitsandbytes/backends/utils.py View File

@@ -4,9 +4,10 @@ from packaging import version
import torch

try:
import triton # noqa: F401
import triton.language as tl # noqa: F401

import triton # noqa: F401

triton_available = True
except ImportError:
triton_available = False


+ 3
- 3
bitsandbytes/functional.py View File

@@ -6,7 +6,7 @@ from collections.abc import Iterable
import ctypes as ct
import itertools
from math import prod
from typing import Any, Optional, Union
from typing import Any, Optional

import numpy as np
import torch
@@ -1413,7 +1413,7 @@ def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile:
raise ValueError(f"Gradient type {grad.dtype} not supported!")

current_gnorm = torch.sqrt(gnorm_vec[step % 100])
vals, idx = torch.sort(gnorm_vec)
vals, _ = torch.sort(gnorm_vec)
clip_value = torch.sqrt(vals[percentile])
gnorm_scale = 1.0

@@ -2059,7 +2059,7 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):


def spmm_coo(
cooA: Union[COOSparseTensor, torch.Tensor],
cooA: COOSparseTensor | torch.Tensor,
B: torch.Tensor,
out: Optional[torch.Tensor] = None,
):


+ 13
- 13
bitsandbytes/nn/modules.py View File

@@ -310,28 +310,28 @@ class Params4bit(torch.nn.Parameter):
def cpu(self):
return self.to(device="cpu")

def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)

def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)

@overload
def to(
self: T,
device: Optional[Union[int, device]] = ...,
dtype: Optional[Union[dtype, str]] = ...,
device: Optional[int | device] = ...,
dtype: Optional[dtype | str] = ...,
non_blocking: bool = ...,
) -> T: ...

@overload
def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...

@overload
def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...

def to(self, *args, **kwargs):
device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)

if device is not None and device.type != "meta" and not self.bnb_quantized:
return self._quantize(device)
@@ -644,10 +644,10 @@ class Int8Params(torch.nn.Parameter):
def cpu(self):
return self.to(device="cpu")

def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)

def xpu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
return self.to(device="xpu" if device is None else device, non_blocking=non_blocking)

def __deepcopy__(self, memo):
@@ -665,19 +665,19 @@ class Int8Params(torch.nn.Parameter):
@overload
def to(
self: T,
device: Optional[Union[int, device]] = ...,
dtype: Optional[Union[dtype, str]] = ...,
device: Optional[int | device] = ...,
dtype: Optional[dtype | str] = ...,
non_blocking: bool = ...,
) -> T: ...

@overload
def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ...
def to(self: T, dtype: dtype | str, non_blocking: bool = ...) -> T: ...

@overload
def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...

def to(self, *args, **kwargs):
device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
device, dtype, non_blocking, _ = torch._C._nn._parse_to(*args, **kwargs)

is_quantized = self.data.dtype == torch.int8

@@ -1048,7 +1048,7 @@ class Linear8bitLt(nn.Linear):
# Call the parent to() method to handle standard parameter/buffer movement
result = super().to(*args, **kwargs)

device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
device, _, _, _ = torch._C._nn._parse_to(*args, **kwargs)

# Handle state tensors if needed.
if device is not None:


+ 2
- 2
bitsandbytes/optim/optimizer.py View File

@@ -507,7 +507,7 @@ class Optimizer2State(Optimizer8bit):
step = state["step"]

if config["percentile_clipping"] < 100:
current_gnorm, clip_value, gnorm_scale = F.percentile_clipping(
_current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping(
grad,
state["gnorm_vec"],
step,
@@ -725,7 +725,7 @@ class Optimizer1State(Optimizer8bit):
step = state["step"]

if config["percentile_clipping"] < 100:
current_gnorm, clip_value, gnorm_scale = F.percentile_clipping(
_current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping(
grad,
state["gnorm_vec"],
step,


+ 3
- 3
bitsandbytes/research/autograd/_functions.py View File

@@ -307,8 +307,8 @@ class SwitchBackBnb(torch.autograd.Function):
return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None

req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
CAt, subA, A = ctx.tensors
SCAt, idx = ctx.tensor_states
_CAt, _subA, A = ctx.tensors
_SCAt, _idx = ctx.tensor_states
state = ctx.state
grad_A = grad_B = grad_bias = None

@@ -320,7 +320,7 @@ class SwitchBackBnb(torch.autograd.Function):
if len(grad_output.shape) == 3:
grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()

Cgrad, Cgradt, SCgrad, SCgradt, outlier_cols = F.int8_double_quant(grad_output.to(torch.float16))
_Cgrad, _Cgradt, _SCgrad, _SCgradt, _outlier_cols = F.int8_double_quant(grad_output.to(torch.float16))

if req_gradB:
# print('back A shape', A.shape)


+ 1
- 1
bitsandbytes/utils.py View File

@@ -91,7 +91,7 @@ def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False)
zstd = (std - stdm) / stdstd

if topk is not None:
val, idx = torch.topk(std.abs(), k=topk, dim=0)
_, idx = torch.topk(std.abs(), k=topk, dim=0)
else:
idx = torch.where(zstd > zscore)[0]



+ 15
- 13
csrc/kernels.hip View File

@@ -348,16 +348,17 @@ template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TY
//__launch_bounds__(TH, 4)
__global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n)
{
const int n_full = gridDim.x * BLOCK_SIZE;
int valid_items = 0;
const int base_idx = (blockIdx.x * BLOCK_SIZE);
// This can overflow, so we clamp to INT32_MAX. We won't have more elements than this.
const int n_full = min(gridDim.x * BLOCK_SIZE, INT32_MAX);
int valid_items = 0;
const int base_idx = blockIdx.x * BLOCK_SIZE;

T vals[NUM_PER_TH];
float rand_vals[NUM_PER_TH];
unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH / 2 : NUM_PER_TH];

T vals[NUM_PER_TH];
float rand_vals[NUM_PER_TH];
unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH];
//float local_abs_max = -FLT_MAX;
float local_abs_max = 0.0f;
int local_rand_idx = 0;
float local_abs_max = 0.0f;
int local_rand_idx = 0;

typedef hipcub::BlockLoad<T, BLOCK_SIZE/NUM_PER_TH, NUM_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
typedef hipcub::BlockStore<unsigned char, BLOCK_SIZE/NUM_PER_TH, (DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
@@ -375,9 +376,9 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
for(int i = threadIdx.x; i < 256; i+=blockDim.x)
smem_code[i] = code[i];

for (int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
{
valid_items = n - i > BLOCK_SIZE ? BLOCK_SIZE : n - i;
for (int64_t i = base_idx; i < n_full; i += gridDim.x * BLOCK_SIZE) {
valid_items = min(BLOCK_SIZE, static_cast<int>(n - i));
local_abs_max = -FLT_MAX;

__syncthreads();
@@ -465,7 +466,8 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs
{
if (DATA_TYPE > 0)
{
valid_items_load = min(TILE_SIZE, (n + 1) / 2 - i);
// Cast n to int64_t to avoid overflow for large n
valid_items_load = min(TILE_SIZE, static_cast<int>((static_cast<int64_t>(n) + 1) / 2) - i);
valid_items_store = min(TILE_SIZE * 2, n - i * 2);
}
else


+ 26
- 26
csrc/ops.hip View File

@@ -34,7 +34,7 @@ void quantize(float *code, float *A, unsigned char *out, int n)
{
int num_blocks = n/1024;
num_blocks = n % 1024 == 0 ? num_blocks : num_blocks + 1;
hipLaunchKernelGGL(( kQuantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
hipLaunchKernelGGL(( kQuantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
}

@@ -72,21 +72,21 @@ template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(floa

template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n, hipStream_t stream)
{
int num_blocks = n/blocksize;
num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
int tile_size = (DATA_TYPE > 0) ? 1024 : 512;

// Upcast to int64 to avoid overflow for large n
int grid_blocks = ((int64_t)n + tile_size - 1) / tile_size;

if(DATA_TYPE > 0)
hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize/2, n);
hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3(grid_blocks), dim3(64), 0, stream, code, A, absmax, out, blocksize / 2, n);
else
hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize, n);
hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3(grid_blocks), dim3(64), 0, stream, code, A, absmax, out, blocksize, n);

CUDA_CHECK_RETURN(hipPeekAtLastError());
}




template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
const float beta1, const float beta2, const float beta3, const float alpha,
@@ -102,10 +102,10 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
if(max_unorm > 0.0f)
{
CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
hipLaunchKernelGGL(( kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
hipLaunchKernelGGL(( kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
}
hipLaunchKernelGGL(( kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
hipLaunchKernelGGL(( kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
break;
case MOMENTUM:
@@ -114,22 +114,22 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
if(max_unorm > 0.0f)
{
CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
}

hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
break;
case LION:
// in lion, the momentum update after the parameter update
hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());

if(max_unorm > 0.0f)
{
CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
}
break;
@@ -156,9 +156,9 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
case ADAM:
CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
CUDA_CHECK_RETURN(hipMemset(new_max2, 0, 1*sizeof(float)));
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
hipLaunchKernelGGL(( kOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
hipLaunchKernelGGL(( kOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
break;
@@ -166,20 +166,20 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
case RMSPROP:
case ADAGRAD:
CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
break;
case LION:
// in lion, the momentum update happens after the parameter update
hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());

CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
break;
default:
@@ -221,7 +221,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(
case ADEMAMIX:
num_blocks = n/BLOCKSIZE_2STATE;
num_blocks = n % BLOCKSIZE_2STATE == 0 ? num_blocks : num_blocks + 1;
hipLaunchKernelGGL(( kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr,
hipLaunchKernelGGL(( kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr,
quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
break;
@@ -231,7 +231,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(
case LION:
num_blocks = n/BLOCKSIZE_1STATE;
num_blocks = n % BLOCKSIZE_1STATE == 0 ? num_blocks : num_blocks + 1;
hipLaunchKernelGGL(( kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3(num_blocks), dim3(BLOCKSIZE_1STATE/NUM_1STATE), 0, 0, p, g, state1, beta1, beta2, eps, step, lr,
hipLaunchKernelGGL(( kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3(num_blocks), dim3(BLOCKSIZE_1STATE/NUM_1STATE), 0, 0, p, g, state1, beta1, beta2, eps, step, lr,
quantiles1, absmax1, weight_decay, gnorm_scale, skip_zeros, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
break;
@@ -245,7 +245,7 @@ template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step,
int num_blocks = n/2048;
num_blocks = n % 2048 == 0 ? num_blocks : num_blocks + 1;
CUDA_CHECK_RETURN(hipMemset(&gnorm_vec[step % 100], 0, 1*sizeof(float)));
hipLaunchKernelGGL(( kPercentileClipping<T, 2048, 4>), dim3(num_blocks), dim3(512), 0, 0, g, gnorm_vec, step, n);
hipLaunchKernelGGL(( kPercentileClipping<T, 2048, 4>), dim3(num_blocks), dim3(512), 0, 0, g, gnorm_vec, step, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
}

@@ -669,7 +669,7 @@ void spmm_coo(hipsparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_va
template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
{

hipLaunchKernelGGL(( kspmm_coo_very_sparse_naive<T, 8, BITS>), dim3(nnz_rows), dim3(256), 0, 0, max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
hipLaunchKernelGGL(( kspmm_coo_very_sparse_naive<T, 8, BITS>), dim3(nnz_rows), dim3(256), 0, 0, max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
CUDA_CHECK_RETURN(hipPeekAtLastError());
}

@@ -679,9 +679,9 @@ template <typename T> void gemm_host(int m, int n, int k, T * A, T* B, T * out
int num_blocks = (m+31)/32;

if(bits == 32)
hipLaunchKernelGGL(( gemm_device<T, 32, 32>), dim3(num_blocks), dim3(32), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
hipLaunchKernelGGL(( gemm_device<T, 32, 32>), dim3(num_blocks), dim3(32), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
if(bits == 16)
hipLaunchKernelGGL(( gemm_device<T, 16, 160>), dim3(num_blocks), dim3(160), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
hipLaunchKernelGGL(( gemm_device<T, 16, 160>), dim3(num_blocks), dim3(160), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
}

template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
@@ -689,7 +689,7 @@ template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A, unsi

int num_blocks = (m+31)/32;

hipLaunchKernelGGL(( kgemm_4bit_inference<T, 96>), dim3(num_blocks), dim3(96), 0, 0, m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
hipLaunchKernelGGL(( kgemm_4bit_inference<T, 96>), dim3(num_blocks), dim3(96), 0, 0, m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
}

template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int k, T * A, unsigned char* B, float *absmax, float *datatype, T * out, int lda, int ldb, int ldc, int blocksize, hipStream_t stream)
@@ -712,7 +712,7 @@ template <typename T, int FUNC> void func(T *A, T *B, T value, long n)
int blocks = n/threads;
blocks = n % threads == 0 ? blocks : blocks + 1;
blocks = blocks > 65535 ? 65535 : blocks;
hipLaunchKernelGGL(( kfunc<T, FUNC>), dim3(blocks), dim3(512), 0, 0, A, B, value, n);
hipLaunchKernelGGL(( kfunc<T, FUNC>), dim3(blocks), dim3(512), 0, 0, A, B, value, n);
CUDA_CHECK_RETURN(hipPeekAtLastError());
}



+ 1
- 1
docs/source/installation.mdx View File

@@ -25,7 +25,7 @@ additional platforms such as AMD ROCm.

These are the minimum requirements for `bitsandbytes` across all platforms. Please be aware that some compute platforms may impose more strict requirements.

* Python >= 3.9
* Python >= 3.10
* PyTorch >= 2.3

## NVIDIA CUDA[[cuda]]


+ 11
- 7
pyproject.toml View File

@@ -11,7 +11,7 @@ maintainers = [
{name="Titus von Köller", email="titus@huggingface.co"},
{name="Matthew Douglas", email="matthew.douglas@huggingface.co"}
]
requires-python = ">=3.9"
requires-python = ">=3.10"
readme = "README.md"
license = "MIT"
license-files = ["LICENSE"]
@@ -35,11 +35,11 @@ classifiers = [
"Operating System :: Microsoft :: Windows",
"Programming Language :: C++",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Topic :: Scientific/Engineering :: Artificial Intelligence"
]
dependencies = [
@@ -60,7 +60,7 @@ docs = ["hf-doc-builder==0.5.0"]
dev = [
"bitsandbytes[test]",
"build>=1.0.0,<2",
"ruff==0.11.2",
"ruff~=0.14.3",
"pre-commit>=3.5.0,<4",
"wheel>=0.42,<1"
]
@@ -108,7 +108,7 @@ src = [
"tests",
"benchmarking"
]
target-version = "py39"
target-version = "py310"
line-length = 119

[tool.ruff.lint]
@@ -125,13 +125,14 @@ select = [
ignore = [
"B007", # Loop control variable not used within the loop body (TODO: enable)
"B028", # Warning without stacklevel (TODO: enable)
"B905", # zip without explicit `strict=` kwarg
"E501", # Suppress line-too-long warnings: trust yapf's judgement on this one.
"E701", # Multiple statements on one line (TODO: enable)
"E712", # Allow using if x == False, as it's not always equivalent to if x.
"E731", # Do not use lambda
"RUF012", # Mutable class attribute annotations
"RUF034", # Useless if-else (TODO: enable)
"ISC001", # single-line-implicit-string-concatenation incompatible with formatter
"RUF012",# Mutable class attribute annotations
"RUF034",# Useless if-else (TODO: enable)
"UP045", # Use `X | None` instead of `Optional[X]`
]

[tool.ruff.lint.extend-per-file-ignores]
@@ -145,6 +146,9 @@ ignore = [
"F841",
"UP030",
]
"bitsandbytes/**/triton/**/*.py" = [
"I001", # import order
]

[tool.ruff.lint.isort]
combine-as-imports = true


+ 1
- 1
setup.py View File

@@ -31,7 +31,7 @@ class ExtBuildPy(build_py):


setup(
version="0.48.3.dev0",
version="0.49.0.dev0",
packages=find_packages(),
distclass=BinaryDistribution,
cmake_source_dir=".",


+ 1
- 1
tests/test_deprecated.py View File

@@ -52,7 +52,7 @@ def test_percentile_clipping(gtype):
else:
gnorm_vec1[step % 100] = gnorm2

vals, idx = torch.sort(gnorm_vec1)
vals, _ = torch.sort(gnorm_vec1)
clip1 = vals[percentile]

torch.testing.assert_close(gnorm_vec1, torch.sqrt(gnorm_vec2))


+ 4
- 4
tests/test_functional.py View File

@@ -312,7 +312,7 @@ class Test8BitBlockwiseQuantizeFunctional:
def test_bench_dequantization(self):
a = torch.rand(1024, 1024, device="cuda").half()
code = F.create_fp8_map(True, 3, 0, 4).cuda()
qa, SA = F.quantize_blockwise(a, code=code)
qa, _SA = F.quantize_blockwise(a, code=code)
print(qa.max())

max_theoretical_mu = 1024 * 1024 * 2 / 1024**3 / 672 * 1000 * 1000
@@ -321,7 +321,7 @@ class Test8BitBlockwiseQuantizeFunctional:
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
qa, SA = F.quantize_blockwise(a)
qa, _SA = F.quantize_blockwise(a)
torch.cuda.synchronize()
# print((time.time()-t0)/1e6)

@@ -1004,7 +1004,7 @@ class TestSpMMFunctional:
torch.nn.init.xavier_uniform_(B)
Bt = B.t().contiguous()

CB, CBt, statsB, statsBt, coo_tensor = F.int8_double_quant(B)
_CB, CBt, _statsB, statsBt, _coo_tensor = F.int8_double_quant(B)

rowidx = torch.randint(0, A.shape[-1], size=(15,))

@@ -1023,7 +1023,7 @@ class TestSpMMFunctional:

values, counts = torch.unique(cooA.rowidx, return_counts=True)
offset = counts.cumsum(0).int()
max_count, max_idx = torch.sort(counts, descending=True)
max_count, _ = torch.sort(counts, descending=True)
print(torch.median(max_count.float()))

torch.testing.assert_close(out2, out3, rtol=0.05, atol=0.001)


+ 1
- 1
tests/test_optim.py View File

@@ -496,7 +496,7 @@ def test_adam_percentile_clipping(requires_cuda, dim1, dim2, gtype, optim_bits):
g2 = g1.clone()
p2.grad = g2

current_gnorm, clip_val, gnorm_scale = F.percentile_clipping(g1, gnorm_vec, step, 5)
_current_gnorm, _clip_val, gnorm_scale = F.percentile_clipping(g1, gnorm_vec, step, 5)
g1 = (g1.float() * gnorm_scale).to(gtype)
p1.grad = g1



+ 2
- 2
tests/test_parametrize.py View File

@@ -246,14 +246,14 @@ def test_error_conditions():
replace_parameter_4bit(module, "nonexistent")

# Test TypeError for non-Parameter attribute
with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn.Parameter"):
with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn\\.Parameter"):
replace_parameter_4bit(module, "not_param")

# Test same errors for prequantized version
with pytest.raises(AttributeError, match="Module does not have parameter 'nonexistent'"):
replace_parameter_4bit_prequantized(module, "nonexistent", {}, torch.device("cpu"))

with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn.Parameter"):
with pytest.raises(TypeError, match="Parameter 'not_param' is not an instance of nn\\.Parameter"):
replace_parameter_4bit_prequantized(module, "not_param", {}, torch.device("cpu"))



