|
|
|
@@ -1795,102 +1795,6 @@ def int8_mm_dequant( |
|
|
|
return result |
|
|
|
|
|
|
|
|
|
|
|
@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
def get_colrow_absmax(
    A: torch.Tensor,
    row_stats: Optional[torch.Tensor] = None,
    col_stats: Optional[torch.Tensor] = None,
    nnz_block_ptr: Optional[torch.Tensor] = None,
    threshold: float = 0.0,
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    """Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The row-wise and column-wise absmax values are determined.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip>
    This function is useful for training, but for inference it is advised to use [`get_row_absmax`] instead.
    The column-wise quantization scales are not typically needed in inference scenarios.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): Input tensor. Any floating point dtype passes the
            check in this function, but when `row_stats` is not provided the row statistics are computed
            by [`get_row_absmax`], which asserts `torch.float16`.
        row_stats (`torch.Tensor`, *optional*): If provided, calculation of row statistics is skipped
            and the tensor is returned unchanged.
        col_stats (`torch.Tensor`, *optional*): If provided, calculation of column statistics is skipped
            and the tensor is returned unchanged.
        nnz_block_ptr (`torch.Tensor`, *optional*): Not used.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing quantization statistics.

        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.bool`, *optional*: A mask indicating the locations of outliers in the
          input tensor, laid out as `[rows, cols]` where `cols` is the last dimension of `A`. This is `None`
          when `threshold` is not greater than 0.0, or when both `row_stats` and `col_stats` were provided.
    """
    # Kept as an assert so the exception type seen by existing callers is unchanged.
    assert A.is_floating_point()

    outlier_mask = None

    if row_stats is None or col_stats is None:
        # Flatten all leading dimensions into rows. `reshape` is used instead of `view`
        # because the element-wise result may keep the strides of a non-contiguous `A`
        # (e.g. a permuted tensor), in which case `view` raises. `A.abs()` is a new
        # tensor, so the in-place masking below never modifies the caller's `A`.
        absA = A.abs().reshape(-1, A.shape[-1])

        if threshold > 0.0:
            # Filter outliers from stats when enabled
            outlier_mask = absA >= threshold
            absA.masked_fill_(outlier_mask, 0.0)

        if row_stats is None:
            # shape [rows]; unsqueeze(-1) gives [rows,1]
            # We have a CUDA kernel for row max, but not yet for cols.
            row_stats = get_row_absmax(A, threshold)

        if col_stats is None:
            # shape [cols]; unsqueeze(0) gives [1,cols]
            col_stats = absA.amax(dim=0, keepdim=False).float()

    return row_stats, col_stats, outlier_mask
|
|
|
|
|
|
|
|
|
|
|
@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
def get_row_absmax(A: torch.Tensor, threshold=0.0):
    """Compute per-row absolute-maximum statistics of `A` for the `LLM.int8()` algorithm.

    All leading dimensions of `A` are treated as rows and the last dimension as columns.
    The computation is delegated to the native `cget_row_stats` routine.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `torch.Tensor` with dtype `torch.float32`: The absolute maximum value for each row, with outliers ignored.
    """
    assert A.dtype == torch.float16

    # Collapse every dimension except the last into a single row count.
    num_rows = prod(A.shape[:-1])
    num_cols = A.shape[-1]

    # Output buffer filled by the native call below; one float32 entry per row.
    stats_out = torch.empty((num_rows,), dtype=torch.float32, device=A.device)

    is_on_gpu([A])

    with _cuda_device_of(A):
        lib.cget_row_stats(
            get_ptr(A),
            get_ptr(stats_out),
            ct.c_float(threshold),
            ct.c_int32(num_rows),
            ct.c_int32(num_cols),
            _get_tensor_stream(A),
        )

    return stats_out
|
|
|
|
|
|
|
|
|
|
|
class COOSparseTensor: |
|
|
|
def __init__( |
|
|
|
self, rows: int, cols: int, nnz: int, rowidx: torch.Tensor, colidx: torch.Tensor, values: torch.Tensor |
|
|
|
|