Merge 4e570a99a705502948c29519bebff9fb43ea079b into 592fd5daf8177b205af11651bbb31a1834a8b0e0

2025-05-02 13:36:28 +02:00 · 2025-02-24 16:37:23 +08:00 · 2025-02-24 16:37:23 +08:00 · 378a5ed64e
commit 378a5ed64e
parent 592fd5daf8 4e570a99a7
1 changed file with 2 additions and 2 deletions
--- a/inference/model.py
+++ b/inference/model.py
@@ -143,8 +143,8 @@ def linear(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] =
        quantization-aware computations depending on the input parameters.

    Notes:
-        - If `weight` is quantized (e.g., `element_size() == 1`), a dequantized version 
-          is used for computation.
+        - If `weight` is in a higher precision format (e.g., float32 or bfloat16), then `element_size() > 1`, and the original
+          weight tensor is used for computation.
        - If `gemm_impl == "bf16"`, dequantization and a `bf16` GEMM operation are applied.
        - For other cases, the function applies quantization to `x` and uses `fp8_gemm` for computation.
    """