Updates

pytorch · jainapurva · Feb 27, 2025 · Mar 2, 2025 · Mar 4, 2025 · May 21, 2025
commit b7cb54cfe360c1743093e1aa2613c9ad1e86fd2b
diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py
@@ -222,7 +222,7 @@ def test_flatten_unflatten(self, device, dtype):
         if device == "cuda" and dtype == torch.bfloat16 and is_fbcode():
             raise unittest.SkipTest("TODO: Failing for cuda + bfloat16 in fbcode")
         if device == "cuda" and dtype == torch.bfloat16 and is_sm_at_least_90():
-            raise unittest.SkipTest('TODO: Failing on H100')
+            raise unittest.SkipTest("TODO: Failing on H100")
         apply_quant_list = get_quantization_functions(False, True, device)
         for apply_quant in apply_quant_list:
             linear = torch.nn.Linear(128, 256, dtype=dtype, device=device)

diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py
@@ -27,7 +27,6 @@
     quantize_,
 )
 from torchao.quantization.granularity import (
-    Granularity,
     PerRow,
     PerTensor,
 )
@@ -145,9 +144,8 @@ def test_invalid_granularity(self):
         with pytest.raises(ValueError, match="Invalid granularity specification"):
             model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
             quantize_(
-                model, 
-                float8_dynamic_activation_float8_weight(granularity="invalid")
-            )        
+                model, float8_dynamic_activation_float8_weight(granularity="invalid")
+            )
 
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
@@ -160,7 +158,9 @@ def test_mismatched_granularity(self):
             model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
             quantize_(
                 model,
-                float8_dynamic_activation_float8_weight(granularity=(PerTensor(), PerRow()))
+                float8_dynamic_activation_float8_weight(
+                    granularity=(PerTensor(), PerRow())
+                ),
             )
 
     @unittest.skipIf(
@@ -169,14 +169,17 @@ def test_mismatched_granularity(self):
     def test_unsupported_granularity(self):
         class UnsupportedGranularity:
             pass
+
         with pytest.raises(
             ValueError,
             match="Invalid granularity types:",
         ):
             model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
             quantize_(
-                model, 
-                float8_dynamic_activation_float8_weight(granularity=(UnsupportedGranularity(), UnsupportedGranularity()))
+                model,
+                float8_dynamic_activation_float8_weight(
+                    granularity=(UnsupportedGranularity(), UnsupportedGranularity())
+                ),
             )
 
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")

diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
@@ -617,7 +617,9 @@ def world_size(self) -> int:
         reason="torch >= 2.4 required",
     )
     @skip_if_lt_x_gpu(2)
-    @pytest.mark.skipif(is_sm_at_least_90(), reason="Skipping test on SM90+") # TODO: fix
+    @pytest.mark.skipif(
+        is_sm_at_least_90(), reason="Skipping test on SM90+"
+    )  # TODO: fix
     def test_qlora_fsdp2(self):
         from torch.distributed._composable.fsdp import CPUOffloadPolicy, OffloadPolicy
 

diff --git a/test/prototype/test_low_bit_optim.py b/test/prototype/test_low_bit_optim.py
@@ -420,7 +420,9 @@ def world_size(self) -> int:
     )
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
     @skip_if_rocm("ROCm enablement in progress")
-    @pytest.mark.skipif(is_sm_at_least_90(), reason="Will need more investigation on H100")
+    @pytest.mark.skipif(
+        is_sm_at_least_90(), reason="Will need more investigation on H100"
+    )
     def test_fsdp2(self):
         optim_classes = [low_bit_optim.AdamW8bit, low_bit_optim.AdamW4bit]
         if torch.cuda.get_device_capability() >= (8, 9):
@@ -532,7 +534,9 @@ def _test_fsdp2(self, optim_cls):
     )
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
     @skip_if_rocm("ROCm enablement in progress")
-    @pytest.mark.skipif(is_sm_at_least_90(), reason="Will need more investigation on H100") # TODO: investigate why this test fails on H100
+    @pytest.mark.skipif(
+        is_sm_at_least_90(), reason="Will need more investigation on H100"
+    )  # TODO: investigate why this test fails on H100
     def test_uneven_shard(self):
         in_dim = 512
         out_dim = _FSDP_WORLD_SIZE * 16 + 1

diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
@@ -1,7 +1,10 @@
-from unittest import skipIf
 import pytest
 
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_4, TORCH_VERSION_AT_LEAST_2_6, is_sm_at_least_90
+from torchao.utils import (
+    TORCH_VERSION_AT_LEAST_2_4,
+    TORCH_VERSION_AT_LEAST_2_6,
+    is_sm_at_least_90,
+)
 
 if not TORCH_VERSION_AT_LEAST_2_4:
     pytest.skip("Requires torch>=2.4", allow_module_level=True)
@@ -296,7 +299,9 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
-    @pytest.mark.skipif(is_sm_at_least_90(), reason="Skipping test on SM90+") # TODO: fix
+    @pytest.mark.skipif(
+        is_sm_at_least_90(), reason="Skipping test on SM90+"
+    )  # TODO: fix
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()
 
@@ -389,7 +394,9 @@ def _run_subtest(self, args):
             )
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
-    @pytest.mark.skipif(is_sm_at_least_90(), reason="Skipping test on SM90+") # TODO: fix
+    @pytest.mark.skipif(
+        is_sm_at_least_90(), reason="Skipping test on SM90+"
+    )  # TODO: fix
     def test_precompute_bitnet_scale(self):
         from torchao.prototype.quantized_training.bitnet import (
             get_bitnet_scale,

diff --git a/test/prototype/test_smoothquant.py b/test/prototype/test_smoothquant.py
@@ -62,7 +62,9 @@ def forward(self, x):
     torch._dynamo.config.cache_size_limit = 128
 
 
-@pytest.mark.skipif(is_sm_at_least_90(), reason="Does not run on H100") # TODO: fix this test on H100
+@pytest.mark.skipif(
+    is_sm_at_least_90(), reason="Does not run on H100"
+)  # TODO: fix this test on H100
 @pytest.mark.parametrize("bias", bias_list)
 @pytest.mark.parametrize("alpha", alpha_list)
 @pytest.mark.parametrize("quant_mode", quant_mode_list)
@@ -138,7 +140,9 @@ def forward(self, x):
         assert torch.allclose(out, out_ref.to(idtype), atol=atol)
 
 
-@pytest.mark.skipif(is_sm_at_least_90(), reason="Does not run on H100") # TODO: fix this test on H100
+@pytest.mark.skipif(
+    is_sm_at_least_90(), reason="Does not run on H100"
+)  # TODO: fix this test on H100
 @pytest.mark.parametrize("alpha", alpha_list)
 @pytest.mark.parametrize("quant_mode", quant_mode_list)
 @pytest.mark.parametrize("device", devices)

diff --git a/test/test_rowwise_scaled_linear_cutlass.py b/test/test_rowwise_scaled_linear_cutlass.py
@@ -8,7 +8,7 @@
     rowwise_scaled_linear_cutlass_s8s4,
 )
 from torchao.quantization.utils import group_quantize_tensor_symmetric
-from torchao.utils import is_sm_at_least_89, is_sm_at_least_90
+from torchao.utils import is_sm_at_least_90
 
 ROWWISE_SCALED_LINEAR_CUTLASS_DTYPE = [torch.float16, torch.bfloat16]
 ROWWISE_SCALED_LINEAR_CUTLASS_BATCH_SIZE = [1, 4, 8, 16, 32, 64]

diff --git a/torchao/utils.py b/torchao/utils.py
@@ -6,7 +6,6 @@
 from importlib.metadata import version
 from math import gcd
 from typing import Any, Callable, Tuple
-import warnings
 
 import torch
 import torch.nn.utils.parametrize as parametrize