Skip to content

Fix failing tests on h100 #2231

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Previous commit
Next commit
Updates
  • Loading branch information
jainapurva committed Mar 4, 2025
commit b7cb54cfe360c1743093e1aa2613c9ad1e86fd2b
2 changes: 1 addition & 1 deletion test/dtypes/test_affine_quantized.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def test_flatten_unflatten(self, device, dtype):
if device == "cuda" and dtype == torch.bfloat16 and is_fbcode():
raise unittest.SkipTest("TODO: Failing for cuda + bfloat16 in fbcode")
if device == "cuda" and dtype == torch.bfloat16 and is_sm_at_least_90():
raise unittest.SkipTest('TODO: Failing on H100')
raise unittest.SkipTest("TODO: Failing on H100")
apply_quant_list = get_quantization_functions(False, True, device)
for apply_quant in apply_quant_list:
linear = torch.nn.Linear(128, 256, dtype=dtype, device=device)
Expand Down
17 changes: 10 additions & 7 deletions test/dtypes/test_affine_quantized_float.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
quantize_,
)
from torchao.quantization.granularity import (
Granularity,
PerRow,
PerTensor,
)
Expand Down Expand Up @@ -145,9 +144,8 @@ def test_invalid_granularity(self):
with pytest.raises(ValueError, match="Invalid granularity specification"):
model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
quantize_(
model,
float8_dynamic_activation_float8_weight(granularity="invalid")
)
model, float8_dynamic_activation_float8_weight(granularity="invalid")
)

@unittest.skipIf(
not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
Expand All @@ -160,7 +158,9 @@ def test_mismatched_granularity(self):
model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
quantize_(
model,
float8_dynamic_activation_float8_weight(granularity=(PerTensor(), PerRow()))
float8_dynamic_activation_float8_weight(
granularity=(PerTensor(), PerRow())
),
)

@unittest.skipIf(
Expand All @@ -169,14 +169,17 @@ def test_mismatched_granularity(self):
def test_unsupported_granularity(self):
class UnsupportedGranularity:
pass

with pytest.raises(
ValueError,
match="Invalid granularity types:",
):
model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
quantize_(
model,
float8_dynamic_activation_float8_weight(granularity=(UnsupportedGranularity(), UnsupportedGranularity()))
model,
float8_dynamic_activation_float8_weight(
granularity=(UnsupportedGranularity(), UnsupportedGranularity())
),
)

@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
Expand Down
4 changes: 3 additions & 1 deletion test/dtypes/test_nf4.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,9 @@ def world_size(self) -> int:
reason="torch >= 2.4 required",
)
@skip_if_lt_x_gpu(2)
@pytest.mark.skipif(is_sm_at_least_90(), reason="Skipping test on SM90+") # TODO: fix
@pytest.mark.skipif(
is_sm_at_least_90(), reason="Skipping test on SM90+"
) # TODO: fix
def test_qlora_fsdp2(self):
from torch.distributed._composable.fsdp import CPUOffloadPolicy, OffloadPolicy

Expand Down
8 changes: 6 additions & 2 deletions test/prototype/test_low_bit_optim.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,9 @@ def world_size(self) -> int:
)
@skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
@skip_if_rocm("ROCm enablement in progress")
@pytest.mark.skipif(is_sm_at_least_90(), reason="Will need more investigation on H100")
@pytest.mark.skipif(
is_sm_at_least_90(), reason="Will need more investigation on H100"
)
def test_fsdp2(self):
optim_classes = [low_bit_optim.AdamW8bit, low_bit_optim.AdamW4bit]
if torch.cuda.get_device_capability() >= (8, 9):
Expand Down Expand Up @@ -532,7 +534,9 @@ def _test_fsdp2(self, optim_cls):
)
@skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
@skip_if_rocm("ROCm enablement in progress")
@pytest.mark.skipif(is_sm_at_least_90(), reason="Will need more investigation on H100") # TODO: investigate why this test fails on H100
@pytest.mark.skipif(
is_sm_at_least_90(), reason="Will need more investigation on H100"
) # TODO: investigate why this test fails on H100
def test_uneven_shard(self):
in_dim = 512
out_dim = _FSDP_WORLD_SIZE * 16 + 1
Expand Down
15 changes: 11 additions & 4 deletions test/prototype/test_quantized_training.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from unittest import skipIf
import pytest

from torchao.utils import TORCH_VERSION_AT_LEAST_2_4, TORCH_VERSION_AT_LEAST_2_6, is_sm_at_least_90
from torchao.utils import (
TORCH_VERSION_AT_LEAST_2_4,
TORCH_VERSION_AT_LEAST_2_6,
is_sm_at_least_90,
)

if not TORCH_VERSION_AT_LEAST_2_4:
pytest.skip("Requires torch>=2.4", allow_module_level=True)
Expand Down Expand Up @@ -296,7 +299,9 @@ def world_size(self) -> int:
return _FSDP_WORLD_SIZE

@skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
@pytest.mark.skipif(is_sm_at_least_90(), reason="Skipping test on SM90+") # TODO: fix
@pytest.mark.skipif(
is_sm_at_least_90(), reason="Skipping test on SM90+"
) # TODO: fix
def test_fsdp2_correctness(self):
mp_policy = MixedPrecisionPolicy()

Expand Down Expand Up @@ -389,7 +394,9 @@ def _run_subtest(self, args):
)

@skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
@pytest.mark.skipif(is_sm_at_least_90(), reason="Skipping test on SM90+") # TODO: fix
@pytest.mark.skipif(
is_sm_at_least_90(), reason="Skipping test on SM90+"
) # TODO: fix
def test_precompute_bitnet_scale(self):
from torchao.prototype.quantized_training.bitnet import (
get_bitnet_scale,
Expand Down
8 changes: 6 additions & 2 deletions test/prototype/test_smoothquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ def forward(self, x):
torch._dynamo.config.cache_size_limit = 128


@pytest.mark.skipif(is_sm_at_least_90(), reason="Does not run on H100") # TODO: fix this test on H100
@pytest.mark.skipif(
is_sm_at_least_90(), reason="Does not run on H100"
) # TODO: fix this test on H100
@pytest.mark.parametrize("bias", bias_list)
@pytest.mark.parametrize("alpha", alpha_list)
@pytest.mark.parametrize("quant_mode", quant_mode_list)
Expand Down Expand Up @@ -138,7 +140,9 @@ def forward(self, x):
assert torch.allclose(out, out_ref.to(idtype), atol=atol)


@pytest.mark.skipif(is_sm_at_least_90(), reason="Does not run on H100") # TODO: fix this test on H100
@pytest.mark.skipif(
is_sm_at_least_90(), reason="Does not run on H100"
) # TODO: fix this test on H100
@pytest.mark.parametrize("alpha", alpha_list)
@pytest.mark.parametrize("quant_mode", quant_mode_list)
@pytest.mark.parametrize("device", devices)
Expand Down
2 changes: 1 addition & 1 deletion test/test_rowwise_scaled_linear_cutlass.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
rowwise_scaled_linear_cutlass_s8s4,
)
from torchao.quantization.utils import group_quantize_tensor_symmetric
from torchao.utils import is_sm_at_least_89, is_sm_at_least_90
from torchao.utils import is_sm_at_least_90

ROWWISE_SCALED_LINEAR_CUTLASS_DTYPE = [torch.float16, torch.bfloat16]
ROWWISE_SCALED_LINEAR_CUTLASS_BATCH_SIZE = [1, 4, 8, 16, 32, 64]
Expand Down
1 change: 0 additions & 1 deletion torchao/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from importlib.metadata import version
from math import gcd
from typing import Any, Callable, Tuple
import warnings

import torch
import torch.nn.utils.parametrize as parametrize
Expand Down