Skip to content

Commit cdadb02

Browse files
authored
Make Video Tests faster (huggingface#5787)
* update test
* update
1 parent 51fd3dd commit cdadb02

File tree

5 files changed

+34
-18
lines changed

5 files changed

+34
-18
lines changed

src/diffusers/models/resnet.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -985,30 +985,30 @@ class TemporalConvLayer(nn.Module):
985985
dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
986986
"""
987987

988-
def __init__(self, in_dim: int, out_dim: Optional[int] = None, dropout: float = 0.0):
988+
def __init__(self, in_dim: int, out_dim: Optional[int] = None, dropout: float = 0.0, norm_num_groups: int = 32):
989989
super().__init__()
990990
out_dim = out_dim or in_dim
991991
self.in_dim = in_dim
992992
self.out_dim = out_dim
993993

994994
# conv layers
995995
self.conv1 = nn.Sequential(
996-
nn.GroupNorm(32, in_dim), nn.SiLU(), nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0))
996+
nn.GroupNorm(norm_num_groups, in_dim), nn.SiLU(), nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0))
997997
)
998998
self.conv2 = nn.Sequential(
999-
nn.GroupNorm(32, out_dim),
999+
nn.GroupNorm(norm_num_groups, out_dim),
10001000
nn.SiLU(),
10011001
nn.Dropout(dropout),
10021002
nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
10031003
)
10041004
self.conv3 = nn.Sequential(
1005-
nn.GroupNorm(32, out_dim),
1005+
nn.GroupNorm(norm_num_groups, out_dim),
10061006
nn.SiLU(),
10071007
nn.Dropout(dropout),
10081008
nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
10091009
)
10101010
self.conv4 = nn.Sequential(
1011-
nn.GroupNorm(32, out_dim),
1011+
nn.GroupNorm(norm_num_groups, out_dim),
10121012
nn.SiLU(),
10131013
nn.Dropout(dropout),
10141014
nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),

src/diffusers/models/unet_3d_blocks.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -269,6 +269,7 @@ def __init__(
269269
in_channels,
270270
in_channels,
271271
dropout=0.1,
272+
norm_num_groups=resnet_groups,
272273
)
273274
]
274275
attentions = []
@@ -316,6 +317,7 @@ def __init__(
316317
in_channels,
317318
in_channels,
318319
dropout=0.1,
320+
norm_num_groups=resnet_groups,
319321
)
320322
)
321323

@@ -406,6 +408,7 @@ def __init__(
406408
out_channels,
407409
out_channels,
408410
dropout=0.1,
411+
norm_num_groups=resnet_groups,
409412
)
410413
)
411414
attentions.append(
@@ -529,6 +532,7 @@ def __init__(
529532
out_channels,
530533
out_channels,
531534
dropout=0.1,
535+
norm_num_groups=resnet_groups,
532536
)
533537
)
534538

@@ -622,6 +626,7 @@ def __init__(
622626
out_channels,
623627
out_channels,
624628
dropout=0.1,
629+
norm_num_groups=resnet_groups,
625630
)
626631
)
627632
attentions.append(
@@ -764,6 +769,7 @@ def __init__(
764769
out_channels,
765770
out_channels,
766771
dropout=0.1,
772+
norm_num_groups=resnet_groups,
767773
)
768774
)
769775

src/diffusers/models/unet_3d_condition.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -173,6 +173,7 @@ def __init__(
173173
attention_head_dim=attention_head_dim,
174174
in_channels=block_out_channels[0],
175175
num_layers=1,
176+
norm_num_groups=norm_num_groups,
176177
)
177178

178179
# class embedding

tests/pipelines/text_to_video_synthesis/test_text_to_video.py

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -62,15 +62,16 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
6262
def get_dummy_components(self):
6363
torch.manual_seed(0)
6464
unet = UNet3DConditionModel(
65-
block_out_channels=(32, 32),
66-
layers_per_block=2,
65+
block_out_channels=(4, 8),
66+
layers_per_block=1,
6767
sample_size=32,
6868
in_channels=4,
6969
out_channels=4,
7070
down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
7171
up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
7272
cross_attention_dim=4,
7373
attention_head_dim=4,
74+
norm_num_groups=2,
7475
)
7576
scheduler = DDIMScheduler(
7677
beta_start=0.00085,
@@ -81,13 +82,14 @@ def get_dummy_components(self):
8182
)
8283
torch.manual_seed(0)
8384
vae = AutoencoderKL(
84-
block_out_channels=(32,),
85+
block_out_channels=(8,),
8586
in_channels=3,
8687
out_channels=3,
8788
down_block_types=["DownEncoderBlock2D"],
8889
up_block_types=["UpDecoderBlock2D"],
8990
latent_channels=4,
9091
sample_size=32,
92+
norm_num_groups=2,
9193
)
9294
torch.manual_seed(0)
9395
text_encoder_config = CLIPTextConfig(
@@ -142,10 +144,11 @@ def test_text_to_video_default_case(self):
142144
image_slice = frames[0][-3:, -3:, -1]
143145

144146
assert frames[0].shape == (32, 32, 3)
145-
expected_slice = np.array([91.0, 152.0, 66.0, 192.0, 94.0, 126.0, 101.0, 123.0, 152.0])
147+
expected_slice = np.array([192.0, 44.0, 157.0, 140.0, 108.0, 104.0, 123.0, 144.0, 129.0])
146148

147149
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
148150

151+
@unittest.skipIf(torch_device != "cuda", reason="Feature isn't heavily used. Test in CUDA environment only.")
149152
def test_attention_slicing_forward_pass(self):
150153
self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)
151154

tests/pipelines/text_to_video_synthesis/test_video_to_video.py

Lines changed: 15 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -70,15 +70,16 @@ class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
7070
def get_dummy_components(self):
7171
torch.manual_seed(0)
7272
unet = UNet3DConditionModel(
73-
block_out_channels=(32, 64, 64, 64),
74-
layers_per_block=2,
73+
block_out_channels=(4, 8),
74+
layers_per_block=1,
7575
sample_size=32,
7676
in_channels=4,
7777
out_channels=4,
78-
down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
79-
up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
78+
down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
79+
up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
8080
cross_attention_dim=32,
8181
attention_head_dim=4,
82+
norm_num_groups=2,
8283
)
8384
scheduler = DDIMScheduler(
8485
beta_start=0.00085,
@@ -89,13 +90,18 @@ def get_dummy_components(self):
8990
)
9091
torch.manual_seed(0)
9192
vae = AutoencoderKL(
92-
block_out_channels=[32, 64],
93+
block_out_channels=[
94+
8,
95+
],
9396
in_channels=3,
9497
out_channels=3,
95-
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
96-
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
98+
down_block_types=[
99+
"DownEncoderBlock2D",
100+
],
101+
up_block_types=["UpDecoderBlock2D"],
97102
latent_channels=4,
98-
sample_size=128,
103+
sample_size=32,
104+
norm_num_groups=2,
99105
)
100106
torch.manual_seed(0)
101107
text_encoder_config = CLIPTextConfig(
@@ -154,7 +160,7 @@ def test_text_to_video_default_case(self):
154160
image_slice = frames[0][-3:, -3:, -1]
155161

156162
assert frames[0].shape == (32, 32, 3)
157-
expected_slice = np.array([106, 117, 113, 174, 137, 112, 148, 151, 131])
163+
expected_slice = np.array([162.0, 136.0, 132.0, 140.0, 139.0, 137.0, 169.0, 134.0, 132.0])
158164

159165
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
160166

0 commit comments

Comments
 (0)