Make Video Tests faster (huggingface#5787)

DN6 · web-flow · commit cdadb023a298 · 2023-11-15T10:56:01.000+05:30
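* update tests: shrink the dummy UNet3D/VAE configs (fewer channels, layers and norm groups), refresh the expected slices, and run the attention-slicing test on CUDA only

* update: pass `norm_num_groups` through the 3D UNet blocks to `TemporalConvLayer` instead of hard-coding 32 groups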
diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
@@ -985,30 +985,30 @@ class TemporalConvLayer(nn.Module):
         dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
     """
 
-    def __init__(self, in_dim: int, out_dim: Optional[int] = None, dropout: float = 0.0):
+    def __init__(self, in_dim: int, out_dim: Optional[int] = None, dropout: float = 0.0, norm_num_groups: int = 32):
         super().__init__()
         out_dim = out_dim or in_dim
         self.in_dim = in_dim
         self.out_dim = out_dim
 
         # conv layers
         self.conv1 = nn.Sequential(
-            nn.GroupNorm(32, in_dim), nn.SiLU(), nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0))
+            nn.GroupNorm(norm_num_groups, in_dim), nn.SiLU(), nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0))
         )
         self.conv2 = nn.Sequential(
-            nn.GroupNorm(32, out_dim),
+            nn.GroupNorm(norm_num_groups, out_dim),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
         )
         self.conv3 = nn.Sequential(
-            nn.GroupNorm(32, out_dim),
+            nn.GroupNorm(norm_num_groups, out_dim),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
         )
         self.conv4 = nn.Sequential(
-            nn.GroupNorm(32, out_dim),
+            nn.GroupNorm(norm_num_groups, out_dim),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
diff --git a/src/diffusers/models/unet_3d_blocks.py b/src/diffusers/models/unet_3d_blocks.py
@@ -269,6 +269,7 @@ def __init__(
                 in_channels,
                 in_channels,
                 dropout=0.1,
+                norm_num_groups=resnet_groups,
             )
         ]
         attentions = []
@@ -316,6 +317,7 @@ def __init__(
                     in_channels,
                     in_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )
 
@@ -406,6 +408,7 @@ def __init__(
                     out_channels,
                     out_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )
             attentions.append(
@@ -529,6 +532,7 @@ def __init__(
                     out_channels,
                     out_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )
 
@@ -622,6 +626,7 @@ def __init__(
                     out_channels,
                     out_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )
             attentions.append(
@@ -764,6 +769,7 @@ def __init__(
                     out_channels,
                     out_channels,
                     dropout=0.1,
+                    norm_num_groups=resnet_groups,
                 )
             )
 
diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py
@@ -173,6 +173,7 @@ def __init__(
             attention_head_dim=attention_head_dim,
             in_channels=block_out_channels[0],
             num_layers=1,
+            norm_num_groups=norm_num_groups,
         )
 
         # class embedding
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -62,15 +62,16 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def get_dummy_components(self):
         torch.manual_seed(0)
         unet = UNet3DConditionModel(
-            block_out_channels=(32, 32),
-            layers_per_block=2,
+            block_out_channels=(4, 8),
+            layers_per_block=1,
             sample_size=32,
             in_channels=4,
             out_channels=4,
             down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
             up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
             cross_attention_dim=4,
             attention_head_dim=4,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
             beta_start=0.00085,
@@ -81,13 +82,14 @@ def get_dummy_components(self):
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=(32,),
+            block_out_channels=(8,),
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D"],
             latent_channels=4,
             sample_size=32,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
@@ -142,10 +144,11 @@ def test_text_to_video_default_case(self):
         image_slice = frames[0][-3:, -3:, -1]
 
         assert frames[0].shape == (32, 32, 3)
-        expected_slice = np.array([91.0, 152.0, 66.0, 192.0, 94.0, 126.0, 101.0, 123.0, 152.0])
+        expected_slice = np.array([192.0, 44.0, 157.0, 140.0, 108.0, 104.0, 123.0, 144.0, 129.0])
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
+    @unittest.skipIf(torch_device != "cuda", reason="Feature isn't heavily used. Test in CUDA environment only.")
     def test_attention_slicing_forward_pass(self):
         self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)
 
diff --git a/tests/pipelines/text_to_video_synthesis/test_video_to_video.py b/tests/pipelines/text_to_video_synthesis/test_video_to_video.py
@@ -70,15 +70,16 @@ class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def get_dummy_components(self):
         torch.manual_seed(0)
         unet = UNet3DConditionModel(
-            block_out_channels=(32, 64, 64, 64),
-            layers_per_block=2,
+            block_out_channels=(4, 8),
+            layers_per_block=1,
             sample_size=32,
             in_channels=4,
             out_channels=4,
-            down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
-            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
+            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
+            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
             cross_attention_dim=32,
             attention_head_dim=4,
+            norm_num_groups=2,
         )
         scheduler = DDIMScheduler(
             beta_start=0.00085,
@@ -89,13 +90,18 @@ def get_dummy_components(self):
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=[
+                8,
+            ],
             in_channels=3,
             out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            down_block_types=[
+                "DownEncoderBlock2D",
+            ],
+            up_block_types=["UpDecoderBlock2D"],
             latent_channels=4,
-            sample_size=128,
+            sample_size=32,
+            norm_num_groups=2,
         )
         torch.manual_seed(0)
         text_encoder_config = CLIPTextConfig(
@@ -154,7 +160,7 @@ def test_text_to_video_default_case(self):
         image_slice = frames[0][-3:, -3:, -1]
 
         assert frames[0].shape == (32, 32, 3)
-        expected_slice = np.array([106, 117, 113, 174, 137, 112, 148, 151, 131])
+        expected_slice = np.array([162.0, 136.0, 132.0, 140.0, 139.0, 137.0, 169.0, 134.0, 132.0])
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 

Original file line number	Diff line number	Diff line change
`@@ -269,6 +269,7 @@ def __init__(`
`269`	`269`	`in_channels,`
`270`	`270`	`in_channels,`
`271`	`271`	`dropout=0.1,`
	`272`	`+ norm_num_groups=resnet_groups,`
`272`	`273`	`)`
`273`	`274`	`]`
`274`	`275`	`attentions = []`
`@@ -316,6 +317,7 @@ def __init__(`
`316`	`317`	`in_channels,`
`317`	`318`	`in_channels,`
`318`	`319`	`dropout=0.1,`
	`320`	`+ norm_num_groups=resnet_groups,`
`319`	`321`	`)`
`320`	`322`	`)`
`321`	`323`
`@@ -406,6 +408,7 @@ def __init__(`
`406`	`408`	`out_channels,`
`407`	`409`	`out_channels,`
`408`	`410`	`dropout=0.1,`
	`411`	`+ norm_num_groups=resnet_groups,`
`409`	`412`	`)`
`410`	`413`	`)`
`411`	`414`	`attentions.append(`
`@@ -529,6 +532,7 @@ def __init__(`
`529`	`532`	`out_channels,`
`530`	`533`	`out_channels,`
`531`	`534`	`dropout=0.1,`
	`535`	`+ norm_num_groups=resnet_groups,`
`532`	`536`	`)`
`533`	`537`	`)`
`534`	`538`
`@@ -622,6 +626,7 @@ def __init__(`
`622`	`626`	`out_channels,`
`623`	`627`	`out_channels,`
`624`	`628`	`dropout=0.1,`
	`629`	`+ norm_num_groups=resnet_groups,`
`625`	`630`	`)`
`626`	`631`	`)`
`627`	`632`	`attentions.append(`
`@@ -764,6 +769,7 @@ def __init__(`
`764`	`769`	`out_channels,`
`765`	`770`	`out_channels,`
`766`	`771`	`dropout=0.1,`
	`772`	`+ norm_num_groups=resnet_groups,`
`767`	`773`	`)`
`768`	`774`	`)`
`769`	`775`
Original file line number	Diff line number	Diff line change
`@@ -173,6 +173,7 @@ def __init__(`
`173`	`173`	`attention_head_dim=attention_head_dim,`
`174`	`174`	`in_channels=block_out_channels[0],`
`175`	`175`	`num_layers=1,`
	`176`	`+ norm_num_groups=norm_num_groups,`
`176`	`177`	`)`
`177`	`178`
`178`	`179`	`# class embedding`