Commit 02d83c9

Standardize fast pipeline tests with PipelineTestMixin (huggingface#1526)

* [WIP] Standardize fast pipeline tests with PipelineTestMixin
* refactor the sd tests a bit
* add more common tests
* add xformers
* add progressbar test
* cleanup
* upd fp16
* CycleDiffusionPipelineFastTests
* DanceDiffusionPipelineFastTests
* AltDiffusionPipelineFastTests
* StableDiffusion2PipelineFastTests
* StableDiffusion2InpaintPipelineFastTests
* StableDiffusionImageVariationPipelineFastTests
* StableDiffusionImg2ImgPipelineFastTests
* StableDiffusionInpaintPipelineFastTests
* remove unused mixins
* quality
* add missing inits
* try to fix mps tests
* fix mps tests
* add mps warmups
* skip for some pipelines
* style
* Update tests/test_pipelines_common.py

Co-authored-by: Patrick von Platen <[email protected]>
1 parent 9e11029 commit 02d83c9

32 files changed (+975 -2165 lines)

src/diffusers/pipelines/ddim/pipeline_ddim.py (+2 -2)
@@ -96,10 +96,10 @@ def __call__(
 
         if self.device.type == "mps":
             # randn does not work reproducibly on mps
-            image = torch.randn(image_shape, generator=generator)
+            image = torch.randn(image_shape, generator=generator, dtype=self.unet.dtype)
            image = image.to(self.device)
         else:
-            image = torch.randn(image_shape, generator=generator, device=self.device)
+            image = torch.randn(image_shape, generator=generator, device=self.device, dtype=self.unet.dtype)
 
         # set step values
         self.scheduler.set_timesteps(num_inference_steps)
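
The fix above threads the UNet's dtype into the initial noise so fp16 pipelines no longer sample fp32 latents, while keeping the existing mps workaround: a generator-seeded torch.randn is not reproducible when run directly on mps, so the noise is drawn on the CPU and then moved. A standalone sketch of the same pattern (randn_reproducible is a hypothetical helper name, not a diffusers API):

    import torch

    def randn_reproducible(shape, generator, device, dtype):
        # Mirrors pipeline_ddim.py above: on mps, sample on the CPU where the
        # seeded generator lives (randn is not reproducible on mps), then move
        # the tensor over; on other devices, sample directly on-device.
        if torch.device(device).type == "mps":
            return torch.randn(shape, generator=generator, dtype=dtype).to(device)
        return torch.randn(shape, generator=generator, device=device, dtype=dtype)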

tests/pipelines/altdiffusion/test_alt_diffusion.py (+92 -163)
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import gc
-import random
 import unittest
 
 import numpy as np
@@ -25,9 +24,9 @@
     RobertaSeriesConfig,
     RobertaSeriesModelWithTransformation,
 )
-from diffusers.utils import floats_tensor, slow, torch_device
+from diffusers.utils import slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
-from transformers import XLMRobertaTokenizer
+from transformers import CLIPTextConfig, CLIPTextModel, XLMRobertaTokenizer
 
 from ...test_pipelines_common import PipelineTesterMixin
 
@@ -36,25 +35,11 @@
 
 
 class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
+    pipeline_class = AltDiffusionPipeline
 
-    @property
-    def dummy_image(self):
-        batch_size = 1
-        num_channels = 3
-        sizes = (32, 32)
-
-        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
-        return image
-
-    @property
-    def dummy_cond_unet(self):
+    def get_dummy_components(self):
         torch.manual_seed(0)
-        model = UNet2DConditionModel(
+        unet = UNet2DConditionModel(
             block_out_channels=(32, 64),
             layers_per_block=2,
             sample_size=32,
@@ -64,202 +49,146 @@ def dummy_cond_unet(self):
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
             cross_attention_dim=32,
         )
-        return model
-
-    @property
-    def dummy_cond_unet_inpaint(self):
-        torch.manual_seed(0)
-        model = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=9,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
         )
-        return model
-
-    @property
-    def dummy_vae(self):
         torch.manual_seed(0)
-        model = AutoencoderKL(
+        vae = AutoencoderKL(
             block_out_channels=[32, 64],
             in_channels=3,
             out_channels=3,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
         )
-        return model
 
-    @property
-    def dummy_text_encoder(self):
+        # TODO: address the non-deterministic text encoder (fails for save-load tests)
+        # torch.manual_seed(0)
+        # text_encoder_config = RobertaSeriesConfig(
+        #     hidden_size=32,
+        #     project_dim=32,
+        #     intermediate_size=37,
+        #     layer_norm_eps=1e-05,
+        #     num_attention_heads=4,
+        #     num_hidden_layers=5,
+        #     vocab_size=5002,
+        # )
+        # text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config)
+
         torch.manual_seed(0)
-        config = RobertaSeriesConfig(
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
             hidden_size=32,
-            project_dim=32,
+            projection_dim=32,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=5,
+            pad_token_id=1,
             vocab_size=5002,
         )
-        return RobertaSeriesModelWithTransformation(config)
+        text_encoder = CLIPTextModel(text_encoder_config)
 
-    @property
-    def dummy_extractor(self):
-        def extract(*args, **kwargs):
-            class Out:
-                def __init__(self):
-                    self.pixel_values = torch.ones([0])
-
-                def to(self, device):
-                    self.pixel_values.to(device)
-                    return self
-
-            return Out()
+        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
+        tokenizer.model_max_length = 77
 
-        return extract
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "numpy",
+        }
+        return inputs
 
     def test_alt_diffusion_ddim(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
 
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
-        tokenizer.model_max_length = 77
-
-        # make sure here that pndm scheduler skips prk
-        alt_pipe = AltDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+        components = self.get_dummy_components()
+        torch.manual_seed(0)
+        text_encoder_config = RobertaSeriesConfig(
+            hidden_size=32,
+            project_dim=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            vocab_size=5002,
         )
+        # TODO: remove after fixing the non-deterministic text encoder
+        text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config)
+        components["text_encoder"] = text_encoder
+
+        alt_pipe = AltDiffusionPipeline(**components)
         alt_pipe = alt_pipe.to(device)
         alt_pipe.set_progress_bar_config(disable=None)
 
-        prompt = "A photo of an astronaut"
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = alt_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = "A photo of an astronaut"
+        output = alt_pipe(**inputs)
         image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = alt_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            return_dict=False,
-        )[0]
-
         image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array(
             [0.5748162, 0.60447145, 0.48821217, 0.50100636, 0.5431185, 0.45763683, 0.49657696, 0.48132733, 0.47573093]
         )
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_alt_diffusion_pndm(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
-        tokenizer.model_max_length = 77
 
-        # make sure here that pndm scheduler skips prk
-        alt_pipe = AltDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+        components = self.get_dummy_components()
+        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
+        torch.manual_seed(0)
+        text_encoder_config = RobertaSeriesConfig(
+            hidden_size=32,
+            project_dim=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            vocab_size=5002,
        )
+        # TODO: remove after fixing the non-deterministic text encoder
+        text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config)
+        components["text_encoder"] = text_encoder
+        alt_pipe = AltDiffusionPipeline(**components)
         alt_pipe = alt_pipe.to(device)
         alt_pipe.set_progress_bar_config(disable=None)
 
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = alt_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
-
+        inputs = self.get_dummy_inputs(device)
+        output = alt_pipe(**inputs)
         image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = alt_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            return_dict=False,
-        )[0]
-
         image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array(
             [0.51605093, 0.5707241, 0.47365507, 0.50578886, 0.5633877, 0.4642503, 0.5182081, 0.48763484, 0.49084237]
         )
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
-    def test_alt_diffusion_fp16(self):
-        """Test that stable diffusion works with fp16"""
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
-        tokenizer.model_max_length = 77
-
-        # put models in fp16
-        unet = unet.half()
-        vae = vae.half()
-        bert = bert.half()
-
-        # make sure here that pndm scheduler skips prk
-        alt_pipe = AltDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
-        )
-        alt_pipe = alt_pipe.to(torch_device)
-        alt_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        image = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
-
-        assert image.shape == (1, 64, 64, 3)
 
 
 @slow
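
The hooks above are what the shared mixin consumes: each fast-test class declares pipeline_class, get_dummy_components(), and get_dummy_inputs(device), and the common tests in tests/test_pipelines_common.py (not shown in this diff) build and exercise the pipeline from them. A minimal sketch of the kind of save/load round-trip such a common test performs; check_save_load_roundtrip and the 1e-4 tolerance are illustrative, not the actual mixin code:

    import tempfile

    import numpy as np

    def check_save_load_roundtrip(pipeline_class, get_components, get_inputs):
        # Build the pipeline from dummy components and run it once.
        pipe = pipeline_class(**get_components())
        pipe.set_progress_bar_config(disable=None)
        images = pipe(**get_inputs()).images

        # Round-trip through save_pretrained/from_pretrained, rerun with
        # identically seeded inputs, and require matching outputs.
        with tempfile.TemporaryDirectory() as tmpdir:
            pipe.save_pretrained(tmpdir)
            pipe_loaded = pipeline_class.from_pretrained(tmpdir)
        pipe_loaded.set_progress_bar_config(disable=None)
        images_loaded = pipe_loaded(**get_inputs()).images

        assert np.abs(images - images_loaded).max() < 1e-4

This round-trip is also why get_dummy_components() swaps in a deterministic CLIPTextModel: per the TODO above, the RobertaSeriesModelWithTransformation text encoder is non-deterministic and fails exactly this class of save-load test.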

tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py (+1 -3)
@@ -29,13 +29,11 @@
 from diffusers.utils.testing_utils import require_torch_gpu
 from transformers import XLMRobertaTokenizer
 
-from ...test_pipelines_common import PipelineTesterMixin
-
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
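
With the mixin dropped here ("remove unused mixins" in the commit message), the img2img tests keep their own VRAM cleanup, visible in the context lines above. A standalone sketch of that pattern; the torch.cuda.is_available() guard is an addition for CPU-only machines, not part of the diff:

    import gc
    import unittest

    import torch

    class VRAMCleanupTestCase(unittest.TestCase):
        def tearDown(self):
            # Drop Python references first, then release cached CUDA blocks so
            # one test's allocations cannot starve the next test of GPU memory.
            super().tearDown()
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()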

tests/pipelines/audio_diffusion/__init__.py (whitespace-only changes)

tests/pipelines/dance_diffusion/__init__.py (whitespace-only changes)
