1414# limitations under the License.
1515
1616import gc
17- import random
1817import unittest
1918
2019import numpy as np
2524 RobertaSeriesConfig ,
2625 RobertaSeriesModelWithTransformation ,
2726)
28- from diffusers .utils import floats_tensor , slow , torch_device
27+ from diffusers .utils import slow , torch_device
2928from diffusers .utils .testing_utils import require_torch_gpu
30- from transformers import XLMRobertaTokenizer
29+ from transformers import CLIPTextConfig , CLIPTextModel , XLMRobertaTokenizer
3130
3231from ...test_pipelines_common import PipelineTesterMixin
3332
3635
3736
3837class AltDiffusionPipelineFastTests (PipelineTesterMixin , unittest .TestCase ):
39- def tearDown (self ):
40- # clean up the VRAM after each test
41- super ().tearDown ()
42- gc .collect ()
43- torch .cuda .empty_cache ()
38+ pipeline_class = AltDiffusionPipeline
4439
45- @property
46- def dummy_image (self ):
47- batch_size = 1
48- num_channels = 3
49- sizes = (32 , 32 )
50-
51- image = floats_tensor ((batch_size , num_channels ) + sizes , rng = random .Random (0 )).to (torch_device )
52- return image
53-
54- @property
55- def dummy_cond_unet (self ):
40+ def get_dummy_components (self ):
5641 torch .manual_seed (0 )
57- model = UNet2DConditionModel (
42+ unet = UNet2DConditionModel (
5843 block_out_channels = (32 , 64 ),
5944 layers_per_block = 2 ,
6045 sample_size = 32 ,
@@ -64,202 +49,146 @@ def dummy_cond_unet(self):
6449 up_block_types = ("CrossAttnUpBlock2D" , "UpBlock2D" ),
6550 cross_attention_dim = 32 ,
6651 )
67- return model
68-
69- @property
70- def dummy_cond_unet_inpaint (self ):
71- torch .manual_seed (0 )
72- model = UNet2DConditionModel (
73- block_out_channels = (32 , 64 ),
74- layers_per_block = 2 ,
75- sample_size = 32 ,
76- in_channels = 9 ,
77- out_channels = 4 ,
78- down_block_types = ("DownBlock2D" , "CrossAttnDownBlock2D" ),
79- up_block_types = ("CrossAttnUpBlock2D" , "UpBlock2D" ),
80- cross_attention_dim = 32 ,
52+ scheduler = DDIMScheduler (
53+ beta_start = 0.00085 ,
54+ beta_end = 0.012 ,
55+ beta_schedule = "scaled_linear" ,
56+ clip_sample = False ,
57+ set_alpha_to_one = False ,
8158 )
82- return model
83-
84- @property
85- def dummy_vae (self ):
8659 torch .manual_seed (0 )
87- model = AutoencoderKL (
60+ vae = AutoencoderKL (
8861 block_out_channels = [32 , 64 ],
8962 in_channels = 3 ,
9063 out_channels = 3 ,
9164 down_block_types = ["DownEncoderBlock2D" , "DownEncoderBlock2D" ],
9265 up_block_types = ["UpDecoderBlock2D" , "UpDecoderBlock2D" ],
9366 latent_channels = 4 ,
9467 )
95- return model
9668
97- @property
98- def dummy_text_encoder (self ):
69+ # TODO: address the non-deterministic text encoder (fails for save-load tests)
70+ # torch.manual_seed(0)
71+ # text_encoder_config = RobertaSeriesConfig(
72+ # hidden_size=32,
73+ # project_dim=32,
74+ # intermediate_size=37,
75+ # layer_norm_eps=1e-05,
76+ # num_attention_heads=4,
77+ # num_hidden_layers=5,
78+ # vocab_size=5002,
79+ # )
80+ # text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config)
81+
9982 torch .manual_seed (0 )
100- config = RobertaSeriesConfig (
83+ text_encoder_config = CLIPTextConfig (
84+ bos_token_id = 0 ,
85+ eos_token_id = 2 ,
10186 hidden_size = 32 ,
102- project_dim = 32 ,
87+ projection_dim = 32 ,
10388 intermediate_size = 37 ,
10489 layer_norm_eps = 1e-05 ,
10590 num_attention_heads = 4 ,
10691 num_hidden_layers = 5 ,
92+ pad_token_id = 1 ,
10793 vocab_size = 5002 ,
10894 )
109- return RobertaSeriesModelWithTransformation ( config )
95+ text_encoder = CLIPTextModel ( text_encoder_config )
11096
111- @property
112- def dummy_extractor (self ):
113- def extract (* args , ** kwargs ):
114- class Out :
115- def __init__ (self ):
116- self .pixel_values = torch .ones ([0 ])
117-
118- def to (self , device ):
119- self .pixel_values .to (device )
120- return self
121-
122- return Out ()
97+ tokenizer = XLMRobertaTokenizer .from_pretrained ("hf-internal-testing/tiny-xlm-roberta" )
98+ tokenizer .model_max_length = 77
12399
124- return extract
100+ components = {
101+ "unet" : unet ,
102+ "scheduler" : scheduler ,
103+ "vae" : vae ,
104+ "text_encoder" : text_encoder ,
105+ "tokenizer" : tokenizer ,
106+ "safety_checker" : None ,
107+ "feature_extractor" : None ,
108+ }
109+ return components
110+
111+ def get_dummy_inputs (self , device , seed = 0 ):
112+ if str (device ).startswith ("mps" ):
113+ generator = torch .manual_seed (seed )
114+ else :
115+ generator = torch .Generator (device = device ).manual_seed (seed )
116+ inputs = {
117+ "prompt" : "A painting of a squirrel eating a burger" ,
118+ "generator" : generator ,
119+ "num_inference_steps" : 2 ,
120+ "guidance_scale" : 6.0 ,
121+ "output_type" : "numpy" ,
122+ }
123+ return inputs
125124
126125 def test_alt_diffusion_ddim (self ):
127126 device = "cpu" # ensure determinism for the device-dependent torch.Generator
128- unet = self .dummy_cond_unet
129- scheduler = DDIMScheduler (
130- beta_start = 0.00085 ,
131- beta_end = 0.012 ,
132- beta_schedule = "scaled_linear" ,
133- clip_sample = False ,
134- set_alpha_to_one = False ,
135- )
136127
137- vae = self .dummy_vae
138- bert = self .dummy_text_encoder
139- tokenizer = XLMRobertaTokenizer .from_pretrained ("hf-internal-testing/tiny-xlm-roberta" )
140- tokenizer .model_max_length = 77
141-
142- # make sure here that pndm scheduler skips prk
143- alt_pipe = AltDiffusionPipeline (
144- unet = unet ,
145- scheduler = scheduler ,
146- vae = vae ,
147- text_encoder = bert ,
148- tokenizer = tokenizer ,
149- safety_checker = None ,
150- feature_extractor = self .dummy_extractor ,
128+ components = self .get_dummy_components ()
129+ torch .manual_seed (0 )
130+ text_encoder_config = RobertaSeriesConfig (
131+ hidden_size = 32 ,
132+ project_dim = 32 ,
133+ intermediate_size = 37 ,
134+ layer_norm_eps = 1e-05 ,
135+ num_attention_heads = 4 ,
136+ num_hidden_layers = 5 ,
137+ vocab_size = 5002 ,
151138 )
139+ # TODO: remove after fixing the non-deterministic text encoder
140+ text_encoder = RobertaSeriesModelWithTransformation (text_encoder_config )
141+ components ["text_encoder" ] = text_encoder
142+
143+ alt_pipe = AltDiffusionPipeline (** components )
152144 alt_pipe = alt_pipe .to (device )
153145 alt_pipe .set_progress_bar_config (disable = None )
154146
155- prompt = "A photo of an astronaut"
156-
157- generator = torch .Generator (device = device ).manual_seed (0 )
158- output = alt_pipe ([prompt ], generator = generator , guidance_scale = 6.0 , num_inference_steps = 2 , output_type = "np" )
147+ inputs = self .get_dummy_inputs (device )
148+ inputs ["prompt" ] = "A photo of an astronaut"
149+ output = alt_pipe (** inputs )
159150 image = output .images
160-
161- generator = torch .Generator (device = device ).manual_seed (0 )
162- image_from_tuple = alt_pipe (
163- [prompt ],
164- generator = generator ,
165- guidance_scale = 6.0 ,
166- num_inference_steps = 2 ,
167- output_type = "np" ,
168- return_dict = False ,
169- )[0 ]
170-
171151 image_slice = image [0 , - 3 :, - 3 :, - 1 ]
172- image_from_tuple_slice = image_from_tuple [0 , - 3 :, - 3 :, - 1 ]
173152
174153 assert image .shape == (1 , 64 , 64 , 3 )
175154 expected_slice = np .array (
176155 [0.5748162 , 0.60447145 , 0.48821217 , 0.50100636 , 0.5431185 , 0.45763683 , 0.49657696 , 0.48132733 , 0.47573093 ]
177156 )
178157
179158 assert np .abs (image_slice .flatten () - expected_slice ).max () < 1e-2
180- assert np .abs (image_from_tuple_slice .flatten () - expected_slice ).max () < 1e-2
181159
182160 def test_alt_diffusion_pndm (self ):
183161 device = "cpu" # ensure determinism for the device-dependent torch.Generator
184- unet = self .dummy_cond_unet
185- scheduler = PNDMScheduler (skip_prk_steps = True )
186- vae = self .dummy_vae
187- bert = self .dummy_text_encoder
188- tokenizer = XLMRobertaTokenizer .from_pretrained ("hf-internal-testing/tiny-xlm-roberta" )
189- tokenizer .model_max_length = 77
190162
191- # make sure here that pndm scheduler skips prk
192- alt_pipe = AltDiffusionPipeline (
193- unet = unet ,
194- scheduler = scheduler ,
195- vae = vae ,
196- text_encoder = bert ,
197- tokenizer = tokenizer ,
198- safety_checker = None ,
199- feature_extractor = self .dummy_extractor ,
163+ components = self .get_dummy_components ()
164+ components ["scheduler" ] = PNDMScheduler (skip_prk_steps = True )
165+ torch .manual_seed (0 )
166+ text_encoder_config = RobertaSeriesConfig (
167+ hidden_size = 32 ,
168+ project_dim = 32 ,
169+ intermediate_size = 37 ,
170+ layer_norm_eps = 1e-05 ,
171+ num_attention_heads = 4 ,
172+ num_hidden_layers = 5 ,
173+ vocab_size = 5002 ,
200174 )
175+ # TODO: remove after fixing the non-deterministic text encoder
176+ text_encoder = RobertaSeriesModelWithTransformation (text_encoder_config )
177+ components ["text_encoder" ] = text_encoder
178+ alt_pipe = AltDiffusionPipeline (** components )
201179 alt_pipe = alt_pipe .to (device )
202180 alt_pipe .set_progress_bar_config (disable = None )
203181
204- prompt = "A painting of a squirrel eating a burger"
205- generator = torch .Generator (device = device ).manual_seed (0 )
206- output = alt_pipe ([prompt ], generator = generator , guidance_scale = 6.0 , num_inference_steps = 2 , output_type = "np" )
207-
182+ inputs = self .get_dummy_inputs (device )
183+ output = alt_pipe (** inputs )
208184 image = output .images
209-
210- generator = torch .Generator (device = device ).manual_seed (0 )
211- image_from_tuple = alt_pipe (
212- [prompt ],
213- generator = generator ,
214- guidance_scale = 6.0 ,
215- num_inference_steps = 2 ,
216- output_type = "np" ,
217- return_dict = False ,
218- )[0 ]
219-
220185 image_slice = image [0 , - 3 :, - 3 :, - 1 ]
221- image_from_tuple_slice = image_from_tuple [0 , - 3 :, - 3 :, - 1 ]
222186
223187 assert image .shape == (1 , 64 , 64 , 3 )
224188 expected_slice = np .array (
225189 [0.51605093 , 0.5707241 , 0.47365507 , 0.50578886 , 0.5633877 , 0.4642503 , 0.5182081 , 0.48763484 , 0.49084237 ]
226190 )
227191 assert np .abs (image_slice .flatten () - expected_slice ).max () < 1e-2
228- assert np .abs (image_from_tuple_slice .flatten () - expected_slice ).max () < 1e-2
229-
230- @unittest .skipIf (torch_device != "cuda" , "This test requires a GPU" )
231- def test_alt_diffusion_fp16 (self ):
232- """Test that stable diffusion works with fp16"""
233- unet = self .dummy_cond_unet
234- scheduler = PNDMScheduler (skip_prk_steps = True )
235- vae = self .dummy_vae
236- bert = self .dummy_text_encoder
237- tokenizer = XLMRobertaTokenizer .from_pretrained ("hf-internal-testing/tiny-xlm-roberta" )
238- tokenizer .model_max_length = 77
239-
240- # put models in fp16
241- unet = unet .half ()
242- vae = vae .half ()
243- bert = bert .half ()
244-
245- # make sure here that pndm scheduler skips prk
246- alt_pipe = AltDiffusionPipeline (
247- unet = unet ,
248- scheduler = scheduler ,
249- vae = vae ,
250- text_encoder = bert ,
251- tokenizer = tokenizer ,
252- safety_checker = None ,
253- feature_extractor = self .dummy_extractor ,
254- )
255- alt_pipe = alt_pipe .to (torch_device )
256- alt_pipe .set_progress_bar_config (disable = None )
257-
258- prompt = "A painting of a squirrel eating a burger"
259- generator = torch .Generator (device = torch_device ).manual_seed (0 )
260- image = alt_pipe ([prompt ], generator = generator , num_inference_steps = 2 , output_type = "np" ).images
261-
262- assert image .shape == (1 , 64 , 64 , 3 )
263192
264193
265194@slow
0 commit comments