     StableDiffusionXLImg2ImgPipeline,
     UNet2DConditionModel,
 )
-from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    require_torch_gpu,
+    torch_device,
+)

 from ..pipeline_params import (
     IMAGE_TO_IMAGE_IMAGE_PARAMS,
@@ -159,24 +164,6 @@ def test_stable_diffusion_xl_img2img_euler(self):

         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

-    def test_stable_diffusion_xl_refiner(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components(skip_first_text_encoder=True)
-
-        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = sd_pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-
-        expected_slice = np.array([0.4578, 0.4981, 0.4301, 0.6454, 0.5588, 0.4442, 0.5678, 0.5940, 0.5176])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
     def test_attention_slicing_forward_pass(self):
         super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)

@@ -195,7 +182,8 @@ def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self):
         sd_pipe.set_progress_bar_config(disable=None)

         # forward without prompt embeds
-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
         negative_prompt = 3 * ["this is a negative prompt"]
         inputs["negative_prompt"] = negative_prompt
         inputs["prompt"] = 3 * [inputs["prompt"]]
@@ -204,7 +192,8 @@ def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self):
         image_slice_1 = output.images[0, -3:, -3:, -1]

         # forward with prompt embeds
-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
         negative_prompt = 3 * ["this is a negative prompt"]
         prompt = 3 * [inputs.pop("prompt")]
@@ -248,7 +237,8 @@ def test_stable_diffusion_xl_offloads(self):
         for pipe in pipes:
             pipe.unet.set_default_attn_processor()

-            inputs = self.get_dummy_inputs(torch_device)
+            generator_device = "cpu"
+            inputs = self.get_dummy_inputs(generator_device)
             image = pipe(**inputs).images

             image_slices.append(image[0, -3:, -3:, -1].flatten())
@@ -261,13 +251,15 @@ def test_stable_diffusion_xl_multi_prompts(self):
         sd_pipe = self.pipeline_class(**components).to(torch_device)

         # forward with single prompt
-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
         inputs["num_inference_steps"] = 5
         output = sd_pipe(**inputs)
         image_slice_1 = output.images[0, -3:, -3:, -1]

         # forward with same prompt duplicated
-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
         inputs["num_inference_steps"] = 5
         inputs["prompt_2"] = inputs["prompt"]
         output = sd_pipe(**inputs)
@@ -277,7 +269,8 @@ def test_stable_diffusion_xl_multi_prompts(self):
         assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

         # forward with different prompt
-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
         inputs["num_inference_steps"] = 5
         inputs["prompt_2"] = "different prompt"
         output = sd_pipe(**inputs)
@@ -287,14 +280,16 @@ def test_stable_diffusion_xl_multi_prompts(self):
         assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4

         # manually set a negative_prompt
-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
         inputs["num_inference_steps"] = 5
         inputs["negative_prompt"] = "negative prompt"
         output = sd_pipe(**inputs)
         image_slice_1 = output.images[0, -3:, -3:, -1]

         # forward with same negative_prompt duplicated
-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
         inputs["num_inference_steps"] = 5
         inputs["negative_prompt"] = "negative prompt"
         inputs["negative_prompt_2"] = inputs["negative_prompt"]
@@ -305,7 +300,8 @@ def test_stable_diffusion_xl_multi_prompts(self):
         assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

         # forward with different negative_prompt
-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
         inputs["num_inference_steps"] = 5
         inputs["negative_prompt"] = "negative prompt"
         inputs["negative_prompt_2"] = "different negative prompt"
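Note on the hunks above: each one replaces `self.get_dummy_inputs(torch_device)` with a CPU generator device. torch.Generator noise streams are device-dependent, so the same seed yields different values on CUDA and CPU; seeding on CPU keeps the dummy inputs bit-identical whether the test runs on a CPU-only or a GPU runner. A minimal sketch of the pattern (the commented `pipe` call is illustrative, not taken from the diff):

    import torch

    # A CPU-seeded generator produces the same noise sequence on every machine,
    # even when the pipeline itself has been moved to a CUDA device.
    generator = torch.Generator(device="cpu").manual_seed(0)
    # image = pipe(prompt="A painting of a squirrel eating a burger",
    #              generator=generator, num_inference_steps=2).images[0]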
@@ -342,3 +338,229 @@ def test_stable_diffusion_xl_img2img_negative_conditions(self):
             np.abs(image_slice_with_no_neg_conditions.flatten() - image_slice_with_neg_conditions.flatten()).max()
             > 1e-4
         )
+
+
+class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests(
+    PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
+    pipeline_class = StableDiffusionXLImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=72,  # 5 * 8 + 32
+            cross_attention_dim=32,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        torch.manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "tokenizer": None,
+            "text_encoder": None,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "requires_aesthetics_score": True,
+        }
+        return components
+
+    def test_components_function(self):
+        init_components = self.get_dummy_components()
+        init_components.pop("requires_aesthetics_score")
+        pipe = self.pipeline_class(**init_components)
+
+        self.assertTrue(hasattr(pipe, "components"))
+        self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
+
+    def get_dummy_inputs(self, device, seed=0):
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        image = image / 2 + 0.5
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "output_type": "np",
+            "strength": 0.8,
+        }
+        return inputs
+
+    def test_stable_diffusion_xl_img2img_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+
+        expected_slice = np.array([0.4745, 0.4924, 0.4338, 0.6468, 0.5547, 0.4419, 0.5646, 0.5897, 0.5146])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    @require_torch_gpu
+    def test_stable_diffusion_xl_offloads(self):
+        pipes = []
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device)
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe.enable_model_cpu_offload()
+        pipes.append(sd_pipe)
+
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe.enable_sequential_cpu_offload()
+        pipes.append(sd_pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            pipe.unet.set_default_attn_processor()
+
+            generator_device = "cpu"
+            inputs = self.get_dummy_inputs(generator_device)
+            image = pipe(**inputs).images
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
+
+    def test_stable_diffusion_xl_img2img_negative_conditions(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        sd_pipe = self.pipeline_class(**components)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice_with_no_neg_conditions = image[0, -3:, -3:, -1]
+
+        image = sd_pipe(
+            **inputs,
+            negative_original_size=(512, 512),
+            negative_crops_coords_top_left=(0, 0),
+            negative_target_size=(1024, 1024),
+        ).images
+        image_slice_with_neg_conditions = image[0, -3:, -3:, -1]
+
+        assert (
+            np.abs(image_slice_with_no_neg_conditions.flatten() - image_slice_with_neg_conditions.flatten()).max()
+            > 1e-4
+        )
+
+    def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        prompt = 3 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=3e-3)
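The new StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests class covers the refiner-only configuration: text_encoder and tokenizer are None, only text_encoder_2/tokenizer_2 condition the UNet, and requires_aesthetics_score=True adds an aesthetic score to the time-embedding conditioning. That is also where projection_class_embeddings_input_dim=72 comes from: five conditioning scalars (original size h/w, crop coordinates top/left, aesthetic score), each embedded to addition_time_embed_dim=8, plus the 32-dim pooled text embedding, i.e. 5 * 8 + 32. Outside the test suite this configuration corresponds to the public SDXL refiner checkpoint; a minimal usage sketch, assuming that checkpoint and a CUDA device (none of this is part of the diff):

    import torch
    from diffusers import StableDiffusionXLImg2ImgPipeline

    # The refiner checkpoint ships without a first text encoder/tokenizer and
    # with requires_aesthetics_score=True, mirroring the dummy components above.
    pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16
    ).to("cuda")

    # `init_image` is a placeholder for an existing image, e.g. a base-SDXL output.
    # refined = pipe("A painting of a squirrel eating a burger",
    #                image=init_image, strength=0.3).images[0]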