14
14
# limitations under the License.
15
15
16
16
import gc
17
- import random
18
17
import unittest
19
18
20
19
import numpy as np
25
24
RobertaSeriesConfig ,
26
25
RobertaSeriesModelWithTransformation ,
27
26
)
28
- from diffusers .utils import floats_tensor , slow , torch_device
27
+ from diffusers .utils import slow , torch_device
29
28
from diffusers .utils .testing_utils import require_torch_gpu
30
- from transformers import XLMRobertaTokenizer
29
+ from transformers import CLIPTextConfig , CLIPTextModel , XLMRobertaTokenizer
31
30
32
31
from ...test_pipelines_common import PipelineTesterMixin
33
32
36
35
37
36
38
37
class AltDiffusionPipelineFastTests (PipelineTesterMixin , unittest .TestCase ):
39
- def tearDown (self ):
40
- # clean up the VRAM after each test
41
- super ().tearDown ()
42
- gc .collect ()
43
- torch .cuda .empty_cache ()
38
+ pipeline_class = AltDiffusionPipeline
44
39
45
- @property
46
- def dummy_image (self ):
47
- batch_size = 1
48
- num_channels = 3
49
- sizes = (32 , 32 )
50
-
51
- image = floats_tensor ((batch_size , num_channels ) + sizes , rng = random .Random (0 )).to (torch_device )
52
- return image
53
-
54
- @property
55
- def dummy_cond_unet (self ):
40
+ def get_dummy_components (self ):
56
41
torch .manual_seed (0 )
57
- model = UNet2DConditionModel (
42
+ unet = UNet2DConditionModel (
58
43
block_out_channels = (32 , 64 ),
59
44
layers_per_block = 2 ,
60
45
sample_size = 32 ,
@@ -64,202 +49,146 @@ def dummy_cond_unet(self):
64
49
up_block_types = ("CrossAttnUpBlock2D" , "UpBlock2D" ),
65
50
cross_attention_dim = 32 ,
66
51
)
67
- return model
68
-
69
- @property
70
- def dummy_cond_unet_inpaint (self ):
71
- torch .manual_seed (0 )
72
- model = UNet2DConditionModel (
73
- block_out_channels = (32 , 64 ),
74
- layers_per_block = 2 ,
75
- sample_size = 32 ,
76
- in_channels = 9 ,
77
- out_channels = 4 ,
78
- down_block_types = ("DownBlock2D" , "CrossAttnDownBlock2D" ),
79
- up_block_types = ("CrossAttnUpBlock2D" , "UpBlock2D" ),
80
- cross_attention_dim = 32 ,
52
+ scheduler = DDIMScheduler (
53
+ beta_start = 0.00085 ,
54
+ beta_end = 0.012 ,
55
+ beta_schedule = "scaled_linear" ,
56
+ clip_sample = False ,
57
+ set_alpha_to_one = False ,
81
58
)
82
- return model
83
-
84
- @property
85
- def dummy_vae (self ):
86
59
torch .manual_seed (0 )
87
- model = AutoencoderKL (
60
+ vae = AutoencoderKL (
88
61
block_out_channels = [32 , 64 ],
89
62
in_channels = 3 ,
90
63
out_channels = 3 ,
91
64
down_block_types = ["DownEncoderBlock2D" , "DownEncoderBlock2D" ],
92
65
up_block_types = ["UpDecoderBlock2D" , "UpDecoderBlock2D" ],
93
66
latent_channels = 4 ,
94
67
)
95
- return model
96
68
97
- @property
98
- def dummy_text_encoder (self ):
69
+ # TODO: address the non-deterministic text encoder (fails for save-load tests)
70
+ # torch.manual_seed(0)
71
+ # text_encoder_config = RobertaSeriesConfig(
72
+ # hidden_size=32,
73
+ # project_dim=32,
74
+ # intermediate_size=37,
75
+ # layer_norm_eps=1e-05,
76
+ # num_attention_heads=4,
77
+ # num_hidden_layers=5,
78
+ # vocab_size=5002,
79
+ # )
80
+ # text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config)
81
+
99
82
torch .manual_seed (0 )
100
- config = RobertaSeriesConfig (
83
+ text_encoder_config = CLIPTextConfig (
84
+ bos_token_id = 0 ,
85
+ eos_token_id = 2 ,
101
86
hidden_size = 32 ,
102
- project_dim = 32 ,
87
+ projection_dim = 32 ,
103
88
intermediate_size = 37 ,
104
89
layer_norm_eps = 1e-05 ,
105
90
num_attention_heads = 4 ,
106
91
num_hidden_layers = 5 ,
92
+ pad_token_id = 1 ,
107
93
vocab_size = 5002 ,
108
94
)
109
- return RobertaSeriesModelWithTransformation ( config )
95
+ text_encoder = CLIPTextModel ( text_encoder_config )
110
96
111
- @property
112
- def dummy_extractor (self ):
113
- def extract (* args , ** kwargs ):
114
- class Out :
115
- def __init__ (self ):
116
- self .pixel_values = torch .ones ([0 ])
117
-
118
- def to (self , device ):
119
- self .pixel_values .to (device )
120
- return self
121
-
122
- return Out ()
97
+ tokenizer = XLMRobertaTokenizer .from_pretrained ("hf-internal-testing/tiny-xlm-roberta" )
98
+ tokenizer .model_max_length = 77
123
99
124
- return extract
100
+ components = {
101
+ "unet" : unet ,
102
+ "scheduler" : scheduler ,
103
+ "vae" : vae ,
104
+ "text_encoder" : text_encoder ,
105
+ "tokenizer" : tokenizer ,
106
+ "safety_checker" : None ,
107
+ "feature_extractor" : None ,
108
+ }
109
+ return components
110
+
111
+ def get_dummy_inputs (self , device , seed = 0 ):
112
+ if str (device ).startswith ("mps" ):
113
+ generator = torch .manual_seed (seed )
114
+ else :
115
+ generator = torch .Generator (device = device ).manual_seed (seed )
116
+ inputs = {
117
+ "prompt" : "A painting of a squirrel eating a burger" ,
118
+ "generator" : generator ,
119
+ "num_inference_steps" : 2 ,
120
+ "guidance_scale" : 6.0 ,
121
+ "output_type" : "numpy" ,
122
+ }
123
+ return inputs
125
124
126
125
def test_alt_diffusion_ddim (self ):
127
126
device = "cpu" # ensure determinism for the device-dependent torch.Generator
128
- unet = self .dummy_cond_unet
129
- scheduler = DDIMScheduler (
130
- beta_start = 0.00085 ,
131
- beta_end = 0.012 ,
132
- beta_schedule = "scaled_linear" ,
133
- clip_sample = False ,
134
- set_alpha_to_one = False ,
135
- )
136
127
137
- vae = self .dummy_vae
138
- bert = self .dummy_text_encoder
139
- tokenizer = XLMRobertaTokenizer .from_pretrained ("hf-internal-testing/tiny-xlm-roberta" )
140
- tokenizer .model_max_length = 77
141
-
142
- # make sure here that pndm scheduler skips prk
143
- alt_pipe = AltDiffusionPipeline (
144
- unet = unet ,
145
- scheduler = scheduler ,
146
- vae = vae ,
147
- text_encoder = bert ,
148
- tokenizer = tokenizer ,
149
- safety_checker = None ,
150
- feature_extractor = self .dummy_extractor ,
128
+ components = self .get_dummy_components ()
129
+ torch .manual_seed (0 )
130
+ text_encoder_config = RobertaSeriesConfig (
131
+ hidden_size = 32 ,
132
+ project_dim = 32 ,
133
+ intermediate_size = 37 ,
134
+ layer_norm_eps = 1e-05 ,
135
+ num_attention_heads = 4 ,
136
+ num_hidden_layers = 5 ,
137
+ vocab_size = 5002 ,
151
138
)
139
+ # TODO: remove after fixing the non-deterministic text encoder
140
+ text_encoder = RobertaSeriesModelWithTransformation (text_encoder_config )
141
+ components ["text_encoder" ] = text_encoder
142
+
143
+ alt_pipe = AltDiffusionPipeline (** components )
152
144
alt_pipe = alt_pipe .to (device )
153
145
alt_pipe .set_progress_bar_config (disable = None )
154
146
155
- prompt = "A photo of an astronaut"
156
-
157
- generator = torch .Generator (device = device ).manual_seed (0 )
158
- output = alt_pipe ([prompt ], generator = generator , guidance_scale = 6.0 , num_inference_steps = 2 , output_type = "np" )
147
+ inputs = self .get_dummy_inputs (device )
148
+ inputs ["prompt" ] = "A photo of an astronaut"
149
+ output = alt_pipe (** inputs )
159
150
image = output .images
160
-
161
- generator = torch .Generator (device = device ).manual_seed (0 )
162
- image_from_tuple = alt_pipe (
163
- [prompt ],
164
- generator = generator ,
165
- guidance_scale = 6.0 ,
166
- num_inference_steps = 2 ,
167
- output_type = "np" ,
168
- return_dict = False ,
169
- )[0 ]
170
-
171
151
image_slice = image [0 , - 3 :, - 3 :, - 1 ]
172
- image_from_tuple_slice = image_from_tuple [0 , - 3 :, - 3 :, - 1 ]
173
152
174
153
assert image .shape == (1 , 64 , 64 , 3 )
175
154
expected_slice = np .array (
176
155
[0.5748162 , 0.60447145 , 0.48821217 , 0.50100636 , 0.5431185 , 0.45763683 , 0.49657696 , 0.48132733 , 0.47573093 ]
177
156
)
178
157
179
158
assert np .abs (image_slice .flatten () - expected_slice ).max () < 1e-2
180
- assert np .abs (image_from_tuple_slice .flatten () - expected_slice ).max () < 1e-2
181
159
182
160
def test_alt_diffusion_pndm (self ):
183
161
device = "cpu" # ensure determinism for the device-dependent torch.Generator
184
- unet = self .dummy_cond_unet
185
- scheduler = PNDMScheduler (skip_prk_steps = True )
186
- vae = self .dummy_vae
187
- bert = self .dummy_text_encoder
188
- tokenizer = XLMRobertaTokenizer .from_pretrained ("hf-internal-testing/tiny-xlm-roberta" )
189
- tokenizer .model_max_length = 77
190
162
191
- # make sure here that pndm scheduler skips prk
192
- alt_pipe = AltDiffusionPipeline (
193
- unet = unet ,
194
- scheduler = scheduler ,
195
- vae = vae ,
196
- text_encoder = bert ,
197
- tokenizer = tokenizer ,
198
- safety_checker = None ,
199
- feature_extractor = self .dummy_extractor ,
163
+ components = self .get_dummy_components ()
164
+ components ["scheduler" ] = PNDMScheduler (skip_prk_steps = True )
165
+ torch .manual_seed (0 )
166
+ text_encoder_config = RobertaSeriesConfig (
167
+ hidden_size = 32 ,
168
+ project_dim = 32 ,
169
+ intermediate_size = 37 ,
170
+ layer_norm_eps = 1e-05 ,
171
+ num_attention_heads = 4 ,
172
+ num_hidden_layers = 5 ,
173
+ vocab_size = 5002 ,
200
174
)
175
+ # TODO: remove after fixing the non-deterministic text encoder
176
+ text_encoder = RobertaSeriesModelWithTransformation (text_encoder_config )
177
+ components ["text_encoder" ] = text_encoder
178
+ alt_pipe = AltDiffusionPipeline (** components )
201
179
alt_pipe = alt_pipe .to (device )
202
180
alt_pipe .set_progress_bar_config (disable = None )
203
181
204
- prompt = "A painting of a squirrel eating a burger"
205
- generator = torch .Generator (device = device ).manual_seed (0 )
206
- output = alt_pipe ([prompt ], generator = generator , guidance_scale = 6.0 , num_inference_steps = 2 , output_type = "np" )
207
-
182
+ inputs = self .get_dummy_inputs (device )
183
+ output = alt_pipe (** inputs )
208
184
image = output .images
209
-
210
- generator = torch .Generator (device = device ).manual_seed (0 )
211
- image_from_tuple = alt_pipe (
212
- [prompt ],
213
- generator = generator ,
214
- guidance_scale = 6.0 ,
215
- num_inference_steps = 2 ,
216
- output_type = "np" ,
217
- return_dict = False ,
218
- )[0 ]
219
-
220
185
image_slice = image [0 , - 3 :, - 3 :, - 1 ]
221
- image_from_tuple_slice = image_from_tuple [0 , - 3 :, - 3 :, - 1 ]
222
186
223
187
assert image .shape == (1 , 64 , 64 , 3 )
224
188
expected_slice = np .array (
225
189
[0.51605093 , 0.5707241 , 0.47365507 , 0.50578886 , 0.5633877 , 0.4642503 , 0.5182081 , 0.48763484 , 0.49084237 ]
226
190
)
227
191
assert np .abs (image_slice .flatten () - expected_slice ).max () < 1e-2
228
- assert np .abs (image_from_tuple_slice .flatten () - expected_slice ).max () < 1e-2
229
-
230
- @unittest .skipIf (torch_device != "cuda" , "This test requires a GPU" )
231
- def test_alt_diffusion_fp16 (self ):
232
- """Test that stable diffusion works with fp16"""
233
- unet = self .dummy_cond_unet
234
- scheduler = PNDMScheduler (skip_prk_steps = True )
235
- vae = self .dummy_vae
236
- bert = self .dummy_text_encoder
237
- tokenizer = XLMRobertaTokenizer .from_pretrained ("hf-internal-testing/tiny-xlm-roberta" )
238
- tokenizer .model_max_length = 77
239
-
240
- # put models in fp16
241
- unet = unet .half ()
242
- vae = vae .half ()
243
- bert = bert .half ()
244
-
245
- # make sure here that pndm scheduler skips prk
246
- alt_pipe = AltDiffusionPipeline (
247
- unet = unet ,
248
- scheduler = scheduler ,
249
- vae = vae ,
250
- text_encoder = bert ,
251
- tokenizer = tokenizer ,
252
- safety_checker = None ,
253
- feature_extractor = self .dummy_extractor ,
254
- )
255
- alt_pipe = alt_pipe .to (torch_device )
256
- alt_pipe .set_progress_bar_config (disable = None )
257
-
258
- prompt = "A painting of a squirrel eating a burger"
259
- generator = torch .Generator (device = torch_device ).manual_seed (0 )
260
- image = alt_pipe ([prompt ], generator = generator , num_inference_steps = 2 , output_type = "np" ).images
261
-
262
- assert image .shape == (1 , 64 , 64 , 3 )
263
192
264
193
265
194
@slow
0 commit comments