31 | 31 | |
32 | 32 | |
33 | 33 | class UnCLIPPipeline(DiffusionPipeline): |
| 34 | + """ |
| 35 | + Pipeline for text-to-image generation using unCLIP. |
| 36 | + |
| 37 | + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the |
| 38 | + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) |
| 39 | + |
| 40 | + Args: |
| 41 | + text_encoder ([`CLIPTextModelWithProjection`]): |
| 42 | + Frozen text-encoder. |
| 43 | + tokenizer (`CLIPTokenizer`): |
| 44 | + Tokenizer of class |
| 45 | + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). |
| 46 | + prior ([`PriorTransformer`]): |
| 47 | + The canonical unCLIP prior to approximate the image embedding from the text embedding. |
| 48 | + decoder ([`UNet2DConditionModel`]): |
| 49 | + The decoder to invert the image embedding into an image. |
| 50 | + super_res_first ([`UNet2DModel`]): |
| 51 | + Super resolution unet. Used in all but the last step of the super resolution diffusion process. |
| 52 | + super_res_last ([`UNet2DModel`]): |
| 53 | + Super resolution unet. Used in the last step of the super resolution diffusion process. |
| 54 | + prior_scheduler ([`UnCLIPScheduler`]): |
| 55 | + Scheduler used in the prior denoising process. Just a modified DDPMScheduler. |
| 56 | + decoder_scheduler ([`UnCLIPScheduler`]): |
| 57 | + Scheduler used in the decoder denoising process. Just a modified DDPMScheduler. |
| 58 | + super_res_scheduler ([`UnCLIPScheduler`]): |
| 59 | + Scheduler used in the super resolution denoising process. Just a modified DDPMScheduler. |
| 60 | + |
| 61 | + """ |
| 62 | + |
34 | 63 | prior: PriorTransformer |
35 | 64 | decoder: UNet2DConditionModel |
36 | 65 | text_proj: UnCLIPTextProjModel |
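As a rough illustration of how the components documented above fit together end to end, here is a minimal usage sketch. The checkpoint name `kakaobrain/karlo-v1-alpha` and the availability of a CUDA device are assumptions for illustration, not something this diff establishes.

```python
from diffusers import UnCLIPPipeline

# Minimal usage sketch; checkpoint name and CUDA device are assumptions.
pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha")  # assumed checkpoint
pipe = pipe.to("cuda")  # assumes a CUDA device is available

# Internally the pipeline chains prior -> decoder -> super resolution.
image = pipe("a photo of an astronaut riding a horse").images[0]
image.save("astronaut.png")
```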
@@ -173,6 +202,50 @@ def __call__( |
173 | 202 | output_type: Optional[str] = "pil", |
174 | 203 | return_dict: bool = True, |
175 | 204 | ): |
| 205 | + """ |
| 206 | + Function invoked when calling the pipeline for generation. |
| 207 | + |
| 208 | + Args: |
| 209 | + prompt (`str` or `List[str]`): |
| 210 | + The prompt or prompts to guide the image generation. |
| 211 | + num_images_per_prompt (`int`, *optional*, defaults to 1): |
| 212 | + The number of images to generate per prompt. |
| 213 | + prior_num_inference_steps (`int`, *optional*, defaults to 25): |
| 214 | + The number of denoising steps for the prior. More denoising steps usually lead to a higher quality |
| 215 | + image at the expense of slower inference. |
| 216 | + decoder_num_inference_steps (`int`, *optional*, defaults to 25): |
| 217 | + The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality |
| 218 | + image at the expense of slower inference. |
| 219 | + super_res_num_inference_steps (`int`, *optional*, defaults to 7): |
| 220 | + The number of denoising steps for super resolution. More denoising steps usually lead to a higher |
| 221 | + quality image at the expense of slower inference. |
| 222 | + generator (`torch.Generator`, *optional*): |
| 223 | + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) |
| 224 | + to make generation deterministic. |
| 225 | + prior_latents (`torch.FloatTensor` of shape (batch size, embeddings dimension), *optional*): |
| 226 | + Pre-generated noisy latents to be used as inputs for the prior. |
| 227 | + decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): |
| 228 | + Pre-generated noisy latents to be used as inputs for the decoder. |
| 229 | + super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): |
| 230 | + Pre-generated noisy latents to be used as inputs for the super resolution. |
| 231 | + prior_guidance_scale (`float`, *optional*, defaults to 4.0): |
| 232 | + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). |
| 233 | + `guidance_scale` is defined as `w` of equation 2 of the [Imagen |
| 234 | + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > |
| 235 | + 1`. A higher guidance scale encourages the model to generate images closely linked to the text `prompt`, |
| 236 | + usually at the expense of lower image quality. |
| 237 | + decoder_guidance_scale (`float`, *optional*, defaults to 4.0): |
| 238 | + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). |
| 239 | + `guidance_scale` is defined as `w` of equation 2 of the [Imagen |
| 240 | + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > |
| 241 | + 1`. A higher guidance scale encourages the model to generate images closely linked to the text `prompt`, |
| 242 | + usually at the expense of lower image quality. |
| 243 | + output_type (`str`, *optional*, defaults to `"pil"`): |
| 244 | + The output format of the generated image. Choose between |
| 245 | + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. |
| 246 | + return_dict (`bool`, *optional*, defaults to `True`): |
| 247 | + Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. |
| 248 | + """ |
176 | 249 | if isinstance(prompt, str): |
177 | 250 | batch_size = 1 |
178 | 251 | elif isinstance(prompt, list): |
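To make the parameter documentation above concrete, a sketch of a full call with every documented argument spelled out. The checkpoint name, prompt, and seed are illustrative assumptions; the step counts and guidance scales simply restate the documented defaults.

```python
import torch
from diffusers import UnCLIPPipeline

pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha")  # assumed checkpoint
pipe = pipe.to("cuda")  # assumes a CUDA device

# A fixed-seed generator makes the run deterministic, as noted for `generator`.
generator = torch.Generator(device="cuda").manual_seed(0)

output = pipe(
    prompt="a corgi wearing a red bow tie",  # illustrative prompt
    num_images_per_prompt=1,
    prior_num_inference_steps=25,     # documented default
    decoder_num_inference_steps=25,   # documented default
    super_res_num_inference_steps=7,  # documented default
    prior_guidance_scale=4.0,         # documented default
    decoder_guidance_scale=4.0,       # documented default
    generator=generator,
    output_type="pil",
    return_dict=True,
)
output.images[0].save("corgi.png")
```

With `return_dict=False`, the pipeline would instead return a plain tuple whose first element is the list of generated images.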