Skip to content

Commit 865ba10

Browse files
authored
[Qwen-Image] adding validation for guidance_scale, true_cfg_scale and negative_prompt (huggingface#12223)
* up
1 parent 552c127 commit 865ba10

File tree

5 files changed

+180
-70
lines changed

5 files changed

+180
-70
lines changed

src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,7 @@ def __call__(
435435
width: Optional[int] = None,
436436
num_inference_steps: int = 50,
437437
sigmas: Optional[List[float]] = None,
438-
guidance_scale: float = 1.0,
438+
guidance_scale: Optional[float] = None,
439439
num_images_per_prompt: int = 1,
440440
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
441441
latents: Optional[torch.Tensor] = None,
@@ -462,7 +462,12 @@ def __call__(
462462
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
463463
not greater than `1`).
464464
true_cfg_scale (`float`, *optional*, defaults to 1.0):
465-
When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance.
465+
Guidance scale as defined in [Classifier-Free Diffusion
466+
Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of equation 2
467+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is enabled by
468+
setting `true_cfg_scale > 1` and providing a `negative_prompt`. A higher guidance scale encourages the model to
469+
generate images that are closely linked to the text `prompt`, usually at the expense of lower image
470+
quality.
466471
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
467472
The height in pixels of the generated image. This is set to 1024 by default for the best results.
468473
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -474,17 +479,16 @@ def __call__(
474479
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
475480
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
476481
will be used.
477-
guidance_scale (`float`, *optional*, defaults to 3.5):
478-
Guidance scale as defined in [Classifier-Free Diffusion
479-
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
480-
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
481-
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
482-
the text `prompt`, usually at the expense of lower image quality.
483-
484-
This parameter in the pipeline is there to support future guidance-distilled models when they come up.
485-
Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance,
486-
please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should
487-
enable classifier-free guidance computations.
482+
guidance_scale (`float`, *optional*, defaults to None):
483+
A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance
484+
where the guidance scale is applied during inference through noise prediction rescaling, guidance
485+
distilled models take the guidance scale directly as an input parameter during forward pass. Guidance
486+
scale is enabled by setting `guidance_scale > 1`. A higher guidance scale encourages the model to generate images
487+
that are closely linked to the text `prompt`, usually at the expense of lower image quality. This
488+
parameter in the pipeline is there to support future guidance-distilled models when they come up. It is
489+
ignored when not using guidance distilled models. To enable traditional classifier-free guidance,
490+
please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should
491+
enable classifier-free guidance computations).
488492
num_images_per_prompt (`int`, *optional*, defaults to 1):
489493
The number of images to generate per prompt.
490494
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -564,6 +568,16 @@ def __call__(
564568
has_neg_prompt = negative_prompt is not None or (
565569
negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
566570
)
571+
572+
if true_cfg_scale > 1 and not has_neg_prompt:
573+
logger.warning(
574+
f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided."
575+
)
576+
elif true_cfg_scale <= 1 and has_neg_prompt:
577+
logger.warning(
578+
" negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1"
579+
)
580+
567581
do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
568582
prompt_embeds, prompt_embeds_mask = self.encode_prompt(
569583
prompt=prompt,
@@ -618,10 +632,17 @@ def __call__(
618632
self._num_timesteps = len(timesteps)
619633

620634
# handle guidance
621-
if self.transformer.config.guidance_embeds:
635+
if self.transformer.config.guidance_embeds and guidance_scale is None:
636+
raise ValueError("guidance_scale is required for guidance-distilled model.")
637+
elif self.transformer.config.guidance_embeds:
622638
guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
623639
guidance = guidance.expand(latents.shape[0])
624-
else:
640+
elif not self.transformer.config.guidance_embeds and guidance_scale is not None:
641+
logger.warning(
642+
f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled."
643+
)
644+
guidance = None
645+
elif not self.transformer.config.guidance_embeds and guidance_scale is None:
625646
guidance = None
626647

627648
if self.attention_kwargs is None:

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -535,7 +535,7 @@ def __call__(
535535
width: Optional[int] = None,
536536
num_inference_steps: int = 50,
537537
sigmas: Optional[List[float]] = None,
538-
guidance_scale: float = 1.0,
538+
guidance_scale: Optional[float] = None,
539539
control_guidance_start: Union[float, List[float]] = 0.0,
540540
control_guidance_end: Union[float, List[float]] = 1.0,
541541
control_image: PipelineImageInput = None,
@@ -566,7 +566,12 @@ def __call__(
566566
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
567567
not greater than `1`).
568568
true_cfg_scale (`float`, *optional*, defaults to 1.0):
569-
When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance.
569+
Guidance scale as defined in [Classifier-Free Diffusion
570+
Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of equation 2
571+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is enabled by
572+
setting `true_cfg_scale > 1` and providing a `negative_prompt`. A higher guidance scale encourages the model to
573+
generate images that are closely linked to the text `prompt`, usually at the expense of lower image
574+
quality.
570575
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
571576
The height in pixels of the generated image. This is set to 1024 by default for the best results.
572577
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -578,12 +583,16 @@ def __call__(
578583
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
579584
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
580585
will be used.
581-
guidance_scale (`float`, *optional*, defaults to 3.5):
582-
Guidance scale as defined in [Classifier-Free Diffusion
583-
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
584-
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
585-
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
586-
the text `prompt`, usually at the expense of lower image quality.
586+
guidance_scale (`float`, *optional*, defaults to None):
587+
A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance
588+
where the guidance scale is applied during inference through noise prediction rescaling, guidance
589+
distilled models take the guidance scale directly as an input parameter during forward pass. Guidance
590+
scale is enabled by setting `guidance_scale > 1`. A higher guidance scale encourages the model to generate images
591+
that are closely linked to the text `prompt`, usually at the expense of lower image quality. This
592+
parameter in the pipeline is there to support future guidance-distilled models when they come up. It is
593+
ignored when not using guidance distilled models. To enable traditional classifier-free guidance,
594+
please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should
595+
enable classifier-free guidance computations).
587596
num_images_per_prompt (`int`, *optional*, defaults to 1):
588597
The number of images to generate per prompt.
589598
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -674,6 +683,16 @@ def __call__(
674683
has_neg_prompt = negative_prompt is not None or (
675684
negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
676685
)
686+
687+
if true_cfg_scale > 1 and not has_neg_prompt:
688+
logger.warning(
689+
f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided."
690+
)
691+
elif true_cfg_scale <= 1 and has_neg_prompt:
692+
logger.warning(
693+
" negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1"
694+
)
695+
677696
do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
678697
prompt_embeds, prompt_embeds_mask = self.encode_prompt(
679698
prompt=prompt,
@@ -822,10 +841,17 @@ def __call__(
822841
controlnet_keep.append(keeps[0] if isinstance(self.controlnet, QwenImageControlNetModel) else keeps)
823842

824843
# handle guidance
825-
if self.transformer.config.guidance_embeds:
844+
if self.transformer.config.guidance_embeds and guidance_scale is None:
845+
raise ValueError("guidance_scale is required for guidance-distilled model.")
846+
elif self.transformer.config.guidance_embeds:
826847
guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
827848
guidance = guidance.expand(latents.shape[0])
828-
else:
849+
elif not self.transformer.config.guidance_embeds and guidance_scale is not None:
850+
logger.warning(
851+
f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled."
852+
)
853+
guidance = None
854+
elif not self.transformer.config.guidance_embeds and guidance_scale is None:
829855
guidance = None
830856

831857
if self.attention_kwargs is None:

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,7 @@ def __call__(
532532
width: Optional[int] = None,
533533
num_inference_steps: int = 50,
534534
sigmas: Optional[List[float]] = None,
535-
guidance_scale: float = 1.0,
535+
guidance_scale: Optional[float] = None,
536536
num_images_per_prompt: int = 1,
537537
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
538538
latents: Optional[torch.Tensor] = None,
@@ -559,7 +559,12 @@ def __call__(
559559
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
560560
not greater than `1`).
561561
true_cfg_scale (`float`, *optional*, defaults to 1.0):
562-
When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance.
562+
Guidance scale as defined in [Classifier-Free
563+
Diffusion Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of
564+
equation 2 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is
565+
enabled by setting `true_cfg_scale > 1` and providing a `negative_prompt`. A higher guidance scale
566+
encourages the model to generate images that are closely linked to the text `prompt`, usually at the expense of
567+
lower image quality.
563568
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
564569
The height in pixels of the generated image. This is set to 1024 by default for the best results.
565570
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -571,17 +576,16 @@ def __call__(
571576
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
572577
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
573578
will be used.
574-
guidance_scale (`float`, *optional*, defaults to 3.5):
575-
Guidance scale as defined in [Classifier-Free Diffusion
576-
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
577-
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
578-
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
579-
the text `prompt`, usually at the expense of lower image quality.
580-
581-
This parameter in the pipeline is there to support future guidance-distilled models when they come up.
582-
Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance,
583-
please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should
584-
enable classifier-free guidance computations.
579+
guidance_scale (`float`, *optional*, defaults to None):
580+
A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance
581+
where the guidance scale is applied during inference through noise prediction rescaling, guidance
582+
distilled models take the guidance scale directly as an input parameter during forward pass. Guidance
583+
scale is enabled by setting `guidance_scale > 1`. A higher guidance scale encourages the model to generate images
584+
that are closely linked to the text `prompt`, usually at the expense of lower image quality. This
585+
parameter in the pipeline is there to support future guidance-distilled models when they come up. It is
586+
ignored when not using guidance distilled models. To enable traditional classifier-free guidance,
587+
please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should
588+
enable classifier-free guidance computations).
585589
num_images_per_prompt (`int`, *optional*, defaults to 1):
586590
The number of images to generate per prompt.
587591
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -672,6 +676,16 @@ def __call__(
672676
has_neg_prompt = negative_prompt is not None or (
673677
negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
674678
)
679+
680+
if true_cfg_scale > 1 and not has_neg_prompt:
681+
logger.warning(
682+
f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided."
683+
)
684+
elif true_cfg_scale <= 1 and has_neg_prompt:
685+
logger.warning(
686+
" negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1"
687+
)
688+
675689
do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
676690
prompt_embeds, prompt_embeds_mask = self.encode_prompt(
677691
image=prompt_image,
@@ -734,10 +748,17 @@ def __call__(
734748
self._num_timesteps = len(timesteps)
735749

736750
# handle guidance
737-
if self.transformer.config.guidance_embeds:
751+
if self.transformer.config.guidance_embeds and guidance_scale is None:
752+
raise ValueError("guidance_scale is required for guidance-distilled model.")
753+
elif self.transformer.config.guidance_embeds:
738754
guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
739755
guidance = guidance.expand(latents.shape[0])
740-
else:
756+
elif not self.transformer.config.guidance_embeds and guidance_scale is not None:
757+
logger.warning(
758+
f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled."
759+
)
760+
guidance = None
761+
elif not self.transformer.config.guidance_embeds and guidance_scale is None:
741762
guidance = None
742763

743764
if self.attention_kwargs is None:

0 commit comments

Comments
 (0)