
Commit c75e12d

Port over tweaked pipeline/scheduler
1 parent ec5449f commit c75e12d

2 files changed: +49 -50 lines

src/diffusers/pipelines/chroma/pipeline_chroma.py

Lines changed: 43 additions & 50 deletions
@@ -25,7 +25,6 @@
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
     USE_PEFT_BACKEND,
-    deprecate,
     is_torch_xla_available,
     logging,
     replace_example_docstring,
@@ -509,25 +508,13 @@ def enable_vae_slicing(self):
         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
         compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
         """
-        depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
-        deprecate(
-            "enable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
         self.vae.enable_slicing()

     def disable_vae_slicing(self):
         r"""
         Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
         computing decoding in one step.
         """
-        depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
-        deprecate(
-            "disable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
         self.vae.disable_slicing()

     def enable_vae_tiling(self):
@@ -536,25 +523,13 @@ def enable_vae_tiling(self):
         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
         processing larger images.
         """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
         self.vae.enable_tiling()

     def disable_vae_tiling(self):
         r"""
         Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
         computing decoding in one step.
         """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
         self.vae.disable_tiling()

     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents
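With the deprecation shims gone, these wrappers now delegate straight to the VAE, so either spelling works. A minimal usage sketch; the checkpoint id is illustrative, not taken from this commit:

import torch
from diffusers import ChromaPipeline

pipe = ChromaPipeline.from_pretrained("lodestones/Chroma", torch_dtype=torch.bfloat16)  # illustrative id

# The pipeline wrappers above simply forward to the VAE:
pipe.enable_vae_slicing()   # equivalent to pipe.vae.enable_slicing()
pipe.enable_vae_tiling()    # equivalent to pipe.vae.enable_tiling()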
@@ -688,11 +663,11 @@ def __call__(
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
             guidance_scale (`float`, *optional*, defaults to 3.5):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
-                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
-                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
-                the text `prompt`, usually at the expense of lower image quality.
+                Embedded guidance scale is enabled by setting `guidance_scale > 1`. A higher `guidance_scale`
+                encourages the model to generate images more closely aligned with `prompt`, at the expense of lower
+                image quality.
+
+                Guidance-distilled models approximate true classifier-free guidance for `guidance_scale > 1`. Refer
+                to the [paper](https://huggingface.co/papers/2210.03142) to learn more.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -701,7 +676,7 @@ def __call__(
             latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
+                tensor will be generated by sampling using the supplied random `generator`.
             prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
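An end-to-end sketch of the parameters documented above; the checkpoint id and prompt are illustrative:

import torch
from diffusers import ChromaPipeline

pipe = ChromaPipeline.from_pretrained("lodestones/Chroma", torch_dtype=torch.bfloat16).to("cuda")  # illustrative id
image = pipe(
    prompt="a watercolor fox in a snowy forest",
    guidance_scale=3.5,  # values > 1 enable guidance, per the docstring above
    generator=torch.Generator("cuda").manual_seed(0),  # fixes the sampled latents for reproducibility
).images[0]
image.save("chroma_sample.png")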
@@ -904,31 +879,49 @@ def __call__(
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)

-                noise_pred = self.transformer(
-                    hidden_states=latents,
-                    timestep=timestep / 1000,
-                    encoder_hidden_states=prompt_embeds,
-                    txt_ids=text_ids,
-                    img_ids=latent_image_ids,
-                    attention_mask=attention_mask,
-                    joint_attention_kwargs=self.joint_attention_kwargs,
-                    return_dict=False,
-                )[0]
-
                 if self.do_classifier_free_guidance:
-                    if negative_image_embeds is not None:
-                        self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
-                    neg_noise_pred = self.transformer(
+                    # Batch positive and negative prompts for a single transformer call
+                    batched_latents = torch.cat([latents, latents], dim=0)
+                    batched_timestep = torch.cat([timestep, timestep], dim=0)
+                    batched_encoder_hidden_states = torch.cat([prompt_embeds, negative_prompt_embeds], dim=0)
+
+                    # Handle attention masks
+                    if attention_mask is not None and negative_attention_mask is not None:
+                        batched_attention_mask = torch.cat([attention_mask, negative_attention_mask], dim=0)
+                    else:
+                        batched_attention_mask = None
+
+                    # Single transformer call with batched inputs. `txt_ids` and
+                    # `img_ids` are position ids shared by every sample in the batch,
+                    # so they are passed through unduplicated.
+                    batched_noise_pred = self.transformer(
+                        hidden_states=batched_latents,
+                        timestep=batched_timestep / 1000,
+                        encoder_hidden_states=batched_encoder_hidden_states,
+                        txt_ids=text_ids,
+                        img_ids=latent_image_ids,
+                        attention_mask=batched_attention_mask,
+                        joint_attention_kwargs=self.joint_attention_kwargs,
+                        return_dict=False,
+                    )[0]
+
+                    # Split the batched result back into positive and negative predictions
+                    noise_pred, neg_noise_pred = batched_noise_pred.chunk(2, dim=0)
+
+                    # Apply classifier-free guidance
+                    noise_pred = neg_noise_pred + guidance_scale * (noise_pred - neg_noise_pred)
+                else:
+                    # No guidance, single forward pass
+                    noise_pred = self.transformer(
                         hidden_states=latents,
                         timestep=timestep / 1000,
-                        encoder_hidden_states=negative_prompt_embeds,
-                        txt_ids=negative_text_ids,
+                        encoder_hidden_states=prompt_embeds,
+                        txt_ids=text_ids,
                         img_ids=latent_image_ids,
-                        attention_mask=negative_attention_mask,
+                        attention_mask=attention_mask,
                         joint_attention_kwargs=self.joint_attention_kwargs,
                         return_dict=False,
                     )[0]
-                    noise_pred = neg_noise_pred + guidance_scale * (noise_pred - neg_noise_pred)

                 # compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
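The rewritten branch trades the second sequential forward pass for one call on a doubled batch. A self-contained sketch of the same batch-then-chunk pattern, with `model` standing in for the transformer (all names here are illustrative, not the pipeline's API):

import torch

def cfg_step(model, latents, timestep, cond_embeds, uncond_embeds, guidance_scale):
    # Stack the conditional and unconditional branches along the batch dimension...
    latents_in = torch.cat([latents, latents], dim=0)
    timestep_in = torch.cat([timestep, timestep], dim=0)
    embeds_in = torch.cat([cond_embeds, uncond_embeds], dim=0)

    # ...so a single forward pass covers both branches.
    pred = model(latents_in, timestep_in, embeds_in)

    # Undo the stacking and recombine with the usual CFG formula.
    noise_pred, neg_noise_pred = pred.chunk(2, dim=0)
    return neg_noise_pred + guidance_scale * (noise_pred - neg_noise_pred)

The result matches two sequential calls; the gain is fewer kernel launches per step, paid for with roughly double the activation memory.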
@@ -971,4 +964,4 @@ def __call__(
         if not return_dict:
             return (image,)

-        return ChromaPipelineOutput(images=image)
+        return ChromaPipelineOutput(images=image)
\ No newline at end of file

src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py

Lines changed: 6 additions & 0 deletions
@@ -104,6 +104,7 @@ def __init__(
         use_beta_sigmas: Optional[bool] = False,
         time_shift_type: str = "exponential",
         stochastic_sampling: bool = False,
+        custom_sigmas: Optional[List[float]] = None,
     ):
         if self.config.use_beta_sigmas and not is_scipy_available():
             raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
@@ -132,6 +133,7 @@ def __init__(
         self.sigmas = sigmas.to("cpu")  # to avoid too much CPU/GPU communication
         self.sigma_min = self.sigmas[-1].item()
         self.sigma_max = self.sigmas[0].item()
+        self.custom_sigmas = custom_sigmas

     @property
     def shift(self):
@@ -343,6 +345,10 @@ def set_timesteps(
         else:
             sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])

+        if self.custom_sigmas is not None:
+            sigmas = torch.tensor(self.custom_sigmas, device=sigmas.device, dtype=torch.float32)
+            timesteps = sigmas[:-1] * self.config.num_train_timesteps
+
         self.timesteps = timesteps
         self.sigmas = sigmas
         self._step_index = None
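A usage sketch for the new hook. `custom_sigmas` is fork-specific (not upstream diffusers) and is assumed here to be a descending schedule ending at 0.0, since `set_timesteps` derives timesteps from all but the last entry:

from diffusers import FlowMatchEulerDiscreteScheduler

custom = [1.0, 0.75, 0.5, 0.25, 0.0]  # assumed: monotonically decreasing, ending at 0.0
scheduler = FlowMatchEulerDiscreteScheduler(custom_sigmas=custom)

# The override in set_timesteps replaces whatever the default schedule computes.
scheduler.set_timesteps(num_inference_steps=len(custom) - 1)
print(scheduler.sigmas)     # tensor([1.0000, 0.7500, 0.5000, 0.2500, 0.0000])
print(scheduler.timesteps)  # sigmas[:-1] * num_train_timesteps -> tensor([1000., 750., 500., 250.])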
