@@ -275,3 +275,185 @@ def pag_attn_processors(self):
             if proc.__class__ in (PAGCFGIdentitySelfAttnProcessor2_0, PAGIdentitySelfAttnProcessor2_0):
                 processors[name] = proc
         return processors
+
+
+class PixArtPAGMixin:
+    @staticmethod
+    def _check_input_pag_applied_layer(layer):
+        r"""
+        Check that each entry of `pag_applied_layers` is a valid block index (an int or a digit string).
+        """
+
+        # Check if the layer index is valid (should be an int or a string of digits)
+        if isinstance(layer, int):
+            return  # Valid layer index
+
+        if isinstance(layer, str):
+            if layer.isdigit():
+                return  # Valid layer index
+
+        # If it is not a valid layer index, raise a ValueError
+        raise ValueError(f"A PAG layer should be a block index: an int or a digit string like '3', got {layer}.")
+
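+    # Illustrative behaviour of the check above (example values only):
+    #   PixArtPAGMixin._check_input_pag_applied_layer(14)        -> returns (valid int index)
+    #   PixArtPAGMixin._check_input_pag_applied_layer("14")      -> returns (valid digit string)
+    #   PixArtPAGMixin._check_input_pag_applied_layer("attn_14") -> raises ValueError
+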
+    def _set_pag_attn_processor(self, pag_applied_layers, do_classifier_free_guidance):
+        r"""
+        Set the attention processor for the PAG layers.
+        """
+        if do_classifier_free_guidance:
+            pag_attn_proc = PAGCFGIdentitySelfAttnProcessor2_0()
+        else:
+            pag_attn_proc = PAGIdentitySelfAttnProcessor2_0()
+
+        def is_self_attn(module_name):
+            r"""
+            Check if the module is a self-attention module based on its name.
+            """
+            return (
+                "attn1" in module_name and len(module_name.split(".")) == 3
+            )  # include transformer_blocks.1.attn1, exclude transformer_blocks.18.attn1.to_q, transformer_blocks.1.attn1.add_q_proj, ...
+
+        def get_block_index(module_name):
+            r"""
+            Get the transformer block index from the module name, returned as a digit string.
+            """
+            # e.g. transformer_blocks.23.attn1 -> "23"
+            return module_name.split(".")[1]
+
+        for pag_layer_input in pag_applied_layers:
+            # For each PAG layer input, find the corresponding self-attention layers in the transformer model
+            target_modules = []
+
+            block_index = str(pag_layer_input)
+
+            for name, module in self.transformer.named_modules():
+                if is_self_attn(name) and get_block_index(name) == block_index:
+                    target_modules.append(module)
+
+            if len(target_modules) == 0:
+                raise ValueError(f"Cannot find PAG layer to set attention processor for: {pag_layer_input}")
+
+            for module in target_modules:
+                module.processor = pag_attn_proc
+
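+    # Sketch of how the matching above resolves a layer input (module names are
+    # illustrative): with pag_layer_input=14, "transformer_blocks.14.attn1" is
+    # selected, while "transformer_blocks.14.attn2" and
+    # "transformer_blocks.14.attn1.to_q" are skipped by is_self_attn.
+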
+    # Copied from diffusers.pipelines.pag.pag_utils.PAGMixin.set_pag_applied_layers
+    def set_pag_applied_layers(self, pag_applied_layers):
+        r"""
+        Set the self-attention layers to apply PAG. Raises a ValueError if the input is invalid.
+        """
+
+        if not isinstance(pag_applied_layers, list):
+            pag_applied_layers = [pag_applied_layers]
+
+        for pag_layer in pag_applied_layers:
+            self._check_input_pag_applied_layer(pag_layer)
+
+        self.pag_applied_layers = pag_applied_layers
+
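+    # Rough usage sketch, assuming a pipeline instance `pipe` that includes this
+    # mixin (the variable name and block indices are hypothetical):
+    #   pipe.set_pag_applied_layers([14, "20"])  # apply PAG on transformer blocks 14 and 20
+    #   pipe.set_pag_applied_layers(8)           # a single index is wrapped into a list
+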
+    # Copied from diffusers.pipelines.pag.pag_utils.PAGMixin._get_pag_scale
+    def _get_pag_scale(self, t):
+        r"""
+        Get the scale factor for the perturbed attention guidance at timestep `t`.
+        """
+
+        if self.do_pag_adaptive_scaling:
+            signal_scale = self.pag_scale - self.pag_adaptive_scale * (1000 - t)
+            if signal_scale < 0:
+                signal_scale = 0
+            return signal_scale
+        else:
+            return self.pag_scale
+
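+    # Worked example of the adaptive scaling above (hypothetical settings): with
+    # pag_scale=3.0 and pag_adaptive_scale=0.005, the scale at t=800 is
+    # 3.0 - 0.005 * (1000 - 800) = 2.0; at t=200 it would be -1.0 and is clamped to 0.
+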
+    # Copied from diffusers.pipelines.pag.pag_utils.PAGMixin._apply_perturbed_attention_guidance
+    def _apply_perturbed_attention_guidance(self, noise_pred, do_classifier_free_guidance, guidance_scale, t):
+        r"""
+        Apply perturbed attention guidance to the noise prediction.
+
+        Args:
+            noise_pred (torch.Tensor): The noise prediction tensor.
+            do_classifier_free_guidance (bool): Whether to apply classifier-free guidance.
+            guidance_scale (float): The scale factor for the guidance term.
+            t (int): The current time step.
+
+        Returns:
+            torch.Tensor: The updated noise prediction tensor after applying perturbed attention guidance.
+        """
+        pag_scale = self._get_pag_scale(t)
+        if do_classifier_free_guidance:
+            noise_pred_uncond, noise_pred_text, noise_pred_perturb = noise_pred.chunk(3)
+            noise_pred = (
+                noise_pred_uncond
+                + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                + pag_scale * (noise_pred_text - noise_pred_perturb)
+            )
+        else:
+            noise_pred_text, noise_pred_perturb = noise_pred.chunk(2)
+            noise_pred = noise_pred_text + pag_scale * (noise_pred_text - noise_pred_perturb)
+        return noise_pred
+
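+    # Shape walkthrough for the combination above (batch size B is hypothetical):
+    # with classifier-free guidance the incoming noise_pred has batch dimension
+    # 3 * B ordered [uncond, text, perturbed], so chunk(3) yields the three terms
+    # combined above; without it the batch is 2 * B ordered [text, perturbed].
+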
+    # Copied from diffusers.pipelines.pag.pag_utils.PAGMixin._prepare_perturbed_attention_guidance
+    def _prepare_perturbed_attention_guidance(self, cond, uncond, do_classifier_free_guidance):
+        """
+        Prepares the perturbed attention guidance for the PAG model.
+
+        Args:
+            cond (torch.Tensor): The conditional input tensor.
+            uncond (torch.Tensor): The unconditional input tensor.
+            do_classifier_free_guidance (bool): Flag indicating whether to perform classifier-free guidance.
+
+        Returns:
+            torch.Tensor: The prepared perturbed attention guidance tensor.
+        """
+
+        cond = torch.cat([cond] * 2, dim=0)
+
+        if do_classifier_free_guidance:
+            cond = torch.cat([uncond, cond], dim=0)
+        return cond
+
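+    # Illustrative shapes for the preparation above (sizes are examples only): a
+    # cond tensor of shape (B, ...) is duplicated to (2 * B, ...); with
+    # classifier-free guidance the uncond tensor is prepended, giving (3 * B, ...)
+    # in the order [uncond, cond, cond] that the chunk(3) call above expects.
+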
+    @property
+    # Copied from diffusers.pipelines.pag.pag_utils.PAGMixin.pag_scale
+    def pag_scale(self):
+        """
+        Get the scale factor for the perturbed attention guidance.
+        """
+        return self._pag_scale
+
+    @property
+    # Copied from diffusers.pipelines.pag.pag_utils.PAGMixin.pag_adaptive_scale
+    def pag_adaptive_scale(self):
+        """
+        Get the adaptive scale factor for the perturbed attention guidance.
+        """
+        return self._pag_adaptive_scale
+
+    @property
+    # Copied from diffusers.pipelines.pag.pag_utils.PAGMixin.do_pag_adaptive_scaling
+    def do_pag_adaptive_scaling(self):
+        """
+        Check if the adaptive scaling is enabled for the perturbed attention guidance.
+        """
+        return self._pag_adaptive_scale > 0 and self._pag_scale > 0 and len(self.pag_applied_layers) > 0
+
+    @property
+    # Copied from diffusers.pipelines.pag.pag_utils.PAGMixin.do_perturbed_attention_guidance
+    def do_perturbed_attention_guidance(self):
+        """
+        Check if the perturbed attention guidance is enabled.
+        """
+        return self._pag_scale > 0 and len(self.pag_applied_layers) > 0
+
+    @property
+    # Copied from diffusers.pipelines.pag.pag_utils.PAGMixin.pag_attn_processors with unet->transformer
+    def pag_attn_processors(self):
+        r"""
+        Returns:
+            `dict` of PAG attention processors: A dictionary containing all PAG attention processors used in the
+            model, keyed by the name of the layer.
+        """
+
+        processors = {}
+        for name, proc in self.transformer.attn_processors.items():
+            if proc.__class__ in (PAGCFGIdentitySelfAttnProcessor2_0, PAGIdentitySelfAttnProcessor2_0):
+                processors[name] = proc
+        return processors