T2I-Adapter diffusers implementation

huo-ju · huo-ju · commit 47d3406d64a2 · 2023-02-20T07:34:25.000Z
original: https://github.com/TencentARC/T2I-Adapter
diff --git a/scripts/t2i_adapter_tester.py b/scripts/t2i_adapter_tester.py
@@ -0,0 +1,86 @@
+import cv2
+import argparse
+#import importlib
+import random
+import copy
+import torch
+import sys
+import os
+root_path = os.getcwd()
+print(root_path )
+sys.path.append(f"{root_path}/src")
+import diffusers
+from PIL import Image
+from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
+from extra.t2iadapter.adapter import Adapter
+from basicsr.utils import img2tensor, tensor2img, scandir, get_time_str, get_root_logger, get_env_info
+
+diffusers.utils.logging.disable_progress_bar()
+
+class DummySafetyChecker:
+    """Stand-in safety checker that never flags an image."""
+
+    def safety_checker(self, images, *args, **kwargs):
+        """Return ``images`` untouched together with one ``False`` per image.
+
+        Any extra positional or keyword arguments are accepted and ignored.
+        """
+        flags = [False] * len(images)
+        return images, flags
+
+def loadmodel(pipeline_name, model_path, **kwargs):
+    """Load a ``StableDiffusionPipeline`` in float16 and move it to CUDA.
+
+    The pipeline class is always ``StableDiffusionPipeline``; ``pipeline_name``
+    only decides whether the ``"fp16"`` revision is requested. The safety
+    checker is replaced by ``DummySafetyChecker().safety_checker``.
+
+    Args:
+        pipeline_name: Name of the pipeline; compared against
+            ``"StableDiffusionPipeline"``.
+        model_path: Passed as the first argument to ``from_pretrained``.
+        **kwargs: Extra ``from_pretrained`` arguments. ``torch_dtype``,
+            ``safety_checker`` and (for ``"StableDiffusionPipeline"``)
+            ``revision`` are overwritten.
+
+    Returns:
+        The result of calling ``.to("cuda")`` on the loaded pipeline.
+    """
+    print("load pipeline")
+    print("load model from:", pipeline_name, model_path)
+
+    forced = {"torch_dtype": torch.float16}
+    if pipeline_name == "StableDiffusionPipeline":
+        forced["revision"] = "fp16"
+    forced["safety_checker"] = DummySafetyChecker().safety_checker
+    kwargs.update(forced)
+
+    return StableDiffusionPipeline.from_pretrained(model_path, **kwargs).to("cuda")
+
+
+def generation(pipe, prompt, seed, features_adapter=None,
+               features_adapter_strength=0.4, device="cuda"):
+    """Run ``pipe`` once at 512x512 for 50 steps and return its images.
+
+    Args:
+        pipe: Callable pipeline; it is invoked with keyword arguments only and
+            its result must expose an ``images`` attribute.
+        prompt: Text prompt forwarded as ``prompt``.
+        seed: Seed for the ``torch.Generator`` handed to the pipeline.
+        features_adapter: Adapter features forwarded unchanged (may be None).
+        features_adapter_strength: Forwarded unchanged to the pipeline;
+            defaults to the previously hard-coded 0.4.
+        device: Device on which the random generator is created; defaults to
+            the previously hard-coded ``"cuda"``.
+
+    Returns:
+        The ``images`` attribute of the pipeline output.
+    """
+    generator = torch.Generator(device=device).manual_seed(seed)
+    output = pipe(
+        prompt=prompt,
+        height=512,
+        width=512,
+        num_inference_steps=50,
+        generator=generator,
+        features_adapter=features_adapter,
+        features_adapter_strength=features_adapter_strength,
+    )
+    return output.images
+
+
+def main() -> int:
+    """Generate one image guided by a T2I-Adapter condition image.
+
+    Parses the command line, loads the adapter checkpoint, turns the condition
+    image into a binary 512x512 map, extracts adapter features from it and
+    runs the pipeline once with a fixed prompt and seed. The first resulting
+    image is saved as ``output.png`` (relative to the working directory).
+
+    Returns:
+        0 on success, so ``sys.exit(main())`` reports a clean exit status.
+
+    Raises:
+        FileNotFoundError: If the condition image cannot be read.
+    """
+    parser = argparse.ArgumentParser(description="auto aiart generator")
+    parser.add_argument(
+        "-p", "--pipeline", help="Diffusers pipeline name", required=True
+    )
+    parser.add_argument("-m", "--model_path", help="model path", required=True)
+    parser.add_argument("-ad", "--ckpt_ad", help="path to checkpoint of adapter", required=True)
+    parser.add_argument("-cond", "--path_cond", help="path to adapter condition", required=True)
+    args = parser.parse_args()
+
+    device = "cuda"
+    model_ad = (
+        Adapter(channels=[320, 640, 1280, 1280], nums_rb=2, ksize=1, sk=True, use_conv=False)
+        .to(device)
+        .half()
+    )
+    # map_location keeps loading independent of the device the checkpoint
+    # was saved from.
+    model_ad.load_state_dict(torch.load(args.ckpt_ad, map_location=device))
+
+    edge = cv2.imread(args.path_cond)
+    if edge is None:
+        # cv2.imread signals an unreadable/missing file by returning None
+        # instead of raising; fail here with a clear message.
+        raise FileNotFoundError(f"could not read condition image: {args.path_cond}")
+    edge = cv2.resize(edge, (512, 512))
+    # [0] keeps the first slice along dim 0 of the img2tensor result
+    # (presumably a single colour channel - confirm against basicsr), then two
+    # leading singleton dims are added and values are scaled by 1/255.
+    edge = img2tensor(edge)[0].unsqueeze(0).unsqueeze(0) / 255.0
+    # Binarise at 0.5 and match the half-precision adapter weights.
+    edge = (edge > 0.5).float().half()
+
+    # Inference only: do not record an autograd graph for the adapter pass.
+    with torch.no_grad():
+        features_adapter = model_ad(edge.to(device))
+
+    pipe = loadmodel(args.pipeline, args.model_path)
+
+    if args.pipeline == "StableDiffusionPipeline":
+        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+
+    prompt = "A car with flying wings"
+    outputimg = generation(pipe, prompt, 52, features_adapter)
+    filename = "output.png"
+    outputimg[0].save(filename)
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())  # pass main()'s return value to sys.exit as the exit status
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
@@ -808,7 +808,13 @@ def __init__(
         self.gradient_checkpointing = False
 
     def forward(
-        self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
+        self,
+        hidden_states,
+        temb=None,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        cross_attention_kwargs=None,
+        features_adapter=None,
     ):
         # TODO(Patrick, William) - attention mask is not used
         output_states = ()
@@ -842,6 +848,9 @@ def custom_forward(*inputs):
 
             output_states += (hidden_states,)
 
+        if features_adapter is not None:
+            hidden_states = hidden_states + features_adapter
+
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
                 hidden_states = downsampler(hidden_states)
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
@@ -492,6 +492,7 @@ def forward(
         timestep_cond: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        features_adapter: list = None,
         return_dict: bool = True,
     ) -> Union[UNet2DConditionOutput, Tuple]:
         r"""
@@ -574,21 +575,30 @@ def forward(
         sample = self.conv_in(sample)
 
         # 3. down
+
         down_block_res_samples = (sample,)
+        feature_idx = 0
         for downsample_block in self.down_blocks:
             if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                fa = None
+                if features_adapter is not None:
+                    fa = features_adapter[feature_idx]
                 sample, res_samples = downsample_block(
                     hidden_states=sample,
                     temb=emb,
                     encoder_hidden_states=encoder_hidden_states,
                     attention_mask=attention_mask,
                     cross_attention_kwargs=cross_attention_kwargs,
+                    features_adapter=fa,
                 )
+                feature_idx = feature_idx + 1
             else:
                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
-
             down_block_res_samples += res_samples
 
+        if features_adapter is not None:
+            sample = sample + features_adapter[feature_idx]
+
         # 4. mid
         if self.mid_block is not None:
             sample = self.mid_block(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -500,6 +500,8 @@ def __call__(
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.FloatTensor] = None,
+        features_adapter: list = None,
+        features_adapter_strength: float = 0.4,
         prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
@@ -638,13 +640,23 @@ def __call__(
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
                 # predict the noise residual
+
+                input_features_adapter = None
+                #num_inference_steps
+                if features_adapter_strength > 1:
+                    features_adapter_strength = 1
+                elif features_adapter_strength < 0:
+                    features_adapter_strength = 0
+                if i < int(num_inference_steps * features_adapter_strength):
+                    input_features_adapter = features_adapter
+
                 noise_pred = self.unet(
                     latent_model_input,
                     t,
                     encoder_hidden_states=prompt_embeds,
                     cross_attention_kwargs=cross_attention_kwargs,
+                    features_adapter=input_features_adapter
                 ).sample
 
                 # perform guidance
diff --git a/src/extra/t2iadapter/__init__.py b/src/extra/t2iadapter/__init__.py
diff --git a/src/extra/t2iadapter/adapter.py b/src/extra/t2iadapter/adapter.py
@@ -0,0 +1,123 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+#from ldm.modules.attention import SpatialTransformer, BasicTransformerBlock
+
+def conv_nd(dims, *args, **kwargs):
+    """
+    Build an ``nn.Conv1d``, ``nn.Conv2d`` or ``nn.Conv3d`` module.
+    :param dims: 1, 2 or 3; selects which convolution class is instantiated.
+    :param args: positional arguments forwarded to the selected class.
+    :param kwargs: keyword arguments forwarded to the selected class.
+    :raises ValueError: if ``dims`` is none of 1, 2 or 3.
+    """
+    for rank, conv_cls in ((1, nn.Conv1d), (2, nn.Conv2d), (3, nn.Conv3d)):
+        if dims == rank:
+            return conv_cls(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+
+def avg_pool_nd(dims, *args, **kwargs):
+    """
+    Build an ``nn.AvgPool1d``, ``nn.AvgPool2d`` or ``nn.AvgPool3d`` module.
+    :param dims: 1, 2 or 3; selects which pooling class is instantiated.
+    :param args: positional arguments forwarded to the selected class.
+    :param kwargs: keyword arguments forwarded to the selected class.
+    :raises ValueError: if ``dims`` is none of 1, 2 or 3.
+    """
+    for rank, pool_cls in ((1, nn.AvgPool1d), (2, nn.AvgPool2d), (3, nn.AvgPool3d)):
+        if dims == rank:
+            return pool_cls(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+
+class Downsample(nn.Module):
+    """
+    Stride-2 downsampling, done either by a 3-wide convolution or by average
+    pooling.
+    :param channels: number of channels expected at dim 1 of the input.
+    :param use_conv: if true use a strided convolution, otherwise average
+                     pooling.
+    :param dims: 1, 2 or 3. For 3 the stride is ``(1, 2, 2)``, so the first of
+                 the three trailing dimensions is not reduced.
+    :param out_channels: output channels of the convolution; falls back to
+                         ``channels`` when falsy. Must equal ``channels``
+                         when pooling is used.
+    :param padding: padding given to the convolution; unused when pooling.
+    """
+
+    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+
+        if dims != 3:
+            step = 2
+        else:
+            step = (1, 2, 2)
+
+        if not use_conv:
+            # Pooling cannot change the channel count.
+            assert self.channels == self.out_channels
+            self.op = avg_pool_nd(dims, kernel_size=step, stride=step)
+            return
+
+        self.op = conv_nd(
+            dims, self.channels, self.out_channels, 3, stride=step, padding=padding
+        )
+
+    def forward(self, x):
+        """Apply the downsampling op after checking the channel count of ``x``."""
+        assert x.shape[1] == self.channels
+        return self.op(x)
+
+
+class ResnetBlock(nn.Module):
+    """Residual block: optional downsample, optional input conv, two convs.
+
+    Args:
+        in_c: Channel count of the incoming tensor.
+        out_c: Channel count of the returned tensor.
+        down: If true, the input is first passed through ``Downsample``.
+        ksize: Kernel size of the input conv, second conv and skip conv
+            (padding is ``ksize // 2``).
+        sk: If true, the residual uses an identity skip and the input conv is
+            only created when ``in_c != out_c``. If false, an input conv and a
+            convolutional skip are always created.
+        use_conv: Forwarded to ``Downsample``.
+    """
+
+    def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True):
+        super().__init__()
+        ps = ksize // 2
+        if in_c != out_c or not sk:
+            self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps)
+        else:
+            self.in_conv = None
+        self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1)
+        self.act = nn.ReLU()
+        self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps)
+        if not sk:
+            # The skip conv is applied to the output of in_conv (always
+            # present when sk is false), which already has out_c channels.
+            # Building it with in_c inputs made forward() fail whenever
+            # in_c != out_c; shapes are unchanged when in_c == out_c.
+            self.skep = nn.Conv2d(out_c, out_c, ksize, 1, ps)
+        else:
+            self.skep = None
+
+        self.down = down
+        if self.down:
+            self.down_opt = Downsample(in_c, use_conv=use_conv)
+
+    def forward(self, x):
+        """Return ``block2(relu(block1(x))) + skip(x)`` after the optional
+        downsample and input conv have been applied to ``x``."""
+        if self.down:
+            x = self.down_opt(x)
+        if self.in_conv is not None:
+            x = self.in_conv(x)
+
+        h = self.block1(x)
+        h = self.act(h)
+        h = self.block2(h)
+        if self.skep is not None:
+            return h + self.skep(x)
+        return h + x
+
+class Adapter(nn.Module):
+    """Stack of ``ResnetBlock`` stages producing one feature map per stage.
+
+    Args:
+        channels: Output channel count of each stage, in order.
+        nums_rb: Number of ``ResnetBlock`` modules per stage.
+        cin: Input channels of the first convolution, i.e. the channel count
+            after ``nn.PixelUnshuffle(8)`` (presumably 64 x the channels of
+            the condition image - confirm against the caller).
+        ksize: Forwarded to every ``ResnetBlock``.
+        sk: Forwarded to every ``ResnetBlock``.
+        use_conv: Forwarded to every ``ResnetBlock``.
+    """
+
+    def __init__(self, channels=(320, 640, 1280, 1280), nums_rb=3, cin=64, ksize=3, sk=False, use_conv=True):
+        super().__init__()
+        self.unshuffle = nn.PixelUnshuffle(8)
+        # Store a private copy so instances never share (or mutate) the
+        # default or the caller's sequence.
+        self.channels = list(channels)
+        self.nums_rb = nums_rb
+
+        blocks = []
+        for i, out_c in enumerate(self.channels):
+            for j in range(nums_rb):
+                # The first block of every stage but the first downsamples
+                # and changes the channel count from the previous stage.
+                stage_entry = i != 0 and j == 0
+                in_c = self.channels[i - 1] if stage_entry else out_c
+                blocks.append(
+                    ResnetBlock(in_c, out_c, down=stage_entry, ksize=ksize, sk=sk, use_conv=use_conv)
+                )
+        self.body = nn.ModuleList(blocks)
+        self.conv_in = nn.Conv2d(cin, self.channels[0], 3, 1, 1)
+
+    def forward(self, x):
+        """Return a list with the output of each stage, in stage order."""
+        x = self.unshuffle(x)
+        x = self.conv_in(x)
+
+        features = []
+        for stage in range(len(self.channels)):
+            first = stage * self.nums_rb
+            for idx in range(first, first + self.nums_rb):
+                x = self.body[idx](x)
+            features.append(x)
+
+        return features
diff --git a/src/extra/t2iadapter/modules.py b/src/extra/t2iadapter/modules.py