VersatileDiffusion: fix input processing (huggingface#1568)

LukasStruppek · anton-l · web-flow · commit 589330595dfa · 2022-12-12T13:45:27.000+01:00
* fix versatile diffusion input

* merge main

* `make fix-copies`

Co-authored-by: anton- <anton@huggingface.co>
diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -271,7 +271,8 @@ def check_inputs(self, image, height, width, callback_steps):
             and not isinstance(image, list)
         ):
             raise ValueError(
-                f"`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `list` but is {type(image)}"
+                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
             )
 
         if height % 8 != 0 or width % 8 != 0:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -240,7 +240,8 @@ def check_inputs(self, image, height, width, callback_steps):
             and not isinstance(image, list)
         ):
             raise ValueError(
-                f"`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `list` but is {type(image)}"
+                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
             )
 
         if height % 8 != 0 or width % 8 != 0:
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -134,6 +134,9 @@ def normalize_embeddings(encoder_output):
             embeds = embeds / torch.norm(embeds_pooled, dim=-1, keepdim=True)
             return embeds
 
+        if isinstance(prompt, torch.Tensor) and len(prompt.shape) == 4:
+            prompt = [p for p in prompt]
+
         batch_size = len(prompt) if isinstance(prompt, list) else 1
 
         # get prompt text embeddings
@@ -212,9 +215,17 @@ def prepare_extra_step_kwargs(self, generator, eta):
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs
     def check_inputs(self, image, height, width, callback_steps):
-        if not isinstance(image, PIL.Image.Image) and not isinstance(image, torch.Tensor):
-            raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}")
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
+            )
 
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

Original file line number	Diff line number	Diff line change
`@@ -271,7 +271,8 @@ def check_inputs(self, image, height, width, callback_steps):`
`271`	`271`	`and not isinstance(image, list)`
`272`	`272`	`):`
`273`	`273`	`raise ValueError(`
`274`		- f"`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `list` but is {type(image)}"
	`274`	+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
	`275`	+ `f" {type(image)}"`
`275`	`276`	`)`
`276`	`277`
`277`	`278`	`if height % 8 != 0 or width % 8 != 0:`
Original file line number	Diff line number	Diff line change
`@@ -240,7 +240,8 @@ def check_inputs(self, image, height, width, callback_steps):`
`240`	`240`	`and not isinstance(image, list)`
`241`	`241`	`):`
`242`	`242`	`raise ValueError(`
`243`		- f"`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `list` but is {type(image)}"
	`243`	+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
	`244`	+ `f" {type(image)}"`
`244`	`245`	`)`
`245`	`246`
`246`	`247`	`if height % 8 != 0 or width % 8 != 0:`