Description
Describe the bug
padding_mask_crop works with no ControlNet and with a single ControlNet, but as soon as multiple ControlNets are used the library raises: ValueError: The image should be a PIL image when inpainting mask crop, but is of type <class 'list'>. I have double-checked that my other inputs are as required.
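From the traceback in the Logs section below, __call__ appears to pass control_image into the image slot of check_inputs, so with multiple ControlNets the padding_mask_crop check sees a list of conditioning images instead of a single PIL image. A minimal sketch of how that check then trips, reduced from the traceback (not the actual library code path):

# Sketch of the failing check, reduced from the traceback below.
# With a multi-ControlNet setup, control_image is a list of PIL images, but the
# padding_mask_crop branch of check_inputs expects a single PIL image in this slot.
import PIL.Image

control_image = [PIL.Image.new("RGB", (64, 64))] * 2  # one conditioning image per ControlNet
padding_mask_crop = 60

if padding_mask_crop is not None:
    if not isinstance(control_image, PIL.Image.Image):
        raise ValueError(
            f"The image should be a PIL image when inpainting mask crop, but is of type {type(control_image)}."
        )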
Reproduction
from diffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel, AutoencoderKL, UniPCMultistepScheduler
import torch
from diffusers.utils import load_image, make_image_grid
from PIL import Image
import cv2
import numpy as np
from torchvision import transforms

# Convert an input image to a resized RGB PIL image (drops any alpha channel)
def preprocess_image(image, resize=(1024, 1024)):
    # Transformation: resize, convert to tensor, keep RGB channels only, convert back to PIL
    preprocess = transforms.Compose([
        transforms.Resize(resize),                 # Resize image to the model's expected input size
        transforms.ToTensor(),                     # Convert image to PyTorch tensor
        transforms.Lambda(lambda x: x[:3, :, :]),  # Get rid of the extra alpha channel (RGBA -> RGB)
        # transforms.Normalize([0.5], [0.5]),      # Normalize to [-1, 1] range
        transforms.ToPILImage(),
    ])
    output_image = preprocess(image)
    return output_image

init_image = preprocess_image(Image.open(r"./inpainting_img/Inpainting01_InputImage.png"))

# Build a Canny edge map to use as the ControlNet conditioning image
def process_controlnet_image(image):
    image = np.array(image)
    low_threshold = 100
    high_threshold = 200
    image = cv2.Canny(image, low_threshold, high_threshold)
    image = image[:, :, None]
    image = np.concatenate([image, image, image], axis=2)
    canny_image = Image.fromarray(image)
    return canny_image

control_image = preprocess_image(Image.open(r"inpainting_img\Inpainting01_InputImageControlnet.png"))
process_controlnet_image(control_image).show()

controlnet1 = ControlNetModel.from_pretrained(r"controlnet\controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True, local_files_only=True)
controlnet2 = ControlNetModel.from_pretrained(r"controlnet\controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16", local_files_only=True)
controlnets = [controlnet1, controlnet2]

model = "hugginface_epicrealism"
pipe = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
    model, controlnet=controlnets, torch_dtype=torch.float16, use_safetensors=True
).to("cuda")
pipe.enable_model_cpu_offload()

generator = torch.Generator("cuda").manual_seed(31)
prompt = "luxurious mixed use development in the day"

init_image = preprocess_image(Image.open(r"./inpainting_img/Inpainting01_InputImage.png"))
control_image = process_controlnet_image(preprocess_image(Image.open(r"inpainting_img\Inpainting01_InputImageControlnet.png")))
control_images = [control_image, control_image]
mask_image = preprocess_image(Image.open(r"./inpainting_img/Inpainting01_InputImageMask.jpg"))

images = pipe(
    prompt,
    image=init_image,
    mask_image=mask_image,
    guidance_scale=0.7,
    control_image=control_images,
    num_inference_steps=30,
    num_images_per_prompt=2,
    padding_mask_crop=60,
    generator=generator,
    cross_attention_kwargs={"scale": 0.8},
    device="cuda",
    controlnet_conditioning_scale=[0.4, 0.4],
    control_guidance_start=[0.2, 0.2],
    control_guidance_end=[0.8, 0.8],
).images
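As a stopgap, a rough workaround sketch I am considering in the meantime (my own assumption, not a diffusers API): emulate padding_mask_crop by cropping around the mask with PIL, call the pipeline on the cropped inputs without padding_mask_crop, and paste the result back. crop_around_mask, the padding value, and the paste step below are hypothetical.

# Hypothetical workaround: do the mask crop manually so the pipeline's
# padding_mask_crop check never has to handle a list of control images.
def crop_around_mask(image, mask, padding=60):
    # Bounding box of the non-black mask region, expanded by `padding` pixels
    left, top, right, bottom = mask.convert("L").getbbox()
    left, top = max(left - padding, 0), max(top - padding, 0)
    right, bottom = min(right + padding, mask.width), min(bottom + padding, mask.height)
    box = (left, top, right, bottom)
    return image.crop(box), mask.crop(box), box

cropped_init, cropped_mask, box = crop_around_mask(init_image, mask_image, padding=60)
cropped_controls = [c.crop(box) for c in control_images]

result = pipe(
    prompt,
    image=cropped_init,
    mask_image=cropped_mask,
    control_image=cropped_controls,
    guidance_scale=0.7,
    num_inference_steps=30,
    generator=generator,
    controlnet_conditioning_scale=[0.4, 0.4],
    control_guidance_start=[0.2, 0.2],
    control_guidance_end=[0.8, 0.8],
    # no padding_mask_crop here, so check_inputs accepts the list of control images
).images[0]

# Resize the generated crop back to the crop-box size and paste it into the original
final = init_image.copy()
final.paste(result.resize((box[2] - box[0], box[3] - box[1])), box)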
Logs
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[21], line 6
4 control_images = [control_image, control_image]
5 mask_image = preprocess_image(Image.open(r"./inpainting_img/Inpainting01_InputImageMask.jpg"))
----> 6 images = pipe(
7 prompt,
8 image=init_image,
9 mask_image = mask_image,
10 guidance_scale = 0.7,
11 control_image=control_images,
12 num_inference_steps=30,
13 num_images_per_prompt = 2,
14 padding_mask_crop = 60,
15 generator=generator,
16 cross_attention_kwargs={"scale":0.8}, device="cuda",
17 controlnet_conditioning_scale=[0.4,0.4],
18 control_guidance_start = [0.2,0.2],
19 control_guidance_end = [0.8,0.8]
20
21 ).images
File c:\Users\SEED360\Desktop\Stable_Diffusion\diffusers\.venv\lib\site-packages\torch\utils\_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File c:\Users\SEED360\Desktop\Stable_Diffusion\diffusers\.venv\lib\site-packages\diffusers\pipelines\controlnet\pipeline_controlnet_inpaint_sd_xl.py:1418, in StableDiffusionXLControlNetInpaintPipeline.__call__(self, prompt, prompt_2, image, mask_image, control_image, height, width, padding_mask_crop, strength, num_inference_steps, denoising_start, denoising_end, guidance_scale, negative_prompt, negative_prompt_2, num_images_per_prompt, eta, generator, latents, prompt_embeds, negative_prompt_embeds, ip_adapter_image, ip_adapter_image_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, output_type, return_dict, cross_attention_kwargs, controlnet_conditioning_scale, guess_mode, control_guidance_start, control_guidance_end, guidance_rescale, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, clip_skip, callback_on_step_end, callback_on_step_end_tensor_inputs, **kwargs)
1412 control_guidance_start, control_guidance_end = (
1413 mult * [control_guidance_start],
1414 mult * [control_guidance_end],
1415 )
1417 # 1. Check inputs
-> 1418 self.check_inputs(
1419 prompt,
1420 prompt_2,
1421 control_image,
1422 mask_image,
1423 strength,
1424 num_inference_steps,
1425 callback_steps,
1426 output_type,
1427 negative_prompt,
1428 negative_prompt_2,
1429 prompt_embeds,
1430 negative_prompt_embeds,
1431 ip_adapter_image,
1432 ip_adapter_image_embeds,
1433 pooled_prompt_embeds,
1434 negative_pooled_prompt_embeds,
1435 controlnet_conditioning_scale,
1436 control_guidance_start,
1437 control_guidance_end,
1438 callback_on_step_end_tensor_inputs,
1439 padding_mask_crop,
1440 )
1442 self._guidance_scale = guidance_scale
1443 self._clip_skip = clip_skip
File c:\Users\SEED360\Desktop\Stable_Diffusion\diffusers\.venv\lib\site-packages\diffusers\pipelines\controlnet\pipeline_controlnet_inpaint_sd_xl.py:731, in StableDiffusionXLControlNetInpaintPipeline.check_inputs(self, prompt, prompt_2, image, mask_image, strength, num_inference_steps, callback_steps, output_type, negative_prompt, negative_prompt_2, prompt_embeds, negative_prompt_embeds, ip_adapter_image, ip_adapter_image_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, controlnet_conditioning_scale, control_guidance_start, control_guidance_end, callback_on_step_end_tensor_inputs, padding_mask_crop)
729 if padding_mask_crop is not None:
730 if not isinstance(image, PIL.Image.Image):
--> 731 raise ValueError(
732 f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
733 )
734 if not isinstance(mask_image, PIL.Image.Image):
735 raise ValueError(
736 f"The mask image should be a PIL image when inpainting mask crop, but is of type"
737 f" {type(mask_image)}."
738 )
ValueError: The image should be a PIL image when inpainting mask crop, but is of type <class 'list'>.
System Info
- 🤗 Diffusers version: 0.32.1
- Platform: Windows-10-10.0.22631-SP0
- Running on Google Colab?: No
- Python version: 3.10.9
- PyTorch version (GPU?): 2.3.0+cu118 (True)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Huggingface_hub version: 0.27.0
- Transformers version: 4.44.2
- Accelerate version: 0.34.2
- PEFT version: 0.14.0
- Bitsandbytes version: not installed
- Safetensors version: 0.4.5
- xFormers version: not installed
- Accelerator: NVIDIA RTX A6000, 49140 MiB
- Using GPU in script?: yes
- Using distributed or parallel set-up in script?: no