@@ -1447,9 +1447,9 @@ def convert_open_clip_checkpoint(
     text_proj_key = prefix + "text_projection"

     if text_proj_key in checkpoint:
-        text_proj_dim = int(checkpoint[text_proj_key].shape[0])
-    elif hasattr(text_model.config, "projection_dim"):
-        text_proj_dim = text_model.config.projection_dim
+        text_proj_dim = int(checkpoint[text_proj_key].shape[1])
+    elif hasattr(text_model.config, "hidden_size"):
+        text_proj_dim = text_model.config.hidden_size
     else:
         text_proj_dim = LDM_OPEN_CLIP_TEXT_PROJECTION_DIM

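For context (not part of the diff): `text_proj_dim` is used later in `convert_open_clip_checkpoint` to split the fused OpenCLIP attention `in_proj_weight` into separate q/k/v projections, so it must equal the transformer's hidden width. For SD2.x the checkpoint's `text_projection` tensor is square (1024 x 1024), so `shape[0]` vs. `shape[1]` is moot there, but the config fallback matters: as the comment removed below notes, Hub configs for these models often carry `projection_dim=512`, while `hidden_size` is the reliable value. A minimal sketch of the q/k/v split (illustrative names and shapes, not the converter's exact code):

```python
import torch

# Hedged sketch: why text_proj_dim must be the hidden width.
# OpenCLIP stores attention q/k/v as one fused matrix; the converter
# slices it into the three per-projection weights expected by the
# transformers-style text model.
hidden_size = 1024  # SD2.x OpenCLIP text tower width
in_proj_weight = torch.randn(3 * hidden_size, hidden_size)  # fused q/k/v

q = in_proj_weight[:hidden_size, :]
k = in_proj_weight[hidden_size : hidden_size * 2, :]
v = in_proj_weight[hidden_size * 2 :, :]

# With projection_dim=512 from a bad Hub config, these slices would be
# misaligned; hidden_size yields three clean (1024, 1024) matrices.
assert q.shape == k.shape == v.shape == (hidden_size, hidden_size)
```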
@@ -1545,14 +1545,6 @@ def create_diffusers_clip_model_from_ldm(
         config["pretrained_model_name_or_path"] = clip_config
         subfolder = ""

-    if is_open_clip_model(checkpoint):
-        # infer projection_dim for the text_encoder using the checkpoint.
-        # should fix SD2.X LDM checkpoint loads from CivitAI and similar.
-        # The configuration on the hub is often (or always) incorrect for these models
-        # which need projection_dim=1024 and not projection_dim=512
-        if 'cond_stage_model.model.transformer.resblocks.0.mlp.c_proj.weight' in checkpoint:
-            config['projection_dim'] = checkpoint['cond_stage_model.model.transformer.resblocks.0.mlp.c_proj.weight'].shape[0]
-
     model_config = cls.config_class.from_pretrained(**config, subfolder=subfolder, local_files_only=local_files_only)
     ctx = init_empty_weights if is_accelerate_available() else nullcontext
     with ctx():
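With the `hidden_size` fallback introduced above, this ad-hoc `projection_dim` inference becomes redundant. For the record, the deleted block leaned on an OpenCLIP architectural invariant: in each transformer block, `mlp.c_proj` is `Linear(4 * width, width)`, so the first dimension of its weight equals the hidden width (1024 for SD2.x). A hedged sketch of that invariant:

```python
import torch.nn as nn

# Hedged sketch of the invariant the deleted block relied on: an OpenCLIP
# transformer block's MLP is c_fc: Linear(width, 4*width) followed by
# c_proj: Linear(4*width, width). nn.Linear stores weight as
# (out_features, in_features), so c_proj.weight.shape[0] == width.
width = 1024  # SD2.x OpenCLIP text tower
c_proj = nn.Linear(4 * width, width)
assert c_proj.weight.shape == (width, 4 * width)
assert c_proj.weight.shape[0] == width  # what the deleted code read as projection_dim
```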