pix2pix: fixes for 2-cfg

stduhpf · stduhpf · commit 402476574c0e · 2025-05-16T12:37:39.000+02:00
diff --git a/model.h b/model.h
@@ -12,9 +12,9 @@
 
 #include "ggml-backend.h"
 #include "ggml.h"
+#include "gguf.h"
 #include "json.hpp"
 #include "zip.h"
-#include "gguf.h"
 
 #define SD_MAX_DIMS 5
 
@@ -82,6 +82,12 @@ static inline bool sd_version_is_dit(SDVersion version) {
     return false;
 }
 
+// Returns true when `version` is VERSION_INSTRUCT_PIX2PIX or an inpaint
+// variant (as reported by sd_version_is_inpaint).
+static inline bool sd_version_use_concat(SDVersion version) {
+    return version == VERSION_INSTRUCT_PIX2PIX || sd_version_is_inpaint(version);
+}
+
 enum PMVersion {
     PM_VERSION_1,
     PM_VERSION_2,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -808,6 +808,10 @@ class StableDiffusionGGML {
         // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
 
         float img_cfg_scale = guidance;
+        if (img_cfg_scale != cfg_scale && !sd_version_use_concat(version)) {
+            LOG_WARN("2-conditioning CFG is not supported with this model, disabling it...");
+            img_cfg_scale = cfg_scale;
+        }
 
         LOG_DEBUG("Sample");
         struct ggml_init_params params;
@@ -830,9 +834,8 @@ class StableDiffusionGGML {
 
         struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);
 
-        bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
-        bool has_img_guidance  = version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != img_cfg_scale;
-        has_unconditioned      = has_unconditioned || has_img_guidance;
+        bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL;
+        bool has_img_guidance  = cfg_scale != img_cfg_scale && uncond.c_crossattn != NULL;
         bool has_skiplayer     = slg_scale != 0.0 && skip_layers.size() > 0;
 
         // denoise wrapper
@@ -989,9 +992,13 @@ class StableDiffusionGGML {
                         if (has_img_guidance) {
                             latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
                         } else {
+                            // img_cfg_scale == cfg_scale
                             latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
                         }
                     }
+                } else if(has_img_guidance){
+                    // img_cfg_scale == 1
+                    latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]);
                 }
                 if (is_skiplayer_step) {
                     latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;