
Commit 44ca43b

Update llama.cpp
1 parent c66b7d2 commit 44ca43b

1 file changed: +186 −0 lines changed

src/llama.cpp

@@ -205,6 +205,10 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
+    LLM_ARCH_FLUX,
+    LLM_ARCH_SD1,
+    LLM_ARCH_SDXL,
+    LLM_ARCH_SD3,
     LLM_ARCH_UNKNOWN,
 };

@@ -258,6 +262,10 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE,         "granite"      },
     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
     { LLM_ARCH_CHAMELEON,       "chameleon"    },
+    { LLM_ARCH_FLUX,            "flux"         },
+    { LLM_ARCH_SD1,             "sd1"          },
+    { LLM_ARCH_SDXL,            "sdxl"         },
+    { LLM_ARCH_SD3,             "sd3"          },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };

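Note: the strings in this map are the values expected in a GGUF file's general.architecture key. As a minimal sketch (not the upstream lookup helper; it assumes only the LLM_ARCH_NAMES map shown above), resolving that key to the new enum values works like this:

static llm_arch arch_from_string_sketch(const std::string & name) {
    // scan LLM_ARCH_NAMES: a file written with general.architecture = "flux"
    // resolves to LLM_ARCH_FLUX, "sdxl" to LLM_ARCH_SDXL, and so on
    for (const auto & kv : LLM_ARCH_NAMES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return LLM_ARCH_UNKNOWN;
}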
@@ -1531,6 +1539,10 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
+    { LLM_ARCH_FLUX, {}},
+    { LLM_ARCH_SD1,  {}},
+    { LLM_ARCH_SDXL, {}},
+    { LLM_ARCH_SD3,  {}},
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -5403,6 +5415,12 @@ static void llm_load_hparams(
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
+    // Disable LLM metadata for image models
+    if (model.arch == LLM_ARCH_FLUX || model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL || model.arch == LLM_ARCH_SD3) {
+        model.ftype = ml.ftype;
+        return;
+    }
+
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);

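The early return above means an image-model GGUF carries essentially no LLM metadata: general.architecture selects the code path, while general.name and general.file_type are only read if present. A hedged illustration of such a header, using the gguf API of this ggml version (this is not the converter actually used to produce these files):

// illustrative only: minimal metadata for a file that takes the image-model path
struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_str(ctx, "general.architecture", "flux"); // selects LLM_ARCH_FLUX
gguf_set_val_str(ctx, "general.name", "flux1-dev");    // optional display name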
@@ -18016,6 +18034,122 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
+static ggml_type img_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+    // Special function for quantizing image model tensors
+    const std::string name = ggml_get_name(tensor);
+    const llm_arch arch = qs.model.arch;
+
+    // Sanity check
+    if (
+            (name.find("model.diffusion_model.") != std::string::npos) ||
+            (name.find("first_stage_model.") != std::string::npos) ||
+            (name.find("single_transformer_blocks.") != std::string::npos)
+        ) {
+        throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model");
+    }
+
+    // Unsupported quant types - exclude all IQ quants for now
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ2_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M   ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ1_M   || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S   ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_M   || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_4 ||
+        ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_8 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_8_8) {
+        throw std::runtime_error("Invalid quantization type for image model (Not supported)");
+    }
+
+    if ( // Rules for to_v attention
+            (name.find("attn_v.weight") != std::string::npos) ||
+            (name.find(".to_v.weight") != std::string::npos)
+        ) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++qs.i_attention_wv;
+    } else if ( // Rules for fused qkv attention
+            (name.find("attn_qkv.weight") != std::string::npos) ||
+            (name.find("attn.qkv.weight") != std::string::npos)
+        ) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if ( // Rules for ffn
+            (name.find("ffn_down") != std::string::npos)
+        ) {
+        // TODO: add back `layer_info` with some model specific logic + logic further down
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) {
+            new_type = GGML_TYPE_Q4_1;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
+            new_type = GGML_TYPE_Q5_1;
+        }
+        ++qs.i_ffn_down;
+    }
+
+    // Sanity check for row shape
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        // TODO: Possibly reenable this in the future
+        // switch (new_type) {
+        //     case GGML_TYPE_Q2_K:
+        //     case GGML_TYPE_Q3_K:
+        //     case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+        //     case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+        //     case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+        //     default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        // }
+        new_type = GGML_TYPE_F16;
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
+    }
+    return new_type;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);

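A worked example of the row-shape fallback in img_tensor_get_type() above: K-quants pack each row into super-blocks of QK_K (256 in this build) values, so only rows whose length is a multiple of 256 can use a K-quant type. The sketch below restates that check; the tensor widths in the comment are illustrative, not taken from the diff.

// e.g. a 1280-wide projection passes (1280 % 256 == 0) and can stay Q4_K/Q5_K/Q6_K,
// while 320- or 640-wide rows (320 % 256 == 64) trigger the F16 fallback above
static bool row_is_k_quantizable_sketch(int64_t ne0) {
    return ne0 % QK_K == 0;
}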
@@ -18547,6 +18681,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ctx_outs[i_split] = gguf_init_empty();
         }
         gguf_add_tensor(ctx_outs[i_split], tensor);
+        // SD3 pos_embed needs special fix as first dim is 1, which gets truncated here
+        if (model.arch == LLM_ARCH_SD3) {
+            const std::string name = ggml_get_name(tensor);
+            if (name == "pos_embed" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "pos_embed", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting pos_embed shape for SD3: [key:%s]\n", __func__, tensor->name);
+            }
+        }
     }
 
// Set split info if needed
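Why the pos_embed fix above is needed, as a sketch: an SD3-style pos_embed with logical shape [1, N, C] is stored by ggml with the batch dimension last, and ggml_n_dims() counts dimensions only up to the last one greater than 1, so the tensor would otherwise be written to the output GGUF as 2-D. The shapes below are illustrative, and gguf_set_tensor_ndim() is a helper added in this fork rather than upstream gguf API.

// assumes a ggml_context created with no_alloc = true
static void pos_embed_ndim_sketch(struct ggml_context * ctx) {
    // logical [1, 36864, 1536] -> ggml ne = {1536, 36864, 1, 1}
    struct ggml_tensor * pe = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1536, 36864, 1);
    // reported as 2-D, hence the explicit override back to 3 dims when writing
    GGML_ASSERT(ggml_n_dims(pe) == 2);
}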
@@ -18647,6 +18790,45 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // rules for image models
+        bool image_model = false;
+        if (model.arch == LLM_ARCH_FLUX) {
+            image_model = true;
+            quantize &= name.find("txt_in.") == std::string::npos;
+            quantize &= name.find("img_in.") == std::string::npos;
+            quantize &= name.find("time_in.") == std::string::npos;
+            quantize &= name.find("vector_in.") == std::string::npos;
+            quantize &= name.find("guidance_in.") == std::string::npos;
+            quantize &= name.find("final_layer.") == std::string::npos;
+        }
+        if (model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL) {
+            image_model = true;
+            quantize &= name.find("class_embedding.") == std::string::npos;
+            quantize &= name.find("time_embedding.") == std::string::npos;
+            quantize &= name.find("add_embedding.") == std::string::npos;
+            quantize &= name.find("time_embed.") == std::string::npos;
+            quantize &= name.find("label_emb.") == std::string::npos;
+            quantize &= name.find("conv_in.") == std::string::npos;
+            quantize &= name.find("conv_out.") == std::string::npos;
+            quantize &= name != "input_blocks.0.0.weight";
+            quantize &= name != "out.2.weight";
+        }
+        if (model.arch == LLM_ARCH_SD3) {
+            image_model = true;
+            quantize &= name.find("final_layer.") == std::string::npos;
+            quantize &= name.find("time_text_embed.") == std::string::npos;
+            quantize &= name.find("context_embedder.") == std::string::npos;
+            quantize &= name.find("t_embedder.") == std::string::npos;
+            quantize &= name.find("y_embedder.") == std::string::npos;
+            quantize &= name.find("x_embedder.") == std::string::npos;
+            quantize &= name != "proj_out.weight";
+            quantize &= name != "pos_embed";
+        }
+        // ignore 3D/4D tensors for image models as the code was never meant to handle these
+        if (image_model) {
+            quantize &= ggml_n_dims(tensor) == 2;
+        }
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -18655,6 +18837,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = default_type;
 
             // get more optimal quantization type based on the tensor shape, layer, etc.
+            if (image_model) {
+                new_type = img_tensor_get_type(qs, new_type, tensor, ftype);
+            } else {
             if (!params->pure && ggml_is_quantized(default_type)) {
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
             }
@@ -18664,6 +18849,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                 new_type = params->output_tensor_type;
             }
+            }
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.

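End to end, the new path is reached through the regular quantization entry point. A hedged usage sketch follows: file names and the chosen ftype are illustrative, and the architecture is read from the input file's general.architecture key, which is what routes tensor-type selection to img_tensor_get_type() in the hunks above.

#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_S; // one of the supported (non-IQ) types
    // llama_model_quantize returns 0 on success
    return llama_model_quantize("flux1-dev-F16.gguf", "flux1-dev-Q4_K_S.gguf", &params) ? 1 : 0;
}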