Commit c9b8654

Update llama.cpp

1 parent 26167a2 commit c9b8654

File tree

1 file changed: +166 -0 lines changed

src/llama.cpp (+166)
@@ -203,6 +203,9 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
+    LLM_ARCH_FLUX,
+    LLM_ARCH_SD1,
+    LLM_ARCH_SDXL,
     LLM_ARCH_UNKNOWN,
 };
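
The same three-way architecture test recurs at every image-model code path later in this diff. A hypothetical helper, not part of the commit, would express the pattern in one place; the name `llm_arch_is_image` is an assumption for illustration:

// Hypothetical helper, not in the commit: the diff instead repeats
// this disjunction wherever image models need special handling.
static bool llm_arch_is_image(llm_arch arch) {
    return arch == LLM_ARCH_FLUX || arch == LLM_ARCH_SD1 || arch == LLM_ARCH_SDXL;
}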

@@ -256,6 +259,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE,         "granite"      },
     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
     { LLM_ARCH_CHAMELEON,       "chameleon"    },
+    { LLM_ARCH_FLUX,            "flux"         },
+    { LLM_ARCH_SD1,             "sd1"          },
+    { LLM_ARCH_SDXL,            "sdxl"         },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };

@@ -1529,6 +1535,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
+    { LLM_ARCH_FLUX, {}},
+    { LLM_ARCH_SD1, {}},
+    { LLM_ARCH_SDXL, {}},
     {
         LLM_ARCH_UNKNOWN,
         {

@@ -5413,6 +5422,12 @@ static void llm_load_hparams(
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
+    // Disable LLM metadata for image models
+    if (model.arch == LLM_ARCH_FLUX || model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL) {
+        model.ftype = ml.ftype;
+        return;
+    }
+
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
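
Because of this early return, an image-model GGUF is expected to carry only the architecture tag and file type, not the usual LLM hparams. A minimal sketch of inspecting that tag from outside, using the public gguf API (the file name is hypothetical; older trees declare these functions in ggml.h rather than a separate gguf header):

#include "ggml.h"   // gguf_* API; newer trees split this into gguf.h
#include <cstdio>

int main() {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file("flux-model.gguf", params); // hypothetical file
    if (!ctx) {
        return 1;
    }
    const auto kid = gguf_find_key(ctx, "general.architecture");
    if (kid >= 0) {
        // expected to print "flux", "sd1", or "sdxl" for the new architectures
        std::printf("arch = %s\n", gguf_get_val_str(ctx, kid));
    }
    gguf_free(ctx);
    return 0;
}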

@@ -18010,11 +18025,162 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
+static ggml_type img_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+    // Special function for quantizing image model tensors
+    const std::string name = ggml_get_name(tensor);
+    const llm_arch arch = qs.model.arch;
+
+    // Sanity check
+    if (
+        (name.find("model.diffusion_model.") != std::string::npos) ||
+        (name.find("first_stage_model.") != std::string::npos) ||
+        (name.find("single_transformer_blocks.") != std::string::npos)
+    ) {
+        throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model");
+    }
+
+    // Unsupported quant types - exclude all IQ quants for now
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ2_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ1_M   || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_M   || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_4 ||
+        ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_8 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_8_8) {
+        throw std::runtime_error("Invalid quantization type for image model (Not supported)");
+    }
+
+    if ( // Tensors to keep in FP32 precision
+        (arch == LLM_ARCH_FLUX) && (
+            (name.find("img_in.") != std::string::npos) ||
+            (name.find("time_in.in_layer.") != std::string::npos) ||
+            (name.find("vector_in.in_layer.") != std::string::npos) ||
+            (name.find("guidance_in.in_layer.") != std::string::npos) ||
+            (name.find("final_layer.linear.") != std::string::npos)
+        ) || (arch == LLM_ARCH_SD1 || arch == LLM_ARCH_SDXL) && (
+            (name.find("conv_in.") != std::string::npos) ||
+            (name.find("conv_out.") != std::string::npos) ||
+            (name == "input_blocks.0.0.weight") ||
+            (name == "out.2.weight")
+        )) {
+        new_type = GGML_TYPE_F32;
+    } else if ( // Tensors to keep in FP16 precision
+        (arch == LLM_ARCH_FLUX) && (
+            (name.find("txt_in.") != std::string::npos) ||
+            (name.find("time_in.") != std::string::npos) ||
+            (name.find("vector_in.") != std::string::npos) ||
+            (name.find("guidance_in.") != std::string::npos) ||
+            (name.find("final_layer.") != std::string::npos)
+        ) || (arch == LLM_ARCH_SD1 || arch == LLM_ARCH_SDXL) && (
+            (name.find("class_embedding.") != std::string::npos) ||
+            (name.find("time_embedding.") != std::string::npos) ||
+            (name.find("add_embedding.") != std::string::npos) ||
+            (name.find("time_embed.") != std::string::npos) ||
+            (name.find("label_emb.") != std::string::npos) ||
+            (name.find("proj_in.") != std::string::npos) ||
+            (name.find("proj_out.") != std::string::npos)
+            // (name.find("conv_shortcut.") != std::string::npos) // marginal improvement
+        )) {
+        new_type = GGML_TYPE_F16;
+    } else if ( // Rules for to_v attention
+        (name.find("attn_v.weight") != std::string::npos) ||
+        (name.find(".to_v.weight") != std::string::npos)
+    ) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++qs.i_attention_wv;
+    } else if ( // Rules for fused qkv attention
+        (name.find("attn_qkv.weight") != std::string::npos) ||
+        (name.find("attn.qkv.weight") != std::string::npos)
+    ) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if ( // Rules for ffn
+        (name.find("ffn_down") != std::string::npos) ||
+        (name.find("DenseReluDense.wo") != std::string::npos)
+    ) {
+        // TODO: add back `layer_info` with some model specific logic + logic further down
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) {
+            new_type = GGML_TYPE_Q4_1;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
+            new_type = GGML_TYPE_Q5_1;
+        }
+        ++qs.i_ffn_down;
+    }
+
+    // Sanity check for row shape
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        // TODO: Possibly reenable this in the future
+        // switch (new_type) {
+        //     case GGML_TYPE_Q2_K:
+        //     case GGML_TYPE_Q3_K:
+        //     case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+        //     case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+        //     case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+        //     default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        // }
+        new_type = GGML_TYPE_F16;
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
+    }
+    return new_type;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
     // TODO: avoid hardcoded tensor names - use the TN_* constants
     const llm_arch arch = qs.model.arch;
+    if (arch == LLM_ARCH_FLUX || arch == LLM_ARCH_SD1 || arch == LLM_ARCH_SDXL) { return img_tensor_get_type(qs, new_type, tensor, ftype); }
     const auto tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int n_layers) -> bool {
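
To make the row-shape fallback in `img_tensor_get_type` concrete, here is a small self-contained sketch. The row widths are hypothetical examples chosen to resemble UNET channel counts, and QK_K is assumed to be 256, the default k-quant super-block size in ggml:

// Standalone illustration of the k-quant row-width check above.
// A k-quant type such as Q4_K packs each row in blocks of QK_K values,
// so any row width not divisible by QK_K forces the F16 fallback.
#include <cstdio>

int main() {
    const int QK_K = 256;                          // assumed default ggml super-block size
    const int widths[] = { 320, 640, 1280, 4096 }; // hypothetical tensor row widths
    for (const int nx : widths) {
        std::printf("nx = %4d -> %s\n", nx,
                    nx % QK_K == 0 ? "k-quant ok" : "fallback to F16");
    }
    return 0;
}

Running this prints "fallback to F16" for 320 and 640 (typical SD1 channel counts, not multiples of 256) and "k-quant ok" for 1280 and 4096, which is exactly the split the warning-and-fallback branch above handles.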
