
mtmd : remove libllava, remove clip-quantize-cli (⚠️ breaking change) #13460


Merged: 2 commits, May 13, 2025
Changes from 1 commit
rm clip_model_quantize
ngxson committed May 11, 2025
commit 37954239e6d2298038ebd556d2e2474b6d51903f
135 changes: 0 additions & 135 deletions tools/mtmd/clip.cpp
@@ -3576,141 +3576,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
return true;
}

bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
assert(itype < GGML_TYPE_COUNT);
ggml_type type = static_cast<ggml_type>(itype);

auto * ctx_clip = clip_init(fname_inp, clip_context_params{
/* use_gpu */ false,
/* verbosity */ GGML_LOG_LEVEL_ERROR,
});

const auto & ctx_src = ctx_clip->ctx_gguf.get();
const auto & ctx_data = ctx_clip->ctx_data.get();

auto * ctx_out = gguf_init_empty();
gguf_set_kv(ctx_out, ctx_src);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", itype);

auto fout = std::ofstream(fname_out, std::ios::binary);

const int n_tensors = gguf_get_n_tensors(ctx_src);

for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx_src, i);
ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
gguf_add_tensor(ctx_out, cur);
}

const size_t meta_size = gguf_get_meta_size(ctx_out);
for (size_t i = 0; i < meta_size; ++i) {
fout.put(0);
}

// regexes of tensor names to be quantized
const std::vector<std::string> k_names = {
".*weight",
};

std::vector<uint8_t> work(512);
std::vector<float> conv_buf(512);
size_t total_size_org = 0;
size_t total_size_new = 0;

for (int i = 0; i < n_tensors; ++i) {
const std::string name = gguf_get_tensor_name(ctx_src, i);
ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());

enum ggml_type new_type;
void * new_data;
size_t new_size;

bool quantize = false;
for (const auto & s : k_names) {
if (std::regex_match(name, std::regex(s))) {
quantize = true;
break;
}
}

// quantize only 2D tensors and bigger than block size
quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);

if (quantize) {
new_type = type;
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
// LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
}
const size_t n_elms = ggml_nelements(cur);
float * f32_data;

switch (cur->type) {
case GGML_TYPE_F32:
f32_data = (float *)cur->data;
break;
case GGML_TYPE_F16:
if (conv_buf.size() < n_elms) {
conv_buf.resize(n_elms);
}
for (size_t j = 0; j < n_elms; ++j) {
conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
}
f32_data = (float *)conv_buf.data();
break;
default:
LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__);
gguf_free(ctx_out);
return false;
}

if (work.size() < n_elms * 4) {
work.resize(n_elms * 4);
}
new_data = work.data();

new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
} else {
new_type = cur->type;
new_data = cur->data;
new_size = ggml_nbytes(cur);
}
const size_t orig_size = ggml_nbytes(cur);
total_size_org += orig_size;
total_size_new += new_size;
gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
fout.write((const char *)new_data, new_size);
size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
for (size_t j = 0; j < pad; ++j) {
fout.put(0);
}

LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
}

// go back to beginning of file and write the updated metadata
fout.seekp(0, std::ios::beg);
std::vector<uint8_t> meta(meta_size);
gguf_get_meta_data(ctx_out, meta.data());
fout.write((const char *)meta.data(), meta_size);

fout.close();

clip_free(ctx_clip);
gguf_free(ctx_out);

{
LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
}

return true;
}

int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
switch (ctx->proj_type) {
case PROJECTOR_TYPE_LDP:
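For reference, the removed clip_model_quantize() selects which tensors to quantize by matching their names against a regex list, and pads each tensor's data to the GGUF alignment when writing it out. The following standalone sketch (not llama.cpp code; the tensor names, the 32-byte alignment, and the pad_to helper are illustrative assumptions) reproduces just those two pieces using only the standard library:

#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// Round x up to a multiple of n (n must be a power of two), same idea as GGML_PAD.
static size_t pad_to(size_t x, size_t n) {
    return (x + n - 1) & ~(n - 1);
}

int main() {
    // Tensor-name patterns selected for quantization, as in the removed function.
    const std::vector<std::string> k_names = { ".*weight" };
    // Hypothetical tensor names, for illustration only.
    const std::vector<std::string> tensors = { "v.blk.0.attn_q.weight", "v.blk.0.attn_q.bias" };

    for (const auto & name : tensors) {
        bool quantize = false;
        for (const auto & s : k_names) {
            if (std::regex_match(name, std::regex(s))) { quantize = true; break; }
        }
        std::printf("%s -> quantize=%d\n", name.c_str(), quantize);
    }

    // Pad each tensor's data to the file alignment, as done after fout.write() above.
    const size_t alignment = 32;   // hypothetical; the real value comes from gguf_get_alignment()
    const size_t new_size  = 1000; // hypothetical quantized tensor size in bytes
    const size_t pad       = pad_to(new_size, alignment) - new_size;
    std::printf("write %zu bytes of data, then %zu zero bytes of padding\n", new_size, pad);
    return 0;
}

In the removed code the same arithmetic appears as GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size, followed by a loop writing that many zero bytes.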