From 201cdafad71f4ad87e81dd3fe8ea8a79240f4435 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 12 May 2025 12:34:35 +0200 Subject: [PATCH] clip : cap max image size 1024 for qwen vl model --- tools/mtmd/clip.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 0adf03163fcc4..41ba45a79b5ab 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1909,16 +1909,20 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_QWEN2VL: { - // max image size = sqrt(max_pixels) - // https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json - hparams.image_size = 3584; + // max image size = sqrt(max_pixels) = 3584 + // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json + // however, the model uses an unreasonable amount of memory past size 1024, so we force it to 1024; otherwise it is unusable + // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 + hparams.image_size = 1024; hparams.warmup_image_size = hparams.patch_size * 8; } break; case PROJECTOR_TYPE_QWEN25VL: { // max image size = sqrt(max_pixels) // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json - hparams.image_size = 3584; + // however, the model uses an unreasonable amount of memory past size 1024, so we force it to 1024; otherwise it is unusable + // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 + hparams.image_size = 1024; hparams.warmup_image_size = hparams.patch_size * 8; get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern); } break;