diff --git a/common/arg.cpp b/common/arg.cpp
index b6bfe6f89bead..e0f4a15cc7784 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1570,7 +1570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
- ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+ ).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_VISION}));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
diff --git a/common/common.h b/common/common.h
index 1c0f199774976..88becc7f3181b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -80,6 +80,7 @@ enum llama_example {
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
+ LLAMA_EXAMPLE_VISION,
LLAMA_EXAMPLE_COUNT,
};
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index b5d95bd5639f3..a26df6d3eafe5 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -17,6 +17,7 @@
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
from itertools import chain
+from transformers import AutoConfig
import math
import numpy as np
import torch
@@ -66,6 +67,13 @@ class Model:
metadata_override: Path | None
dir_model_card: Path
+ # for vision model
+ vision_arch: gguf.MODEL_ARCH | None = None
+ preprocessor_config: dict[str, Any] | None = None
+ vparams: dict[str, Any] | None = None
+ v_tensor_map: gguf.TensorNameMap | None = None
+ v_tensor_names: set[str] | None
+
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
@@ -126,6 +134,16 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
return None
raise KeyError(f"could not find any of: {keys}")
+ def find_vparams(self, keys: Iterable[str], optional: bool = False) -> Any:
+ if self.vparams is None:
+ raise ValueError("vision model parameters not set")
+ key = next((k for k in keys if k in self.vparams), None)
+ if key is not None:
+ return self.vparams[key]
+ if optional:
+ return None
+ raise KeyError(f"(vision) could not find any of: {keys}")
+
def set_vocab(self):
self._set_vocab_gpt2()
@@ -186,9 +204,10 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
f"Missing tensors: {missing}\n"
f"Extra tensors: {extra}")
- def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
- if key not in gguf.MODEL_TENSORS[self.model_arch]:
- raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
+ def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight", is_vision = False) -> str:
+ arch = self.vision_arch if is_vision and self.vision_arch is not None else self.model_arch
+ if key not in gguf.MODEL_TENSORS[arch]:
+ raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {arch!r}")
name: str = gguf.TENSOR_NAMES[key]
if "{bid}" in name:
assert bid is not None
@@ -210,9 +229,13 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int |
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
- if new_name is None:
+ new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes) if self.v_tensor_map is not None else None
+ if new_name is not None:
+ return new_name
+ elif new_name_vision is not None:
+ return new_name_vision
+ else:
raise ValueError(f"Can not map tensor {name!r}")
- return new_name
def set_gguf_parameters(self):
self.gguf_writer.add_block_count(self.block_count)
@@ -257,6 +280,23 @@ def set_gguf_parameters(self):
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
+ # Vision model parameters
+ if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None:
+ self.gguf_writer.add_vision_type("vit")
+ self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
+ self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
+ self.gguf_writer.add_vision_vit_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
+ self.gguf_writer.add_vision_vit_block_count(self.vparams["num_hidden_layers"])
+ self.gguf_writer.add_vision_vit_embedding_length(self.vparams["hidden_size"])
+ self.gguf_writer.add_vision_vit_feed_forward_length(self.vparams["intermediate_size"])
+ self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"])
+ self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"])
+ self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"])
+ try:
+ self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
+ except KeyError:
+ self.gguf_writer.add_vision_vit_select_layer(0)
+
self.gguf_writer.add_file_type(self.ftype)
logger.info(f"gguf: file type = {self.ftype}")
@@ -466,7 +506,20 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
@staticmethod
def load_hparams(dir_model: Path):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
- return json.load(f)
+ hparams = json.load(f)
+ if "text_config" in hparams:
+ hparams = {**hparams["text_config"], **hparams}
+ return hparams
+
+ @staticmethod
+ def load_preprocessor_config(dir_model: Path):
+ # TODO: this varies vastly among models, need to handle more cases in the future
+ file_path = dir_model / "preprocessor_config.json"
+ if os.path.exists(file_path):
+ with open(file_path, "r", encoding="utf-8") as f:
+ return json.load(f)
+ else:
+ raise Exception(f"Preprocessor config not found at {file_path}")
@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -519,7 +572,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
toktypes: list[int] = []
from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+ # DEBIAN_FRONTEND=noninteractive indicates that the script is running in a non-interactive environment (e.g. CI), so we cannot answer Y/N when prompted for user input
+ is_cli_non_interactive = os.environ.get("DEBIAN_FRONTEND", "") == "noninteractive"
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=is_cli_non_interactive)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
@@ -954,6 +1009,29 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
+# TODO: maybe merge this with Model in the future
+class VisionModelHelper:
+ model: Model
+ tok_embd_tensor: Tensor | None = None
+
+ def __init__(self, model: Model):
+ self.model = model
+ # TODO: how to do this without reading the whole safetensor file?
+ for tname, tensor in model.get_tensors():
+ if tname.endswith("embed_tokens.weight"):
+ self.tok_embd_tensor = tensor
+
+ def get_embd_for_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, gguf.MODEL_TENSOR]], tensor_name_postfix = '.weight') -> Iterable[tuple[str, Tensor]]:
+ if self.tok_embd_tensor is None:
+ raise ValueError("Token embedding tensor not found")
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(self.model.dir_model, trust_remote_code=True)
+ for token, tensor_name in map_token_to_tensor_name:
+ tok_id = tokenizer.get_vocab()[token]
+ row = self.tok_embd_tensor[tok_id]
+ yield gguf.TENSOR_NAMES[tensor_name] + tensor_name_postfix, row
+
+
@Model.register("GPTNeoXForCausalLM")
class GPTNeoXModel(Model):
model_arch = gguf.MODEL_ARCH.GPTNEOX
@@ -1566,10 +1644,39 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed norms: {norms}")
-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "MobileLlamaForCausalLM", "Idefics3ForConditionalGeneration")
class LlamaModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ model_type = self.hparams.get("model_type")
+ self.vision_arch = None
+
+ # only tested with https://huggingface.co/llava-hf/llava-1.5-7b-hf
+ if "vision_config" in self.hparams and model_type == "llava":
+ self.vparams = self.hparams["vision_config"]
+ self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
+ self.vision_arch = gguf.MODEL_ARCH.VISION_LLAVA
+
+ # only tested with https://huggingface.co/mtgv/MobileVLM_V2-1.7B
+ if "mm_vision_tower" in self.hparams and model_type == "mobilevlm":
+ from transformers import AutoImageProcessor
+ vision_model_id = self.hparams["mm_vision_tower"]
+ self.vparams = AutoConfig.from_pretrained(vision_model_id).to_dict()["vision_config"]
+ self.preprocessor_config = AutoImageProcessor.from_pretrained(vision_model_id).to_dict()
+ self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM
+
+ # only tested with https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct
+ if "vision_config" in self.hparams and model_type == "idefics3":
+ self.vparams = self.hparams["vision_config"]
+ self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
+ self.vision_arch = gguf.MODEL_ARCH.VISION_IDEFICS3
+
+ if self.vparams is not None and self.vision_arch is not None:
+ self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
+
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
@@ -1619,6 +1726,24 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+ # For vision model
+ if self.vparams is not None:
+ max_pos_embd = -1
+ self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
+ # TODO: should not hardcode these, but they are currently missing from config.json
+ if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
+ self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
+ max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
+ if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
+ self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
+ max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
+ if self.vision_arch == gguf.MODEL_ARCH.VISION_IDEFICS3:
+ self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
+ self.gguf_writer.add_vision_vit_scale_factor(self.hparams["scale_factor"])
+ max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
+ self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05)
+ self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
+
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
@@ -1632,11 +1757,23 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
-
- if name.endswith(("q_proj.weight", "q_proj.bias")):
- data_torch = LlamaModel.permute(data_torch, n_head, n_head)
- if name.endswith(("k_proj.weight", "k_proj.bias")):
- data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+ is_vision_tensor = "vision_tower" in name or "vision_model" in name
+
+ if is_vision_tensor:
+ name = name.replace("model.vision_tower.", "")
+ if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3:
+ return [] # skip post_layernorm
+
+ if not is_vision_tensor:
+ if name.startswith("model.text_model"):
+ name = name.replace("text_model.", "") # for SmolVLM
+ elif name.startswith("language_model"):
+ # language model tensors, remove the prefix
+ name = name.replace("language_model.", "")
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
@@ -1713,6 +1850,22 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed experts: {experts}")
+@Model.register("LlavaForConditionalGeneration")
+class LlavaModel(LlamaModel):
+ model_arch = gguf.MODEL_ARCH.LLAMA
+
+ def __init__(self, *args, **kwargs):
+ # quick fix for llava model
+ # see: https://huggingface.co/llava-hf/llava-1.5-7b-hf/discussions/34
+ hparams = Model.load_hparams(kwargs["dir_model"])
+ if "vision_config" in hparams and hparams.get("model_type") == "llava":
+ text_config = hparams["text_config"]
+ text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
+ kwargs["hparams"] = {**text_config, **hparams}
+
+ super().__init__(*args, **kwargs)
+
+
@Model.register("DeciLMForCausalLM")
class DeciModel(Model):
model_arch = gguf.MODEL_ARCH.DECI
@@ -2240,6 +2393,173 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
yield name, data
+@Model.register("MiniCPMV")
+class MiniCPMVModel(Qwen2Model):
+ # MiniCPM-V 2.5 is Qwen2 and 2.6 is Qwen-2.5
+ model_arch = gguf.MODEL_ARCH.QWEN2
+ proj_type: gguf.constants.CLIPProjectorType | None
+ resampler_n_embd = 0
+ vhelper: VisionModelHelper | None
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ model_type = self.hparams.get("model_type", None)
+
+ # only tested with https://huggingface.co/openbmb/MiniCPM-V-2_6
+ if "vision_config" in self.hparams and model_type == "minicpmv":
+ self.vparams = self.hparams["vision_config"]
+ self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
+ self.vision_arch = gguf.MODEL_ARCH.VISION_MINICPMV
+ version = str(self.hparams.get("version", "unknown"))
+ if version == "2.5":
+ self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_5
+ elif version == "2.6":
+ self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_6
+ else:
+ raise ValueError(f"Unsupported MiniCPM-V version: {version}")
+ self.vhelper = VisionModelHelper(self)
+ # TODO: how to do this without reading the whole safetensor file?
+ for tname, tensor in self.get_tensors():
+ if tname == "resampler.ln_post.bias":
+ self.resampler_n_embd = tensor.shape[0]
+ if self.resampler_n_embd < 2:
+ raise ValueError("Failed to detect resampler embedding size")
+ else:
+ raise ValueError("Expected vision_config, but not found")
+
+ assert self.vparams is not None
+ assert self.vision_arch is not None
+ assert self.preprocessor_config is not None
+ self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5]
+ self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5]
+ self.hparams["vision_feature_layer"] = 0
+ self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ assert self.vparams is not None and self.proj_type is not None
+ self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
+ self.gguf_writer.add_vision_vit_projector_type(self.proj_type)
+ self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06)
+ max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
+ self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
+
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ # because the model operates exclusively on 70x70 patches for now, we should precompute the positional embeddings to gain performance
+ # in the future, we can do it in cpp if we figure out how to do it efficiently
+ yield (
+ self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True),
+ torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70)))
+ )
+ assert self.vhelper is not None
+ added_tokens = [
+ ("", gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE),
+ ("", gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE),
+ ("", gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE),
+ ("", gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE),
+ ]
+ for tensor_name, tensor in self.vhelper.get_embd_for_tokens(added_tokens):
+ yield tensor_name, tensor
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ # for language part
+ if name.startswith("llm."):
+ return [(self.map_tensor_name(name.replace("llm.", "")), data_torch)]
+
+ # split the resampler.attn.in_proj_(weight|bias) tensors into q, k, v
+ if name.endswith("in_proj_weight") or name.endswith("in_proj_bias"):
+ assert data_torch.shape[0] == 3 * self.resampler_n_embd
+ split_tensor = data_torch.chunk(3, dim=0)
+ name_q = name.replace("in_proj_", "in_proj_q.") # in_proj_q.(weight|bias)
+ name_k = name.replace("in_proj_", "in_proj_k.") # in_proj_k.(weight|bias)
+ name_v = name.replace("in_proj_", "in_proj_v.") # in_proj_v.(weight|bias)
+ return [
+ # TODO: permute these
+ (self.map_tensor_name(name_q), split_tensor[0]),
+ (self.map_tensor_name(name_k), split_tensor[1]),
+ (self.map_tensor_name(name_v), split_tensor[2]),
+ ]
+
+ # append .weight to these tensors
+ if name == "resampler.proj" or name == "resampler.query":
+ name += ".weight"
+
+ if name.startswith("resampler.proj"):
+ data_torch = data_torch.transpose(-1, -2).contiguous()
+
+ if "post_layernorm" in name:
+ return [] # skip post_layernorm
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+ del name, bid # unused
+ if "v.resmpl.query" in new_name or "v.resmpl.pos_embd_k" in new_name:
+ return gguf.GGMLQuantizationType.F32
+ if "v.resmpl." in new_name:
+ return gguf.GGMLQuantizationType.F32 if n_dims == 1 else gguf.GGMLQuantizationType.F16
+ return False
+
+ # utils to work with MiniCPM-V resampler
+
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+ def _get_2d_sincos_pos_embed(self, embed_dim: int, grid_size: tuple[int, int] | int, cls_token=False) -> np.ndarray:
+ """
+ grid_size: int or (H, W) tuple of the grid height and width
+ return:
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+ """
+ if isinstance(grid_size, int):
+ grid_h_size, grid_w_size = grid_size, grid_size
+ else:
+ grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+ grid_h = np.arange(grid_h_size, dtype=np.float32)
+ grid_w = np.arange(grid_w_size, dtype=np.float32)
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
+ grid = np.stack(grid, axis=0)
+
+ grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+ pos_embed = self._get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+ if cls_token:
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+ def _get_2d_sincos_pos_embed_from_grid(self, embed_dim: int, grid: np.ndarray) -> np.ndarray:
+ assert embed_dim % 2 == 0
+
+ # use half of dimensions to encode grid_h
+ emb_h = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+ def _get_1d_sincos_pos_embed_from_grid(self, embed_dim: int, pos: np.ndarray) -> np.ndarray:
+ """
+ embed_dim: output dimension for each position
+ pos: a list of positions to be encoded: size (M,)
+ out: (M, D)
+ """
+ assert embed_dim % 2 == 0
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
+ omega /= embed_dim / 2.
+ omega = 1. / 10000 ** omega # (D/2,)
+
+ pos = pos.reshape(-1) # (M,)
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
+
+ emb_sin = np.sin(out) # (M, D/2)
+ emb_cos = np.cos(out) # (M, D/2)
+
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
+ return emb
+
+
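For reference, the tables built by the helpers above are the standard fixed sinusoidal (MAE-style) embedding: for an embedding size $D$ and position $p$,

$$\omega_i = 10000^{-2i/D}, \quad i = 0,\dots,\tfrac{D}{2}-1, \qquad \mathrm{PE}(p) = \big[\sin(p\,\omega_0),\dots,\sin(p\,\omega_{D/2-1}),\ \cos(p\,\omega_0),\dots,\cos(p\,\omega_{D/2-1})\big].$$

For the 2D grid, the two axes are encoded independently with $D/2$ channels each and concatenated, which is exactly what `_get_2d_sincos_pos_embed_from_grid` does.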
@Model.register("WavTokenizerDec")
class WavTokenizerDecModel(Model):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -5034,7 +5354,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
- description="Convert a huggingface model to a GGML compatible file")
+ description="Convert a huggingface model to a GGML compatible file\n\nNote: When converting vision models, this script may use internet connection to download configuration files via Hugging Face.")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 66cfab2c3b796..41d968ed64531 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -53,6 +53,7 @@ else()
add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
+ add_subdirectory(vision)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 71e053b202cd2..d5cbbf2ed474c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3156,6 +3156,7 @@ struct server_context {
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
+ nullptr,
};
const int ret = llama_decode(ctx, batch_view);
diff --git a/examples/vision/CMakeLists.txt b/examples/vision/CMakeLists.txt
new file mode 100644
index 0000000000000..ab009157a957f
--- /dev/null
+++ b/examples/vision/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-vision)
+add_executable(${TARGET} vision.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/vision/README.md b/examples/vision/README.md
new file mode 100644
index 0000000000000..c2468444caa89
--- /dev/null
+++ b/examples/vision/README.md
@@ -0,0 +1,3 @@
+# llama.cpp/example/vision
+
+Minimal demo for the llama.cpp vision API
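The vision API consumes a raw RGB bitmap (3 bytes per pixel, row-major), which the demo fills from an image decoded with stb_image in `vision.cpp`. A minimal sketch of that contract, assuming you already have decoded pixel data (the helper name is illustrative only):

```cpp
#include "llama.h"
#include <cstring>

// Wrap a decoded RGB buffer into a bitmap understood by the vision API.
// `pixels` must hold exactly 3*nx*ny bytes.
static llama_vision_bitmap * make_bitmap(const unsigned char * pixels, uint32_t nx, uint32_t ny) {
    llama_vision_bitmap * bmp = llama_vision_bitmap_init(nx, ny);
    memcpy(bmp->data, pixels, 3*nx*ny);
    return bmp; // free later with llama_vision_bitmap_free()
}
```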
diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp
new file mode 100644
index 0000000000000..359a023ae86e3
--- /dev/null
+++ b/examples/vision/vision.cpp
@@ -0,0 +1,224 @@
+#include "llama.h"
+#include "common.h"
+#include "arg.h"
+#include "log.h"
+#include "sampling.h"
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <fstream>
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+static void print_usage(int, char ** argv) {
+ printf("\nexample usage:\n");
+ printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [--image img_path] [-p prompt]\n", argv[0]);
+ printf("\n");
+}
+
+static llama_vision_bitmap * load_image_from_file(const char * fname) {
+ std::ifstream file(fname, std::ios::binary);
+ if (!file) {
+ throw std::runtime_error("Unable to open file");
+ }
+ std::vector<char> image_bytes = std::vector<char>(
+ std::istreambuf_iterator<char>(file),
+ std::istreambuf_iterator<char>());
+ // decode image to byte array
+ int nx, ny, nc;
+ auto * bytes = (unsigned char *) image_bytes.data();
+ auto * img = stbi_load_from_memory(bytes, image_bytes.size(), &nx, &ny, &nc, 3);
+ if (!img) {
+ throw std::runtime_error("failed to decode image bytes");
+ }
+ // printf("nx=%d ny=%d nc=%d\n", nx, ny, nc);
+ // GGML_ASSERT(nc == 3);
+ // for (int y = 0; y < ny; y++) {
+ // for (int x = 0; x < nx; x++) {
+ // unsigned char * pix = img + x*nc + y*nc*nx;
+ // printf("%02x%02x%02x ", pix[0], pix[1], pix[2]);
+ // }
+ // printf("\n");
+ // }
+ // printf("\n");
+ llama_vision_bitmap * result = llama_vision_bitmap_init(nx, ny);
+ memcpy(result->data, img, nx*ny*3);
+ stbi_image_free(img);
+ return result;
+}
+
+// split string by a `std::string delim` instead of `char delim`
+static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
+ std::vector<std::string> tokens;
+ size_t pos = 0;
+ std::string token;
+ while ((pos = s.find(delimiter)) != std::string::npos) {
+ token = s.substr(0, pos);
+ tokens.push_back(token);
+ s.erase(0, pos + delimiter.length());
+ }
+ tokens.push_back(s);
+ return tokens;
+}
+
+struct tokenized_part {
+ llama_tokens tokens;
+ bool is_image;
+};
+
+// TODO: this function is hacky, need to be improved
+// static const llama_token TOKEN_IMG_PLACEMENT = -1000;
+static const std::string IMG_PLACEMENT = "<img_placement>";
+static std::vector<tokenized_part> tokenize_with_img_placement(
+ const llama_vocab * vocab,
+ const std::string & text,
+ bool add_special,
+ bool parse_special) {
+ std::vector<std::string> parts = string_split_str(text, IMG_PLACEMENT);
+ std::vector<tokenized_part> output;
+ for (const auto & part : parts) {
+ //printf("tokenizing part: %s\n", part.c_str());
+ bool add_bos = &parts.front() == &part;
+ auto tokens = common_tokenize(vocab, part, add_special && add_bos, parse_special);
+ if (tokens.empty()) {
+ continue;
+ }
+ output.push_back({std::move(tokens), false});
+ if (&parts.back() != &part) {
+ // add image token to middle of 2 parts
+ output.push_back({{}, true});
+ }
+ }
+ return output;
+}
+
+int main(int argc, char ** argv) {
+ common_params params;
+
+ // default prompt for llava 1.5
+ //params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:<img_placement>\nwhat did you see?\nASSISTANT:";
+ // default prompt for minicpmv 2.6
+ params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nwhat do you see?<|im_end|>\n<|im_start|>assistant\n";
+ params.n_predict = 64;
+ params.n_batch = 2048;
+ params.n_ubatch = 1024;
+ params.n_gpu_layers = 99;
+
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_VISION, print_usage)) {
+ return 1;
+ }
+
+ common_init();
+ common_init_result llama_init = common_init_from_params(params);
+ llama_context * ctx = llama_init.context.get();
+ const llama_model * model = llama_init.model.get();
+ if (!model) {
+ LOG_ERR("failed to load model\n");
+ return 1;
+ }
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ llama_vision_context_params vparams = llama_vision_context_default_params();
+ vparams.n_threads = llama_n_threads(ctx);
+ llama_vision_context * vctx = llama_vision_init_from_model(model, vparams);
+ if (!vctx) {
+ LOG_ERR("model does not have vision encoder\n");
+ return 1;
+ }
+
+ struct common_sampler * smpl = common_sampler_init(model, params.sampling);
+
+ llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
+ int n_past = 0;
+ int n_prompt = 0;
+
+ // process image
+ llama_vision_tokens * img_tokens = nullptr;
+ {
+ if (params.image.empty()) {
+ LOG_ERR("no image path provided\n");
+ return 1;
+ }
+ const char * img_path = params.image[0].c_str();
+ llama_vision_bitmap * img = load_image_from_file(img_path);
+ LOG_INF("loaded image %s, size = %d x %d\n", img_path, img->nx, img->ny);
+ img_tokens = llama_vision_tokenize(vctx, img);
+ if (!img_tokens) {
+ LOG_ERR("failed to create image tokens\n");
+ return 1;
+ }
+ if (llama_vision_encode(vctx, img_tokens)) {
+ LOG_ERR("failed to encode image\n");
+ return 1;
+ }
+ LOG_INF("encoded image\n");
+ }
+
+ // process prompt
+ {
+ std::vector<tokenized_part> parts = tokenize_with_img_placement(vocab, params.prompt, true, true);
+ for (const tokenized_part & part : parts) {
+ if (!part.is_image) {
+ for (const llama_token & token : part.tokens) {
+ //LOG_INF("%d -> %s\n", token, common_token_to_piece(ctx, token).c_str());
+ common_batch_add(batch, token, n_past++, {0}, &part == &parts.back());
+ }
+ LOG_INF("eval text batch (%d tokens)\n", batch.n_tokens);
+ if (llama_decode(ctx, batch)) {
+ LOG_ERR("failed to decode text prompt\n");
+ return 1;
+ }
+ } else {
+ auto * img_embd = llama_vision_get_output_tensor(vctx);
+ // std::vector<float> output_debug(ggml_nelements(img_embd));
+ // ggml_backend_tensor_get(img_embd, output_debug.data(), 0, ggml_nbytes(img_embd));
+ // for (int row = 0; row < 10; row++) {
+ // int off = row * img_embd->ne[0];
+ // printf("... %f %f %f\n", output_debug[off], output_debug[off+1], output_debug[off+2]);
+ // }
+ // exit(1);
+ llama_batch batch_img = llama_batch_get_one_from_tensor(img_embd, n_past, 0);
+ n_past += batch_img.n_tokens;
+ LOG_INF("eval image batch (%d embeddings)\n", batch_img.n_tokens);
+ if (llama_decode(ctx, batch_img)) {
+ LOG_ERR("failed to decode image prompt\n");
+ return 1;
+ }
+ llama_batch_free(batch_img);
+ }
+ }
+ n_prompt = n_past;
+ LOG_INF("prompt processed, %d tokens\n", n_prompt);
+ }
+
+ // generate response
+ while (true){
+ int n_generated = n_past - n_prompt;
+ if (n_generated > params.n_predict) {
+ printf("\n");
+ break;
+ }
+
+ llama_token token_id = common_sampler_sample(smpl, ctx, -1);
+ common_sampler_accept(smpl, token_id, true);
+ printf("%s", common_token_to_piece(ctx, token_id).c_str());
+ fflush(stdout);
+
+ if (llama_vocab_is_eog(vocab, token_id)) {
+ printf("\n");
+ break;
+ }
+
+ // eval the token
+ common_batch_clear(batch);
+ common_batch_add(batch, token_id, n_past++, {0}, true);
+ if (llama_decode(ctx, batch)) {
+ LOG_ERR("failed to decode token\n");
+ break;
+ }
+ }
+
+ return 0;
+}
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 19624eae04ece..3f0ccf13d1af9 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -202,6 +202,9 @@ class Tokenizer:
FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
+ # Vision models
+ IMAGE_START_ID = "tokenizer.ggml.image_start_token_id"
+ IMAGE_END_ID = "tokenizer.ggml.image_end_token_id"
# deprecated:
PREFIX_ID = "tokenizer.ggml.prefix_token_id"
SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
@@ -211,6 +214,32 @@ class Adapter:
TYPE = "adapter.type"
LORA_ALPHA = "adapter.lora.alpha"
+ class Vision:
+ # only support vision.type = "vit" for now
+ TYPE = "vision.type"
+ IMAGE_SIZE = "vision.image_size"
+ PATCH_SIZE = "vision.patch_size"
+ IMAGE_MEAN = "vision.image_mean"
+ IMAGE_STD = "vision.image_std"
+
+ class Vit:
+ ARCHITECTURE = "vision.vit.architecture"
+ CONTEXT_LENGTH = "vision.vit.context_length"
+ EMBEDDING_LENGTH = "vision.vit.embedding_length"
+ BLOCK_COUNT = "vision.vit.block_count"
+ FEED_FORWARD_LENGTH = "vision.vit.feed_forward_length"
+ PROJECTION_TYPE = "vision.vit.projection_type"
+ PROJECTION_DIM = "vision.vit.projection_dim"
+ USE_GELU = "vision.vit.use_gelu"
+ MAX_POS_EMBEDDING = "vision.vit.max_position_embeddings"
+ MAX_SLICES = "vision.vit.max_slices"
+ PROJECTOR_TYPE = "vision.vit.projector_type"
+ SELECT_LAYER = "vision.vit.select_layer"
+ PATCH_MERGE_TYPE = "vision.vit.patch_merge_type"
+ HEAD_COUNT = "vision.vit.attention.head_count"
+ LAYERNORM_EPS = "vision.vit.attention.layer_norm_epsilon"
+ SCALE_FACTOR = "vision.vit.scale_factor" # only used by idefics3 for now
+
#
# recommended mapping of model tensor names for storage in gguf
#
@@ -280,6 +309,11 @@ class MODEL_ARCH(IntEnum):
GRANITE_MOE = auto()
CHAMELEON = auto()
WAVTOKENIZER_DEC = auto()
+ # vision models
+ VISION_LLAVA = auto()
+ VISION_MOBILEVLM = auto()
+ VISION_MINICPMV = auto()
+ VISION_IDEFICS3 = auto()
class MODEL_TENSOR(IntEnum):
@@ -391,6 +425,7 @@ class MODEL_TENSOR(IntEnum):
ENC_OUTPUT_NORM = auto()
CLS = auto() # classifier
CLS_OUT = auto() # classifier output projection
+ # wavtokenizer
CONV1D = auto()
CONVNEXT_DW = auto()
CONVNEXT_NORM = auto()
@@ -407,6 +442,39 @@ class MODEL_TENSOR(IntEnum):
POSNET_ATTN_K = auto()
POSNET_ATTN_V = auto()
POSNET_ATTN_OUT = auto()
+ # vision
+ V_MMPROJ = auto()
+ V_MMPROJ_FC = auto()
+ V_MMPROJ_MLP = auto()
+ V_MMPROJ_PEG = auto()
+ V_ENC_EMBD_CLS = auto()
+ V_ENC_EMBD_PATCH = auto()
+ V_ENC_EMBD_POS = auto()
+ V_ENC_ATTN_Q = auto()
+ V_ENC_ATTN_K = auto()
+ V_ENC_ATTN_V = auto()
+ V_ENC_INPUT_NORM = auto()
+ V_ENC_OUTPUT = auto()
+ V_ENC_OUTPUT_NORM = auto()
+ V_ENC_FFN_UP = auto()
+ V_ENC_FFN_DOWN = auto()
+ V_PRE_NORM = auto()
+ V_POST_NORM = auto()
+ V_RESMPL_POS_EMBD_K = auto() # minicpmv
+ V_RESMPL_ATTN_Q = auto() # minicpmv
+ V_RESMPL_ATTN_K = auto() # minicpmv
+ V_RESMPL_ATTN_V = auto() # minicpmv
+ V_RESMPL_ATTN_OUT = auto() # minicpmv
+ V_RESMPL_KV = auto() # minicpmv
+ V_RESMPL_KV_NORM = auto() # minicpmv
+ V_RESMPL_POST_NORM = auto() # minicpmv
+ V_RESMPL_Q_NORM = auto() # minicpmv
+ V_RESMPL_PROJ = auto() # minicpmv
+ V_RESMPL_QUERY = auto() # minicpmv
+ V_TOK_EMBD_IMAGE = auto() # embedding for <image> token
+ V_TOK_EMBD_END_IMAGE = auto() # embedding for </image> token
+ V_TOK_EMBD_SLICE = auto() # embedding for <slice> token
+ V_TOK_EMBD_END_SLICE = auto() # embedding for </slice> token
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -468,6 +536,11 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.GRANITE_MOE: "granitemoe",
MODEL_ARCH.CHAMELEON: "chameleon",
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
+ # vision
+ MODEL_ARCH.VISION_LLAVA: "llava",
+ MODEL_ARCH.VISION_MOBILEVLM: "mobilevlm",
+ MODEL_ARCH.VISION_MINICPMV: "minicpmv",
+ MODEL_ARCH.VISION_IDEFICS3: "idefics3",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -595,6 +668,39 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
+ # vision
+ MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}",
+ MODEL_TENSOR.V_MMPROJ_FC: "v.mmproj.fc",
+ MODEL_TENSOR.V_MMPROJ_MLP: "v.mmproj.mlp.{bid}",
+ MODEL_TENSOR.V_MMPROJ_PEG: "v.mmproj.peg.{bid}",
+ MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls",
+ MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.enc.embd.patch",
+ MODEL_TENSOR.V_ENC_EMBD_POS: "v.enc.embd.pos",
+ MODEL_TENSOR.V_ENC_ATTN_Q: "v.enc.blk.{bid}.attn_q",
+ MODEL_TENSOR.V_ENC_ATTN_K: "v.enc.blk.{bid}.attn_k",
+ MODEL_TENSOR.V_ENC_ATTN_V: "v.enc.blk.{bid}.attn_v",
+ MODEL_TENSOR.V_ENC_INPUT_NORM: "v.enc.blk.{bid}.input_norm",
+ MODEL_TENSOR.V_ENC_OUTPUT: "v.enc.blk.{bid}.output",
+ MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.enc.blk.{bid}.output_norm",
+ MODEL_TENSOR.V_ENC_FFN_UP: "v.enc.blk.{bid}.ffn_up",
+ MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down",
+ MODEL_TENSOR.V_PRE_NORM: "v.pre_norm",
+ MODEL_TENSOR.V_POST_NORM: "v.post_norm",
+ MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "v.resmpl.pos_embd_k",
+ MODEL_TENSOR.V_RESMPL_ATTN_Q: "v.resmpl.attn_q",
+ MODEL_TENSOR.V_RESMPL_ATTN_K: "v.resmpl.attn_k",
+ MODEL_TENSOR.V_RESMPL_ATTN_V: "v.resmpl.attn_v",
+ MODEL_TENSOR.V_RESMPL_ATTN_OUT: "v.resmpl.attn_out",
+ MODEL_TENSOR.V_RESMPL_KV: "v.resmpl.kv",
+ MODEL_TENSOR.V_RESMPL_KV_NORM: "v.resmpl.kv_norm",
+ MODEL_TENSOR.V_RESMPL_POST_NORM: "v.resmpl.post_norm",
+ MODEL_TENSOR.V_RESMPL_Q_NORM: "v.resmpl.q_norm",
+ MODEL_TENSOR.V_RESMPL_PROJ: "v.resmpl.proj",
+ MODEL_TENSOR.V_RESMPL_QUERY: "v.resmpl.query",
+ MODEL_TENSOR.V_TOK_EMBD_IMAGE: "v.tok_embd.image",
+ MODEL_TENSOR.V_TOK_EMBD_END_IMAGE: "v.tok_embd.end_image",
+ MODEL_TENSOR.V_TOK_EMBD_SLICE: "v.tok_embd.slice",
+ MODEL_TENSOR.V_TOK_EMBD_END_SLICE: "v.tok_embd.end_slice",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1556,6 +1662,80 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.POSNET_ATTN_V,
MODEL_TENSOR.POSNET_ATTN_OUT,
],
+ MODEL_ARCH.VISION_LLAVA: [
+ MODEL_TENSOR.V_MMPROJ,
+ MODEL_TENSOR.V_ENC_EMBD_CLS,
+ MODEL_TENSOR.V_ENC_EMBD_PATCH,
+ MODEL_TENSOR.V_ENC_EMBD_POS,
+ MODEL_TENSOR.V_ENC_ATTN_Q,
+ MODEL_TENSOR.V_ENC_ATTN_K,
+ MODEL_TENSOR.V_ENC_ATTN_V,
+ MODEL_TENSOR.V_ENC_INPUT_NORM,
+ MODEL_TENSOR.V_ENC_OUTPUT,
+ MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+ MODEL_TENSOR.V_ENC_FFN_UP,
+ MODEL_TENSOR.V_ENC_FFN_DOWN,
+ MODEL_TENSOR.V_PRE_NORM,
+ MODEL_TENSOR.V_POST_NORM,
+ ],
+ MODEL_ARCH.VISION_MOBILEVLM: [
+ MODEL_TENSOR.V_MMPROJ_MLP,
+ MODEL_TENSOR.V_MMPROJ_PEG,
+ MODEL_TENSOR.V_ENC_EMBD_CLS,
+ MODEL_TENSOR.V_ENC_EMBD_PATCH,
+ MODEL_TENSOR.V_ENC_EMBD_POS,
+ MODEL_TENSOR.V_ENC_ATTN_Q,
+ MODEL_TENSOR.V_ENC_ATTN_K,
+ MODEL_TENSOR.V_ENC_ATTN_V,
+ MODEL_TENSOR.V_ENC_INPUT_NORM,
+ MODEL_TENSOR.V_ENC_OUTPUT,
+ MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+ MODEL_TENSOR.V_ENC_FFN_UP,
+ MODEL_TENSOR.V_ENC_FFN_DOWN,
+ MODEL_TENSOR.V_PRE_NORM,
+ MODEL_TENSOR.V_POST_NORM,
+ ],
+ MODEL_ARCH.VISION_MINICPMV: [
+ MODEL_TENSOR.V_ENC_EMBD_PATCH,
+ MODEL_TENSOR.V_ENC_EMBD_POS,
+ MODEL_TENSOR.V_ENC_ATTN_Q,
+ MODEL_TENSOR.V_ENC_ATTN_K,
+ MODEL_TENSOR.V_ENC_ATTN_V,
+ MODEL_TENSOR.V_ENC_INPUT_NORM,
+ MODEL_TENSOR.V_ENC_OUTPUT,
+ MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+ MODEL_TENSOR.V_ENC_FFN_UP,
+ MODEL_TENSOR.V_ENC_FFN_DOWN,
+ MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
+ MODEL_TENSOR.V_RESMPL_ATTN_Q,
+ MODEL_TENSOR.V_RESMPL_ATTN_K,
+ MODEL_TENSOR.V_RESMPL_ATTN_V,
+ MODEL_TENSOR.V_RESMPL_ATTN_OUT,
+ MODEL_TENSOR.V_RESMPL_KV,
+ MODEL_TENSOR.V_RESMPL_KV_NORM,
+ MODEL_TENSOR.V_RESMPL_POST_NORM,
+ MODEL_TENSOR.V_RESMPL_Q_NORM,
+ MODEL_TENSOR.V_RESMPL_PROJ,
+ MODEL_TENSOR.V_RESMPL_QUERY,
+ MODEL_TENSOR.V_TOK_EMBD_IMAGE,
+ MODEL_TENSOR.V_TOK_EMBD_END_IMAGE,
+ MODEL_TENSOR.V_TOK_EMBD_SLICE,
+ MODEL_TENSOR.V_TOK_EMBD_END_SLICE,
+ ],
+ MODEL_ARCH.VISION_IDEFICS3: [
+ MODEL_TENSOR.V_MMPROJ_FC,
+ MODEL_TENSOR.V_ENC_EMBD_PATCH,
+ MODEL_TENSOR.V_ENC_EMBD_POS,
+ MODEL_TENSOR.V_ENC_ATTN_Q,
+ MODEL_TENSOR.V_ENC_ATTN_K,
+ MODEL_TENSOR.V_ENC_ATTN_V,
+ MODEL_TENSOR.V_ENC_INPUT_NORM,
+ MODEL_TENSOR.V_ENC_OUTPUT,
+ MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+ MODEL_TENSOR.V_ENC_FFN_UP,
+ MODEL_TENSOR.V_ENC_FFN_DOWN,
+ MODEL_TENSOR.V_POST_NORM,
+ ],
# TODO
}
@@ -1637,6 +1817,18 @@ class PoolingType(IntEnum):
CLS = 2
+class CLIPProjectorType(Enum):
+ MLP = 'mlp'
+ LDPV2 = 'ldpv2'
+ MINICPMV_2_5 = 'minicpmv-2.5' # resampler
+ MINICPMV_2_6 = 'minicpmv-2.6' # resampler
+
+
+class CLIPPatchMergeType(Enum):
+ FLAT = 'flat'
+ SPATIAL_UNPAD = 'spatial_unpad'
+
+
class GGMLQuantizationType(IntEnum):
F32 = 0
F16 = 1
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 080d2b9dce5cb..a31ab736bc20a 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -27,6 +27,8 @@
PoolingType,
TokenType,
ExpertGatingFuncType,
+ CLIPPatchMergeType,
+ CLIPProjectorType,
)
from .quants import quant_shape_from_byte_shape
@@ -875,6 +877,60 @@ def add_remove_extra_whitespaces(self, value: bool) -> None:
def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
+ def add_vision_type(self, value: str) -> None:
+ self.add_string(Keys.Vision.TYPE, value)
+
+ def add_vision_image_size(self, value: int) -> None:
+ self.add_uint32(Keys.Vision.IMAGE_SIZE, value)
+
+ def add_vision_patch_size(self, value: int) -> None:
+ self.add_uint32(Keys.Vision.PATCH_SIZE, value)
+
+ def add_vision_vit_architecture(self, value: str) -> None:
+ self.add_string(Keys.Vision.Vit.ARCHITECTURE, value)
+
+ def add_vision_vit_context_length(self, value: int) -> None:
+ self.add_uint32(Keys.Vision.Vit.CONTEXT_LENGTH, value)
+
+ def add_vision_vit_embedding_length(self, value: int) -> None:
+ self.add_uint32(Keys.Vision.Vit.EMBEDDING_LENGTH, value)
+
+ def add_vision_vit_block_count(self, value: int) -> None:
+ self.add_uint32(Keys.Vision.Vit.BLOCK_COUNT, value)
+
+ def add_vision_vit_feed_forward_length(self, value: int) -> None:
+ self.add_uint32(Keys.Vision.Vit.FEED_FORWARD_LENGTH, value)
+
+ def add_vision_vit_head_count(self, value: int) -> None:
+ self.add_uint32(Keys.Vision.Vit.HEAD_COUNT, value)
+
+ def add_vision_vit_max_position_embeddings(self, value: int) -> None:
+ self.add_uint32(Keys.Vision.Vit.MAX_POS_EMBEDDING, value)
+
+ def add_vision_vit_projector_type(self, value: CLIPProjectorType) -> None:
+ self.add_string(Keys.Vision.Vit.PROJECTOR_TYPE, value.value)
+
+ def add_vision_vit_max_slices(self, value: int) -> None:
+ self.add_uint32(Keys.Vision.Vit.MAX_SLICES, value)
+
+ def add_vision_vit_select_layer(self, value: int) -> None:
+ self.add_int32(Keys.Vision.Vit.SELECT_LAYER, value)
+
+ def add_vision_vit_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
+ self.add_string(Keys.Vision.Vit.PATCH_MERGE_TYPE, value.value)
+
+ def add_vision_vit_layer_norm_epsilon(self, value: float) -> None:
+ self.add_float32(Keys.Vision.Vit.LAYERNORM_EPS, value)
+
+ def add_vision_vit_image_mean(self, value: Sequence[float]) -> None:
+ self.add_array(Keys.Vision.IMAGE_MEAN, value)
+
+ def add_vision_vit_image_std(self, value: Sequence[float]) -> None:
+ self.add_array(Keys.Vision.IMAGE_STD, value)
+
+ def add_vision_vit_scale_factor(self, value: int) -> None:
+ self.add_int32(Keys.Vision.Vit.SCALE_FACTOR, value)
+
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
if not isinstance(value, str):
template_default = None
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 617791e240b60..3f247d787ba11 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -787,6 +787,157 @@ class TensorNameMap:
MODEL_TENSOR.POSNET_ATTN_OUT: (
"backbone.posnet.{bid}.proj_out", # wavtokenizer
),
+
+ #############################################################################
+
+ MODEL_TENSOR.V_MMPROJ: (
+ "multi_modal_projector.linear_{bid}",
+ ),
+
+ MODEL_TENSOR.V_MMPROJ_FC: (
+ "model.connector.modality_projection.proj", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_MMPROJ_MLP: (
+ "model.mm_projector.mlp.mlp.{bid}",
+ ),
+
+ MODEL_TENSOR.V_MMPROJ_PEG: (
+ "model.mm_projector.peg.peg.{bid}",
+ ),
+
+ MODEL_TENSOR.V_ENC_EMBD_CLS: (
+ "vision_tower.vision_model.embeddings.class_embedding",
+ ),
+
+ MODEL_TENSOR.V_ENC_EMBD_PATCH: (
+ "vision_tower.vision_model.embeddings.patch_embedding",
+ "vpm.embeddings.patch_embedding",
+ "model.vision_model.embeddings.patch_embedding", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_ENC_EMBD_POS: (
+ "vision_tower.vision_model.embeddings.position_embedding",
+ "vpm.embeddings.position_embedding",
+ "model.vision_model.embeddings.position_embedding", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_ENC_ATTN_Q: (
+ "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
+ "vpm.encoder.layers.{bid}.self_attn.q_proj",
+ "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_ENC_ATTN_K: (
+ "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+ "vpm.encoder.layers.{bid}.self_attn.k_proj",
+ "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_ENC_ATTN_V: (
+ "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
+ "vpm.encoder.layers.{bid}.self_attn.v_proj",
+ "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_ENC_INPUT_NORM: (
+ "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
+ "vpm.encoder.layers.{bid}.layer_norm1",
+ "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_ENC_OUTPUT: (
+ "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
+ "vpm.encoder.layers.{bid}.self_attn.out_proj",
+ "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
+ "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
+ "vpm.encoder.layers.{bid}.layer_norm2",
+ "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_ENC_FFN_UP: (
+ "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
+ "vpm.encoder.layers.{bid}.mlp.fc1",
+ "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_ENC_FFN_DOWN: (
+ "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
+ "vpm.encoder.layers.{bid}.mlp.fc2",
+ "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_PRE_NORM: (
+ "vision_tower.vision_model.pre_layrnorm",
+ ),
+
+ MODEL_TENSOR.V_POST_NORM: (
+ "vision_tower.vision_model.post_layernorm",
+ "model.vision_model.post_layernorm", # SmolVLM
+ ),
+
+ MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
+ "resampler.pos_embed_k",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_ATTN_Q: (
+ "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj
+ ),
+
+ MODEL_TENSOR.V_RESMPL_ATTN_K: (
+ "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj
+ ),
+
+ MODEL_TENSOR.V_RESMPL_ATTN_V: (
+ "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj
+ ),
+
+ MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
+ "resampler.attn.out_proj",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_KV: (
+ "resampler.kv_proj",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_POST_NORM: (
+ "resampler.ln_post",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_KV_NORM: (
+ "resampler.ln_kv",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_Q_NORM: (
+ "resampler.ln_q",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_PROJ: (
+ "resampler.proj",
+ ),
+
+ MODEL_TENSOR.V_RESMPL_QUERY: (
+ "resampler.query",
+ ),
+
+ MODEL_TENSOR.V_TOK_EMBD_IMAGE:(
+ "v.tok_embd.image", # tensor generated from token embeddings
+ ),
+
+ MODEL_TENSOR.V_TOK_EMBD_END_IMAGE:(
+ "v.tok_embd.end_image", # tensor generated from token embeddings
+ ),
+
+ MODEL_TENSOR.V_TOK_EMBD_SLICE:(
+ "v.tok_embd.slice", # tensor generated from token embeddings
+ ),
+
+ MODEL_TENSOR.V_TOK_EMBD_END_SLICE:(
+ "v.tok_embd.end_slice", # tensor generated from token embeddings
+ ),
}
# architecture-specific block mappings
diff --git a/include/llama.h b/include/llama.h
index 6a44be404d914..85302c67dec8b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -231,6 +231,20 @@ extern "C" {
bool sorted;
} llama_token_data_array;
+ struct llama_vision_context;
+
+ // Represents the basic input unit of the vision model
+ // This can be a processed image or slices of images under the hood
+ struct llama_vision_tokens;
+
+ // represents an RGB image
+ // size of data must be equal to 3*nx*ny
+ typedef struct llama_vision_bitmap {
+ uint32_t nx;
+ uint32_t ny;
+ unsigned char * data;
+ } llama_vision_bitmap;
+
typedef bool (*llama_progress_callback)(float progress, void * user_data);
// Input data for llama_decode
@@ -255,6 +269,8 @@ extern "C" {
int32_t * n_seq_id;
llama_seq_id ** seq_id;
int8_t * logits; // TODO: rename this to "output"
+
+ struct ggml_tensor * embd_tensor;
} llama_batch;
enum llama_model_kv_override_type {
@@ -353,6 +369,10 @@ extern "C" {
void * abort_callback_data;
};
+ struct llama_vision_context_params {
+ int32_t n_threads;
+ };
+
// model quantization parameters
typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
@@ -390,6 +410,7 @@ extern "C" {
// TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
+ LLAMA_API struct llama_vision_context_params llama_vision_context_default_params(void);
LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
@@ -907,6 +928,10 @@ extern "C" {
int32_t embd,
int32_t n_seq_max);
+ // Allocates a batch based on a tensor; currently only used by the vision API
+ // Unlike llama_batch_get_one, the returned batch must be freed with llama_batch_free() after use
+ LLAMA_API struct llama_batch llama_batch_get_one_from_tensor(struct ggml_tensor * tensor, int32_t p0, int32_t seq_id);
+
// Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API void llama_batch_free(struct llama_batch batch);
@@ -1357,6 +1382,35 @@ extern "C" {
// TODO: extend in the future
//LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
+ //
+ // Vision API
+ //
+
+ // Vision context
+ LLAMA_API struct llama_vision_context * llama_vision_init_from_model(
+ const struct llama_model * model,
+ struct llama_vision_context_params params);
+ LLAMA_API void llama_vision_free(struct llama_vision_context * ctx);
+
+ // Container for RGB bitmap
+ LLAMA_API struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny);
+ LLAMA_API void llama_vision_bitmap_free(struct llama_vision_bitmap * bmp);
+
+ // Create image tokens from the RGB bitmap
+ LLAMA_API struct llama_vision_tokens * llama_vision_tokenize(
+ struct llama_vision_context * ctx,
+ struct llama_vision_bitmap * bmp);
+ LLAMA_API void llama_vision_tokens_free(struct llama_vision_tokens * img_tokens);
+
+ // The user must reserve N tokens in the tokenized text prompt for each image
+ // LLAMA_API int32_t llama_vision_get_n_tokens(const llama_vision_img_tokens * img_tokens);
+
+ // Encode patches into embeddings
+ LLAMA_API int32_t llama_vision_encode(
+ struct llama_vision_context * ctx,
+ struct llama_vision_tokens * img_tokens);
+ LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(struct llama_vision_context * ctx);
+
//
// Model split
//
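For orientation, a condensed sketch of how these calls are intended to be chained, mirroring `examples/vision/vision.cpp` from this patch. It assumes `model`, `ctx` and a filled `bmp` already exist; the helper name and the reduced error handling are illustrative, not part of the API:

```cpp
#include "llama.h"

// Encode one image and feed the resulting embeddings to the LLM at
// position n_past on sequence 0. Returns the number of image tokens
// decoded, or -1 on failure.
static int32_t decode_image(const struct llama_model * model,
                            struct llama_context * ctx,
                            struct llama_vision_bitmap * bmp,
                            int32_t n_past) {
    struct llama_vision_context_params vparams = llama_vision_context_default_params();
    vparams.n_threads = llama_n_threads(ctx);

    struct llama_vision_context * vctx = llama_vision_init_from_model(model, vparams);
    if (!vctx) {
        return -1; // the model has no vision encoder
    }

    // preprocess the bitmap into image tokens, then run the vision encoder
    struct llama_vision_tokens * img_tokens = llama_vision_tokenize(vctx, bmp);
    if (!img_tokens) {
        llama_vision_free(vctx);
        return -1;
    }
    if (llama_vision_encode(vctx, img_tokens) != 0) {
        llama_vision_tokens_free(img_tokens);
        llama_vision_free(vctx);
        return -1;
    }

    // the encoder output is decoded like a regular batch of embeddings
    struct ggml_tensor * img_embd = llama_vision_get_output_tensor(vctx);
    struct llama_batch batch = llama_batch_get_one_from_tensor(img_embd, n_past, 0);
    const int32_t n_img_tokens = batch.n_tokens;
    const int32_t ret = llama_decode(ctx, batch);

    llama_batch_free(batch);
    llama_vision_tokens_free(img_tokens);
    llama_vision_free(vctx);
    return ret == 0 ? n_img_tokens : -1;
}
```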
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b340dae5b28cd..aded67d4efdcc 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -27,9 +27,10 @@ add_library(llama
llama-quant.cpp
llama-sampling.cpp
llama-vocab.cpp
- unicode-data.cpp
- unicode.cpp
+ llama-vision.cpp
unicode.h
+ unicode.cpp
+ unicode-data.cpp
)
target_include_directories(llama PUBLIC . ../include ../common)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 28f2bbc8f72bf..f07ef9afe844c 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -3,6 +3,7 @@
#include "llama-impl.h"
#include <map>