
Commit 83de321

Merge branch 'ggerganov:master' into master
2 parents: 04302a5 + fe7ea02


42 files changed: +6860 / -4411 lines

CMakeLists.txt

Lines changed: 6 additions & 5 deletions

@@ -105,11 +105,12 @@ if (GGML_ALL_WARNINGS)
 endif()

 if (NOT MSVC)
-    add_compile_options(
-        "$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
-        "$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
-        "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
-    )
+    # TODO: temporary disabled until we figure out ggml-metal.m
+    #add_compile_options(
+    #    "$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
+    #    "$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
+    #    "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
+    #)
 endif()

 #
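For context, -Werror=vla turns any use of a variable-length array into a hard compile error in the C, C++, and CUDA translation units. A minimal sketch of the kind of code the flag rejects (hypothetical function, not from this commit):

// Hypothetical snippet, not part of this commit: with -Werror=vla,
// GCC/Clang reject the stack array below because its size is only
// known at runtime (a C99 VLA, which is non-standard in C++).
void scale_rows(int n, float factor, float * out) {
    float buf[n];  // VLA: compile error under -Werror=vla
    for (int i = 0; i < n; ++i) {
        buf[i] = i * factor;
        out[i] = buf[i];
    }
}

The commit comments the flag out rather than deleting it; the TODO suggests it is meant to be re-enabled once the VLA use in ggml-metal.m is resolved.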

Package.swift

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ let package = Package(
         resources: [
             .process("src/ggml-metal.metal")
         ],
-        publicHeadersPath: "include/ggml",
+        publicHeadersPath: "spm-headers",
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
             .define("GGML_USE_ACCELERATE"),

README.md

Lines changed: 1 addition & 0 deletions

@@ -49,6 +49,7 @@ Some of the development is currently happening in the [llama.cpp](https://github
 - [X] Example of Qwen inference [QwenLM/qwen.cpp](https://github.com/QwenLM/qwen.cpp)
 - [X] Example of YOLO inference [examples/yolo](https://github.com/ggerganov/ggml/tree/master/examples/yolo)
 - [X] Example of ViT inference [staghado/vit.cpp](https://github.com/staghado/vit.cpp)
+- [X] Example of multiple LLMs inference [foldl/chatllm.cpp](https://github.com/foldl/chatllm.cpp)
 - [X] SeamlessM4T inference *(in development)* https://github.com/facebookresearch/seamless_communication/tree/main/ggml

 ## Whisper inference (example)

examples/common-ggml.cpp

Lines changed: 5 additions & 1 deletion

@@ -62,6 +62,8 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_ALL_F32:
         case GGML_FTYPE_MOSTLY_F16:
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+        case GGML_FTYPE_MOSTLY_IQ2_XXS:
+        case GGML_FTYPE_MOSTLY_IQ2_XS:
             {
                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                 return false;
@@ -182,7 +184,7 @@ bool ggml_common_quantize_0(
             case GGML_TYPE_Q5_K:
             case GGML_TYPE_Q6_K:
                 {
-                    cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
+                    cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
                 } break;
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
@@ -191,6 +193,8 @@ bool ggml_common_quantize_0(
             case GGML_TYPE_I32:
             case GGML_TYPE_Q8_1:
             case GGML_TYPE_Q8_K:
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_COUNT:
                 {
                     fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));

examples/gpt-2/main-backend.cpp

Lines changed: 1 addition & 1 deletion

@@ -209,7 +209,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         model.backend = ggml_backend_metal_init();
         if (!model.backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);

examples/gpt-2/main-batched.cpp

Lines changed: 1 addition & 1 deletion

@@ -298,7 +298,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         model.backend = ggml_backend_metal_init();
         if (!model.backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);

examples/gpt-2/main.cpp

Lines changed: 2 additions & 2 deletions

@@ -118,7 +118,7 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         gpu_backend = ggml_backend_metal_init();
         if (!gpu_backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
@@ -947,7 +947,7 @@ int main(int argc, char ** argv) {
     ggml_backend_sched_t sched;
     {
         // initialize the scheduler
-        sched = ggml_backend_sched_new(model.backends.data(), model.backends.size());
+        sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);

         // create the worst case graph for memory usage estimation
         int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
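The updated ggml_backend_sched_new takes two extra arguments: an optional array of buffer types (NULL selects each backend's default) and the maximum graph size the scheduler should reserve for. A hedged sketch of the updated initialization (GPT2_MAX_NODES is whatever node budget the example defines; the signature is assumed from the call site):

// Sketch only; assumes the post-change signature
//   ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends,
//                                               ggml_backend_buffer_type_t * bufts,
//                                               int n_backends, size_t graph_size);
ggml_backend_sched_t sched = ggml_backend_sched_new(
    model.backends.data(),   // backends in priority order (e.g. Metal first, CPU last)
    NULL,                    // buffer types; NULL = each backend's default buffer type
    model.backends.size(),   // number of backends
    GPT2_MAX_NODES);         // upper bound on graph nodes, sizes internal bookkeeping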

examples/python/ggml/__init__.pyi

Lines changed: 2 additions & 21 deletions

@@ -506,15 +506,6 @@ class lib:
                 struct ggml_tensor * a);
         """
         ...
-    def ggml_cont_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-        """
-        make contiguous, in-place
-
-        GGML_API struct ggml_tensor * ggml_cont_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor * a);
-        """
-        ...
     def ggml_conv_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, p0: int, d0: int) -> ffi.CData:
         """
         GGML_API struct ggml_tensor * ggml_conv_1d(
@@ -614,16 +605,6 @@ class lib:
                 struct ggml_tensor * b);
         """
         ...
-    def ggml_cpy_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-        """
-        a -> b, in-place, return view(b)
-
-        GGML_API struct ggml_tensor * ggml_cpy_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor * a,
-                struct ggml_tensor * b);
-        """
-        ...
     def ggml_cross_entropy_loss(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
         """
         GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
@@ -1202,7 +1183,7 @@ class lib:
         - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
         - max_size specifies the maximum size of a tensor and is used to create shared views such
           that it is guaranteed that the tensor will fit in at least one of the views
-
+

         bool ggml_metal_add_buffer(
                 struct ggml_metal_context * ctx,
@@ -2428,4 +2409,4 @@ class lib:
         ...
     def quantize_row_q8_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
         """void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);"""
-        ...
+        ...

examples/python/ggml/cffi.py

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

examples/starcoder/CMakeLists.txt

Lines changed: 0 additions & 7 deletions

@@ -5,13 +5,6 @@ set(TEST_TARGET starcoder)
 add_executable(${TEST_TARGET} main.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

-#
-# starcoder-mmap
-
-set(TEST_TARGET starcoder-mmap)
-add_executable(${TEST_TARGET} starcoder-mmap.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
 #
 # starcoder-quantize
