
Commit 83de321

Merge branch 'ggerganov:master' into master
2 parents: 04302a5 + fe7ea02


42 files changed: +6860 / -4411 lines

CMakeLists.txt

Lines changed: 6 additions & 5 deletions

@@ -105,11 +105,12 @@ if (GGML_ALL_WARNINGS)
 endif()

 if (NOT MSVC)
-    add_compile_options(
-        "$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
-        "$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
-        "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
-    )
+    # TODO: temporary disabled until we figure out ggml-metal.m
+    #add_compile_options(
+    #    "$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
+    #    "$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
+    #    "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
+    #)
 endif()

 #
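For context, -Werror=vla turns any use of a variable-length array into a hard compile error in the C, C++, and CUDA translation units. A minimal sketch of the kind of code the flag rejects (hypothetical function, not from this commit):

// Hypothetical snippet, not part of this commit: with -Werror=vla,
// GCC/Clang reject the stack array below because its size is only
// known at runtime (a C99 VLA, which is non-standard in C++).
void scale_rows(int n, float factor, float * out) {
    float buf[n];  // VLA: compile error under -Werror=vla
    for (int i = 0; i < n; ++i) {
        buf[i] = i * factor;
        out[i] = buf[i];
    }
}

The commit comments the flag out rather than deleting it; the TODO suggests it is meant to be re-enabled once the VLA use in ggml-metal.m is resolved.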

Package.swift

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ let package = Package(
         resources: [
             .process("src/ggml-metal.metal")
         ],
-        publicHeadersPath: "include/ggml",
+        publicHeadersPath: "spm-headers",
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
             .define("GGML_USE_ACCELERATE"),

README.md

Lines changed: 1 addition & 0 deletions

@@ -49,6 +49,7 @@ Some of the development is currently happening in the [llama.cpp](https://github
 - [X] Example of Qwen inference [QwenLM/qwen.cpp](https://github.com/QwenLM/qwen.cpp)
 - [X] Example of YOLO inference [examples/yolo](https://github.com/ggerganov/ggml/tree/master/examples/yolo)
 - [X] Example of ViT inference [staghado/vit.cpp](https://github.com/staghado/vit.cpp)
+- [X] Example of multiple LLMs inference [foldl/chatllm.cpp](https://github.com/foldl/chatllm.cpp)
 - [X] SeamlessM4T inference *(in development)* https://github.com/facebookresearch/seamless_communication/tree/main/ggml

 ## Whisper inference (example)

examples/common-ggml.cpp

Lines changed: 5 additions & 1 deletion

@@ -62,6 +62,8 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_ALL_F32:
         case GGML_FTYPE_MOSTLY_F16:
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+        case GGML_FTYPE_MOSTLY_IQ2_XXS:
+        case GGML_FTYPE_MOSTLY_IQ2_XS:
             {
                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                 return false;
@@ -182,7 +184,7 @@ bool ggml_common_quantize_0(
             case GGML_TYPE_Q5_K:
             case GGML_TYPE_Q6_K:
                 {
-                    cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
+                    cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
                 } break;
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
@@ -191,6 +193,8 @@ bool ggml_common_quantize_0(
             case GGML_TYPE_I32:
             case GGML_TYPE_Q8_1:
             case GGML_TYPE_Q8_K:
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_COUNT:
                 {
                     fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));

examples/gpt-2/main-backend.cpp

Lines changed: 1 addition & 1 deletion

@@ -209,7 +209,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         model.backend = ggml_backend_metal_init();
         if (!model.backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);

examples/gpt-2/main-batched.cpp

Lines changed: 1 addition & 1 deletion

@@ -298,7 +298,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         model.backend = ggml_backend_metal_init();
         if (!model.backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);

examples/gpt-2/main.cpp

Lines changed: 2 additions & 2 deletions

@@ -118,7 +118,7 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         gpu_backend = ggml_backend_metal_init();
         if (!gpu_backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
@@ -947,7 +947,7 @@ int main(int argc, char ** argv) {
     ggml_backend_sched_t sched;
     {
         // initialize the scheduler
-        sched = ggml_backend_sched_new(model.backends.data(), model.backends.size());
+        sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);

         // create the worst case graph for memory usage estimation
         int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
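The updated ggml_backend_sched_new takes two extra arguments: an optional array of buffer types (NULL selects each backend's default) and the maximum graph size the scheduler should reserve for. A hedged sketch of the updated initialization (GPT2_MAX_NODES is whatever node budget the example defines; the signature is assumed from the call site):

// Sketch only; assumes the post-change signature
//   ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends,
//                                               ggml_backend_buffer_type_t * bufts,
//                                               int n_backends, size_t graph_size);
ggml_backend_sched_t sched = ggml_backend_sched_new(
    model.backends.data(),   // backends in priority order (e.g. Metal first, CPU last)
    NULL,                    // buffer types; NULL = each backend's default buffer type
    model.backends.size(),   // number of backends
    GPT2_MAX_NODES);         // upper bound on graph nodes, sizes internal bookkeeping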

examples/python/ggml/__init__.pyi

Lines changed: 2 additions & 21 deletions

@@ -506,15 +506,6 @@ class lib:
                 struct ggml_tensor * a);
         """
         ...
-    def ggml_cont_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-        """
-        make contiguous, in-place
-
-        GGML_API struct ggml_tensor * ggml_cont_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor * a);
-        """
-        ...
     def ggml_conv_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, p0: int, d0: int) -> ffi.CData:
         """
         GGML_API struct ggml_tensor * ggml_conv_1d(
@@ -614,16 +605,6 @@ class lib:
                 struct ggml_tensor * b);
         """
         ...
-    def ggml_cpy_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-        """
-        a -> b, in-place, return view(b)
-
-        GGML_API struct ggml_tensor * ggml_cpy_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor * a,
-                struct ggml_tensor * b);
-        """
-        ...
     def ggml_cross_entropy_loss(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
         """
         GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
@@ -1202,7 +1183,7 @@ class lib:
         - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
         - max_size specifies the maximum size of a tensor and is used to create shared views such
           that it is guaranteed that the tensor will fit in at least one of the views
-
+

         bool ggml_metal_add_buffer(
                 struct ggml_metal_context * ctx,
@@ -2428,4 +2409,4 @@ class lib:
         ...
     def quantize_row_q8_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
         """void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);"""
-        ...
+        ...

examples/python/ggml/cffi.py

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

examples/starcoder/CMakeLists.txt

Lines changed: 0 additions & 7 deletions

@@ -5,13 +5,6 @@ set(TEST_TARGET starcoder)
 add_executable(${TEST_TARGET} main.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

-#
-# starcoder-mmap
-
-set(TEST_TARGET starcoder-mmap)
-add_executable(${TEST_TARGET} starcoder-mmap.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
 #
 # starcoder-quantize
