From 6c12db851e2bfbec4601cb80edc6e2eb1fac02ae Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 12 May 2025 21:43:59 +0200 Subject: [PATCH] llama-bench : add defrag-thold, check for invalid ranges --- include/llama.h | 2 +- tools/llama-bench/README.md | 7 ++-- tools/llama-bench/llama-bench.cpp | 55 ++++++++++++++++++++++++------- 3 files changed, 49 insertions(+), 15 deletions(-) diff --git a/include/llama.h b/include/llama.h index abedebdb78af1..99e5fba244fcc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -345,7 +345,7 @@ extern "C" { float yarn_beta_fast; // YaRN low correction dim float yarn_beta_slow; // YaRN high correction dim uint32_t yarn_orig_ctx; // YaRN original context size - float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default) + float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default) ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index 4fb2a24e19e10..0479f81a30b55 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -43,12 +43,13 @@ test parameters: -ub, --ubatch-size (default: 512) -ctk, --cache-type-k (default: f16) -ctv, --cache-type-v (default: f16) - -t, --threads (default: 16) + -dt, --defrag-thold (default: -1) + -t, --threads (default: system dependent) -C, --cpu-mask (default: 0x0) --cpu-strict <0|1> (default: 0) --poll <0...100> (default: 50) -ngl, --n-gpu-layers (default: 99) - -rpc, --rpc (default: ) + -rpc, --rpc (default: none) -sm, --split-mode (default: layer) -mg, --main-gpu (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0) @@ -62,7 +63,7 @@ test parameters: Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Ranges can be given as -'start-end' or 'start-end+step' or 'start-end*mult'. +'first-last' or 'first-last+step' or 'first-last*mult'. ``` llama-bench can perform three types of tests: diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index ca0d0aed5e9fa..9457e6815e231 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -211,6 +211,8 @@ static std::vector parse_int_range(const std::string & s) { for (int i = first; i <= last;) { result.push_back(i); + int prev_i = i; + if (op == '+') { i += step; } else if (op == '*') { @@ -218,6 +220,10 @@ static std::vector parse_int_range(const std::string & s) { } else { throw std::invalid_argument("invalid range format"); } + + if (i <= prev_i) { + throw std::invalid_argument("invalid range"); + } } search_start = match.suffix().first; } @@ -239,6 +245,7 @@ struct cmd_params { std::vector n_ubatch; std::vector type_k; std::vector type_v; + std::vector defrag_thold; std::vector n_threads; std::vector cpu_mask; std::vector cpu_strict; @@ -274,6 +281,7 @@ static const cmd_params cmd_params_defaults = { /* n_ubatch */ { 512 }, /* type_k */ { GGML_TYPE_F16 }, /* type_v */ { GGML_TYPE_F16 }, + /* defrag_thold */ { -1.0f }, /* n_threads */ { cpu_get_num_math() }, /* cpu_mask */ { "0x0" }, /* cpu_strict */ { false }, @@ -335,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) { join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -dt, --defrag-thold (default: %s)\n", + join(cmd_params_defaults.defrag_thold, ",").c_str()); printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -C, --cpu-mask (default: %s)\n", @@ -368,7 +378,7 @@ static void print_usage(int /* argc */, char ** argv) { printf( "Multiple values can be given for each parameter by separating them with ','\n" "or by specifying the parameter multiple times. Ranges can be given as\n" - "'start-end' or 'start-end+step' or 'start-end*mult'.\n"); + "'first-last' or 'first-last+step' or 'first-last*mult'.\n"); } static ggml_type ggml_type_from_name(const std::string & s) { @@ -519,6 +529,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.type_v.insert(params.type_v.end(), types.begin(), types.end()); + } else if (arg == "-dt" || arg == "--defrag-thold") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end()); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -825,6 +842,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } + if (params.defrag_thold.empty()) { + params.defrag_thold = cmd_params_defaults.defrag_thold; + } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } @@ -883,6 +903,7 @@ struct cmd_params_instance { int n_ubatch; ggml_type type_k; ggml_type type_v; + float defrag_thold; int n_threads; std::string cpu_mask; bool cpu_strict; @@ -959,15 +980,16 @@ struct cmd_params_instance { llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen + n_depth; - cparams.n_batch = n_batch; - cparams.n_ubatch = n_ubatch; - cparams.type_k = type_k; - cparams.type_v = type_v; - cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn = flash_attn; - cparams.embeddings = embeddings; - cparams.op_offload = !no_op_offload; + cparams.n_ctx = n_prompt + n_gen + n_depth; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; + cparams.defrag_thold = defrag_thold; + cparams.offload_kqv = !no_kv_offload; + cparams.flash_attn = flash_attn; + cparams.embeddings = embeddings; + cparams.op_offload = !no_op_offload; return cparams; } @@ -992,6 +1014,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) + for (const auto & defrag_thold : params.defrag_thold) for (const auto & nkvo : params.no_kv_offload) for (const auto & fa : params.flash_attn) for (const auto & nt : params.n_threads) @@ -1012,6 +1035,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, + /* .defrag_thold = */ defrag_thold, /* .n_threads = */ nt, /* .cpu_mask = */ cm, /* .cpu_strict = */ cs, @@ -1044,6 +1068,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, + /* .defrag_thold = */ defrag_thold, /* .n_threads = */ nt, /* .cpu_mask = */ cm, /* .cpu_strict = */ cs, @@ -1076,6 +1101,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, + /* .defrag_thold = */ defrag_thold, /* .n_threads = */ nt, /* .cpu_mask = */ cm, /* .cpu_strict = */ cs, @@ -1117,6 +1143,7 @@ struct test { int poll; ggml_type type_k; ggml_type type_v; + float defrag_thold; int n_gpu_layers; llama_split_mode split_mode; int main_gpu; @@ -1151,6 +1178,7 @@ struct test { poll = inst.poll; type_k = inst.type_k; type_v = inst.type_v; + defrag_thold = inst.defrag_thold; n_gpu_layers = inst.n_gpu_layers; split_mode = inst.split_mode; main_gpu = inst.main_gpu; @@ -1206,6 +1234,7 @@ struct test { "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "defrag_thold", "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; @@ -1225,7 +1254,7 @@ struct test { field == "use_mmap" || field == "embeddings") { return BOOL; } - if (field == "avg_ts" || field == "stddev_ts") { + if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") { return FLOAT; } return STRING; @@ -1292,6 +1321,7 @@ struct test { std::to_string(flash_attn), tensor_split_str, tensor_buft_overrides_str, + std::to_string(defrag_thold), std::to_string(use_mmap), std::to_string(embeddings), std::to_string(no_op_offload), @@ -1558,6 +1588,9 @@ struct markdown_printer : public printer { if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { fields.emplace_back("type_v"); } + if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) { + fields.emplace_back("defrag_thold"); + } if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { fields.emplace_back("main_gpu"); }