Support jinja extra template kwargs (Qwen3 enable_thinking feature), from command line and from client #13196


Open · wants to merge 9 commits into base: master
10 changes: 10 additions & 0 deletions common/arg.cpp
@@ -2774,6 +2774,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.ssl_file_cert = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
add_opt(common_arg(
{"--chat-template-kwargs"}, "STRING",
string_format("sets additional params for the json template parser"),
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
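For intuition, here is a rough Python equivalent of the new handler (hypothetical helper, not part of the PR): the flag's value is parsed as one top-level JSON object, and each entry is stored with its value re-serialized as a JSON string so it can be re-parsed per key later.

```python
import json

def parse_template_kwargs(value: str) -> dict[str, str]:
    """Parse a --chat-template-kwargs value the way the C++ handler does."""
    parsed = json.loads(value)  # must be a top-level JSON object
    # each value is re-serialized, so it can be re-parsed per key later
    return {key: json.dumps(val) for key, val in parsed.items()}

print(parse_template_kwargs('{"enable_thinking": false}'))
# {'enable_thinking': 'false'}
```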
8 changes: 7 additions & 1 deletion common/chat.cpp
@@ -34,6 +34,7 @@ struct templates_params {
bool add_generation_prompt = true;
bool extract_reasoning = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
json extra_context;
};

common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1582,7 +1583,7 @@ static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input,

static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, inputs.extra_context);
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
data.grammar_lazy = false;
if (!inputs.json_schema.is_null()) {
@@ -1613,6 +1614,11 @@ static common_chat_params common_chat_templates_apply_jinja(
params.tool_choice = inputs.tool_choice;
params.grammar = inputs.grammar;
params.now = inputs.now;

// re-parse the stringified kwargs into json values that the jinja
// template will see as extra top-level variables
for (const auto & el : inputs.chat_template_kwargs) {
    params.extra_context[el.first] = json::parse(el.second);
}

if (!inputs.json_schema.empty()) {
params.json_schema = json::parse(inputs.json_schema);
}
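The merged `extra_context` entries surface as plain variables inside the chat template. A small illustration using Python's `jinja2` as a stand-in for the bundled minja engine; the fragment is modeled on (not copied from) Qwen3's template:

```python
from jinja2 import Template

# fragment in the spirit of Qwen3's chat template: when enable_thinking
# is explicitly false, the template pre-closes the <think> block
fragment = (
    "{%- if enable_thinking is defined and not enable_thinking %}"
    "{{- '<think>\\n\\n</think>\\n\\n' }}"
    "{%- endif %}"
)
print(Template(fragment).render(enable_thinking=False))
```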
2 changes: 2 additions & 0 deletions common/chat.h
@@ -6,6 +6,7 @@
#include <chrono>
#include <string>
#include <vector>
#include <map>

struct common_chat_templates;

@@ -73,6 +74,7 @@ struct common_chat_templates_inputs {
bool parallel_tool_calls = false;
bool extract_reasoning = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::map<std::string, std::string> chat_template_kwargs;
Collaborator comment: This is a map from string keys to stringified JSON values. Why not just store the stringified top-level JSON? (Maybe name it `chat_additional_context_json` or `chat_template_kwargs_json`?)

};

struct common_chat_params {
3 changes: 3 additions & 0 deletions common/common.h
@@ -8,6 +8,7 @@
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>

#ifdef _WIN32
@@ -374,6 +375,8 @@ struct common_params {
std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; // NOLINT

std::map<std::string, std::string> default_template_kwargs;

// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
3 changes: 3 additions & 0 deletions tools/server/README.md
@@ -163,6 +163,7 @@ The project is under active development, and we are [looking for feedback and co
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
| `--chat-template-kwargs STRING` | JSON object with additional params for the jinja chat template. Example: `--chat-template-kwargs "{\"enable_thinking\":false}"`<br/>(env: CHAT_TEMPLATE_KWARGS) |
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -1111,6 +1112,8 @@ See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs

The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants", "type": "array" } } } }`), similar to other OpenAI-inspired API providers.

`chat_template_kwargs`: Allows sending additional parameters to the jinja templating system. For example: `{"enable_thinking": false}`

*Examples:*

You can use either Python `openai` library with appropriate checkpoints:
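A minimal client-side sketch of the new field, assuming a llama.cpp server at `localhost:8080` and the official `openai` Python package, which forwards non-standard fields via `extra_body` (the model name is a placeholder):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

resp = client.chat.completions.create(
    model="qwen3",  # placeholder; llama.cpp serves whatever model it loaded
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    # chat_template_kwargs is not part of the OpenAI schema, so the
    # client must pass it through extra_body
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
print(resp.choices[0].message.content)
```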
2 changes: 2 additions & 0 deletions tools/server/server.cpp
@@ -4341,6 +4341,7 @@ int main(int argc, char ** argv) {
body,
params.use_jinja,
params.reasoning_format,
params.default_template_kwargs,
ctx_server.chat_templates.get(),
ctx_server.mctx,
files);
@@ -4362,6 +4363,7 @@
body,
params.use_jinja,
params.reasoning_format,
params.default_template_kwargs,
ctx_server.chat_templates.get(),
ctx_server.mctx,
files);
14 changes: 14 additions & 0 deletions tools/server/utils.hpp
@@ -584,6 +584,7 @@ static json oaicompat_completion_params_parse(
const json & body, /* openai api json semantics */
bool use_jinja,
common_reasoning_format reasoning_format,
const std::map<std::string, std::string> & default_template_kwargs,
const struct common_chat_templates * tmpls,
bool allow_non_text,
std::vector<raw_buffer> & out_files)
@@ -726,6 +727,15 @@ static json oaicompat_completion_params_parse(
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);

// merge kwargs: server-side defaults first, then per-request values override
auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
for (const auto & item : default_template_kwargs) {
    inputs.chat_template_kwargs[item.first] = item.second;
}
for (const auto & item : chat_template_kwargs_object.items()) {
    inputs.chat_template_kwargs[item.key()] = item.value().dump();
}

if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
}
@@ -743,6 +753,10 @@
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
}

// reject an explicit enable_thinking kwarg when prefilling the
// assistant response, since the template may alter the assistant turn
if (inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
    throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
}

inputs.extract_reasoning = false;
inputs.add_generation_prompt = true;
}
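Net effect of the merge in `oaicompat_completion_params_parse`: per-request `chat_template_kwargs` override the server-side `--chat-template-kwargs` defaults, because they are written second. In illustrative Python terms:

```python
defaults = {"enable_thinking": "true"}   # from --chat-template-kwargs
request  = {"enable_thinking": "false"}  # from the request body
merged = {**defaults, **request}         # later writes win
print(merged)  # {'enable_thinking': 'false'}
```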