Support jinja extra template kwargs (Qwen3 enable_thinking feature), from command line and from client #13196


Open · wants to merge 9 commits into base: master
10 changes: 10 additions & 0 deletions common/arg.cpp
@@ -2774,6 +2774,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.ssl_file_cert = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
add_opt(common_arg(
{"--chat-template-kwargs"}, "STRING",
string_format("sets additional params for the json template parser"),
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
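For intuition, here is a rough Python equivalent of the new handler (hypothetical helper, not part of the PR): the flag's value is parsed as one top-level JSON object, and each entry is stored with its value re-serialized as a JSON string so it can be re-parsed per key later.

```python
import json

def parse_template_kwargs(value: str) -> dict[str, str]:
    """Parse a --chat-template-kwargs value the way the C++ handler does."""
    parsed = json.loads(value)  # must be a top-level JSON object
    # each value is re-serialized, so it can be re-parsed per key later
    return {key: json.dumps(val) for key, val in parsed.items()}

print(parse_template_kwargs('{"enable_thinking": false}'))
# {'enable_thinking': 'false'}
```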
8 changes: 7 additions & 1 deletion common/chat.cpp
@@ -34,6 +34,7 @@ struct templates_params {
bool add_generation_prompt = true;
bool extract_reasoning = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
json extra_context;
};

common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1582,7 +1583,7 @@ static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input,

static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, inputs.extra_context);
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
data.grammar_lazy = false;
if (!inputs.json_schema.is_null()) {
@@ -1613,6 +1614,11 @@ static common_chat_params common_chat_templates_apply_jinja(
params.tool_choice = inputs.tool_choice;
params.grammar = inputs.grammar;
params.now = inputs.now;

// re-parse the stringified kwargs into json values that the jinja
// template will see as extra top-level variables
for (const auto & el : inputs.chat_template_kwargs) {
    params.extra_context[el.first] = json::parse(el.second);
}

if (!inputs.json_schema.empty()) {
params.json_schema = json::parse(inputs.json_schema);
}
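The merged `extra_context` entries surface as plain variables inside the chat template. A small illustration using Python's `jinja2` as a stand-in for the bundled minja engine; the fragment is modeled on (not copied from) Qwen3's template:

```python
from jinja2 import Template

# fragment in the spirit of Qwen3's chat template: when enable_thinking
# is explicitly false, the template pre-closes the <think> block
fragment = (
    "{%- if enable_thinking is defined and not enable_thinking %}"
    "{{- '<think>\\n\\n</think>\\n\\n' }}"
    "{%- endif %}"
)
print(Template(fragment).render(enable_thinking=False))
```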
2 changes: 2 additions & 0 deletions common/chat.h
@@ -6,6 +6,7 @@
#include <chrono>
#include <string>
#include <vector>
#include <map>

struct common_chat_templates;

@@ -73,6 +74,7 @@ struct common_chat_templates_inputs {
bool parallel_tool_calls = false;
bool extract_reasoning = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::map<std::string, std::string> chat_template_kwargs;
Collaborator comment: This is a map from string keys to stringified JSON values. Why not just store the stringified top-level JSON? (Maybe name it `chat_additional_context_json` or `chat_template_kwargs_json`?)

};

struct common_chat_params {
3 changes: 3 additions & 0 deletions common/common.h
@@ -8,6 +8,7 @@
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>

#ifdef _WIN32
@@ -374,6 +375,8 @@ struct common_params {
std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; // NOLINT

std::map<std::string, std::string> default_template_kwargs;

// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
3 changes: 3 additions & 0 deletions tools/server/README.md
@@ -163,6 +163,7 @@ The project is under active development, and we are [looking for feedback and co
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
| `--chat-template-kwargs STRING` | JSON object with additional params for the jinja chat template. Example: `--chat-template-kwargs "{\"enable_thinking\":false}"`<br/>(env: CHAT_TEMPLATE_KWARGS) |
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -1111,6 +1112,8 @@ See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs

The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants", "type": "array" } } } }`), similar to other OpenAI-inspired API providers.

`chat_template_kwargs`: Allows sending additional parameters to the jinja templating system. For example: `{"enable_thinking": false}`

*Examples:*

You can use either Python `openai` library with appropriate checkpoints:
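A minimal client-side sketch of the new field, assuming a llama.cpp server at `localhost:8080` and the official `openai` Python package, which forwards non-standard fields via `extra_body` (the model name is a placeholder):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

resp = client.chat.completions.create(
    model="qwen3",  # placeholder; llama.cpp serves whatever model it loaded
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    # chat_template_kwargs is not part of the OpenAI schema, so the
    # client must pass it through extra_body
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
print(resp.choices[0].message.content)
```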
2 changes: 2 additions & 0 deletions tools/server/server.cpp
@@ -4341,6 +4341,7 @@ int main(int argc, char ** argv) {
body,
params.use_jinja,
params.reasoning_format,
params.default_template_kwargs,
ctx_server.chat_templates.get(),
ctx_server.mctx,
files);
@@ -4362,6 +4363,7 @@
body,
params.use_jinja,
params.reasoning_format,
params.default_template_kwargs,
ctx_server.chat_templates.get(),
ctx_server.mctx,
files);
14 changes: 14 additions & 0 deletions tools/server/utils.hpp
@@ -584,6 +584,7 @@ static json oaicompat_completion_params_parse(
const json & body, /* openai api json semantics */
bool use_jinja,
common_reasoning_format reasoning_format,
const std::map<std::string, std::string> & default_template_kwargs,
const struct common_chat_templates * tmpls,
bool allow_non_text,
std::vector<raw_buffer> & out_files)
@@ -726,6 +727,15 @@ static json oaicompat_completion_params_parse(
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);

// merge kwargs: server-side defaults first, then per-request values override
auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
for (const auto & item : default_template_kwargs) {
    inputs.chat_template_kwargs[item.first] = item.second;
}
for (const auto & item : chat_template_kwargs_object.items()) {
    inputs.chat_template_kwargs[item.key()] = item.value().dump();
}

if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
}
@@ -743,6 +753,10 @@
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
}

// reject an explicit enable_thinking kwarg when prefilling the
// assistant response, since the template may alter the assistant turn
if (inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
    throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
}

inputs.extract_reasoning = false;
inputs.add_generation_prompt = true;
}
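Net effect of the merge in `oaicompat_completion_params_parse`: per-request `chat_template_kwargs` override the server-side `--chat-template-kwargs` defaults, because they are written second. In illustrative Python terms:

```python
defaults = {"enable_thinking": "true"}   # from --chat-template-kwargs
request  = {"enable_thinking": "false"}  # from the request body
merged = {**defaults, **request}         # later writes win
print(merged)  # {'enable_thinking': 'false'}
```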