feat: save and restore a context sequence state #460

Merged
merged 38 commits on May 17, 2025
Changes from 1 commit
Commits (38)
11b5404
fix: adapt to breaking `llama.cpp` changes
giladgd May 11, 2025
8b98cf0
fix: improve GPU backend loading error description
giladgd May 11, 2025
1e8111c
chore: update template dependencies
giladgd May 11, 2025
2f9858a
test: Qwen 3 template
giladgd May 11, 2025
4c6e2b1
feat: configure Hugging Face remote endpoint for resolving URIs
giladgd May 11, 2025
d39d261
fix: race condition when reading extremely long gguf metadata
giladgd May 11, 2025
e740078
docs: typo
giladgd May 11, 2025
d6e852e
fix: update gguf types
giladgd May 11, 2025
9ab3c6d
fix: capture multi-token segment separators
giladgd May 11, 2025
656f2be
docs: solutions to more CUDA issues
giladgd May 11, 2025
6926425
feat: stream function call parameters
giladgd May 11, 2025
b369eaf
docs: update the awesome list
giladgd May 11, 2025
72c30dc
chore: update modules
giladgd May 11, 2025
df05d70
docs: more clear default values for custom cmake options
giladgd May 11, 2025
b3d510e
chore: reorder Vitepress config keys
giladgd May 11, 2025
3233603
fix: update gguf types
giladgd May 11, 2025
96c78da
docs: document new env vars
giladgd May 11, 2025
f7063d8
chore: module versions
giladgd May 12, 2025
123e524
chore: update GitHub issue templates
giladgd May 12, 2025
53a5206
test: check recommended model URIs
giladgd May 13, 2025
2e1a7ce
test: fix tests
giladgd May 14, 2025
9463ccc
feat(`QwenChatWrapper`): support discouraging the generation of thoughts
giladgd May 15, 2025
631a7e7
test: fix tests
giladgd May 15, 2025
a0cc198
feat: save and restore context sequence state
giladgd May 15, 2025
185b734
docs: save and restore context sequence state
giladgd May 15, 2025
d36670c
fix: adapt memory estimation to new added model architectures
giladgd May 15, 2025
a68590a
feat(`getLlama`): `dryRun` option
giladgd May 16, 2025
8c6134d
feat: `getLlamaGpuTypes` to get the list of available GPU types for t…
giladgd May 16, 2025
71babfa
fix: skip binary testing on certain problematic conditions
giladgd May 16, 2025
12cec69
docs: fix dead link
giladgd May 16, 2025
de3a360
fix: Paperspace tests setup script nodejs version
giladgd May 16, 2025
8eff306
fix: Windows build
giladgd May 17, 2025
f76e899
fix: types
giladgd May 17, 2025
0cbb572
test: fix tests
giladgd May 17, 2025
2c01084
fix: performance improvements
giladgd May 17, 2025
5d4c8c3
fix: remove unused files from the build dir
giladgd May 17, 2025
69d30cd
fix: remove unused line
giladgd May 17, 2025
62c8020
fix: performance improvements
giladgd May 17, 2025
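
The headline feature of this PR (commits a0cc198 and 185b734) lets a context sequence's evaluation state be persisted to disk and restored later. A rough sketch of how this might be used follows; the method names (saveStateToFile, loadStateFromFile) and the acceptRisk option are assumptions inferred from the commit titles rather than confirmed API, so refer to the docs added in commit 185b734 for the final shape:

import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const context = await model.createContext();
const sequence = context.getSequence();

// ... evaluate a prompt on the sequence ...

// Persist the sequence's evaluation state to disk (assumed API)
await sequence.saveStateToFile("sequence-state.bin");

// In a later run, restore the saved state instead of re-evaluating the prompt (assumed API)
await sequence.loadStateFromFile("sequence-state.bin", {acceptRisk: true});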
feat(QwenChatWrapper): support discouraging the generation of thoughts
giladgd committed May 15, 2025
commit 9463ccc7d1fcb5e7565ca24dd84211b4aa6f32ff
142 changes: 101 additions & 41 deletions src/chatWrappers/QwenChatWrapper.ts
@@ -1,7 +1,8 @@
import {ChatWrapper, ChatWrapperJinjaMatchConfiguration} from "../ChatWrapper.js";
import {
ChatModelFunctions, ChatWrapperCheckModelCompatibilityParams, ChatWrapperGenerateContextStateOptions, ChatWrapperGeneratedContextState,
ChatWrapperSettings, isChatModelResponseSegment
ChatModelFunctions, ChatModelResponse, ChatModelSegment, ChatWrapperCheckModelCompatibilityParams,
ChatWrapperGenerateContextStateOptions, ChatWrapperGeneratedContextState, ChatWrapperSettings, isChatModelResponseFunctionCall,
isChatModelResponseSegment
} from "../types.js";
import {LlamaText, SpecialToken, SpecialTokensText} from "../utils/LlamaText.js";
import {GgufArchitectureType} from "../gguf/types/GgufMetadataTypes.js";
@@ -12,40 +13,9 @@ export class QwenChatWrapper extends ChatWrapper {
public readonly wrapperName: string = "Qwen";

public readonly keepOnlyLastThought: boolean;
public readonly thoughts: "auto" | "discourage";

public override readonly settings: ChatWrapperSettings = {
supportsSystemMessages: true,
functions: {
call: {
optionalPrefixSpace: true,
prefix: LlamaText("\n", new SpecialTokensText("<tool_call>"), '\n{"name": "'),
paramsPrefix: '", "arguments": ',
suffix: LlamaText("}\n", new SpecialTokensText("</tool_call>")),
emptyCallParamsPlaceholder: {}
},
result: {
prefix: LlamaText(new SpecialTokensText("\n<tool_response>\n")),
suffix: LlamaText(new SpecialTokensText("\n</tool_response>"))
},
parallelism: {
call: {
sectionPrefix: "",
sectionSuffix: LlamaText(new SpecialTokensText("<|im_end|>\n"))
},
result: {
sectionPrefix: LlamaText(new SpecialTokensText("<|im_start|>user")),
sectionSuffix: LlamaText(new SpecialTokensText("<|im_end|>\n<|im_start|>assistant\n"))
}
}
},
segments: {
reiterateStackAfterFunctionCalls: true,
thought: {
prefix: LlamaText(new SpecialTokensText("<think>")),
suffix: LlamaText(new SpecialTokensText("</think>"))
}
}
};
public override readonly settings: ChatWrapperSettings;

public constructor(options: {
/**
@@ -55,15 +25,70 @@ export class QwenChatWrapper extends ChatWrapper {
*
* Defaults to `true`.
*/
keepOnlyLastThought?: boolean
keepOnlyLastThought?: boolean,

/**
* Control the usage of thoughts in the model responses.
*
* Defaults to `"auto"`.
*/
thoughts?: "auto" | "discourage",

/** @internal */
_lineBreakBeforeFunctionCallPrefix?: boolean
} = {}) {
super();

const {
keepOnlyLastThought = true
keepOnlyLastThought = true,
thoughts = "auto",
_lineBreakBeforeFunctionCallPrefix = false
} = options;

this.keepOnlyLastThought = keepOnlyLastThought;
this.thoughts = thoughts;

this.settings = {
supportsSystemMessages: true,
functions: {
call: {
optionalPrefixSpace: true,
prefix: LlamaText([
_lineBreakBeforeFunctionCallPrefix
? "\n"
: "",
new SpecialTokensText("<tool_call>"), '\n{"name": "'
]),
paramsPrefix: '", "arguments": ',
suffix: LlamaText("}\n", new SpecialTokensText("</tool_call>")),
emptyCallParamsPlaceholder: {}
},
result: {
prefix: LlamaText(new SpecialTokensText("\n<tool_response>\n")),
suffix: LlamaText(new SpecialTokensText("\n</tool_response>"))
},
parallelism: {
call: {
sectionPrefix: "",
betweenCalls: _lineBreakBeforeFunctionCallPrefix
? ""
: "\n",
sectionSuffix: LlamaText(new SpecialTokensText("<|im_end|>\n"))
},
result: {
sectionPrefix: LlamaText(new SpecialTokensText("<|im_start|>user")),
sectionSuffix: LlamaText(new SpecialTokensText("<|im_end|>\n<|im_start|>assistant\n"))
}
}
},
segments: {
reiterateStackAfterFunctionCalls: true,
thought: {
prefix: LlamaText(new SpecialTokensText("<think>")),
suffix: LlamaText(new SpecialTokensText("</think>"))
}
}
};
}

public override generateContextState({
@@ -115,14 +140,18 @@
} else if (item.type === "model") {
flush();

const transformedModelResponse = (this.thoughts === "discourage" && isLastItem)
? discourageThoughtsInModelResponse(item.response)
: item.response;

currentAggregateFocus = null;
modelTexts.push(
this.generateModelResponseText(
(this.keepOnlyLastThought && !isLastItem)
? item.response.filter((response) => (
? transformedModelResponse.filter((response) => (
!isChatModelResponseSegment(response) || response.segmentType !== "thought"
))
: item.response
: transformedModelResponse
)
);
} else
@@ -204,13 +233,44 @@
/** @internal */
public static override _checkModelCompatibility(options: ChatWrapperCheckModelCompatibilityParams): boolean {
const architecture = options.fileInfo?.metadata.general.architecture;
return architecture == null || architecture === GgufArchitectureType.qwen2;
return (
architecture == null ||
architecture === GgufArchitectureType.qwen2 ||
architecture === GgufArchitectureType.qwen2moe ||
architecture === GgufArchitectureType.qwen2vl ||
architecture === GgufArchitectureType.qwen3 ||
architecture === GgufArchitectureType.qwen3moe
);
}

/** @internal */
public static override _getOptionConfigurationsToTestIfCanSupersedeJinjaTemplate(): ChatWrapperJinjaMatchConfiguration<typeof this> {
return [
[undefined, {}, {_requireFunctionCallSettingsExtraction: true}]
[{}, {}, {_requireFunctionCallSettingsExtraction: true}],
[{_lineBreakBeforeFunctionCallPrefix: true}, {}, {_requireFunctionCallSettingsExtraction: true}]
];
}
}

function discourageThoughtsInModelResponse(response: ChatModelResponse["response"]) {
const emptyThought: ChatModelSegment = {
type: "segment",
segmentType: "thought",
ended: true,
text: "\n\n",
raw: LlamaText(new SpecialTokensText("<think>\n\n</think>\n\n")).toJSON()
};
const res: ChatModelResponse["response"] = [...response];

for (let i = res.length - 1; i >= 0; i--) {
const item = res[i];

if (isChatModelResponseFunctionCall(item)) {
res.splice(i + 1, 0, emptyThought);
return res;
}
}

res.unshift(emptyThought);
return res;
}
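
For reference, a minimal usage sketch of the new thoughts option added in this commit (assuming QwenChatWrapper is exported from the package and used with the usual LlamaChatSession setup; the model path and prompt are placeholders):

import {getLlama, LlamaChatSession, QwenChatWrapper} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/qwen3-model.gguf"}); // placeholder path
const context = await model.createContext();

const session = new LlamaChatSession({
    contextSequence: context.getSequence(),
    chatWrapper: new QwenChatWrapper({
        // "discourage" makes the wrapper render an empty <think></think> segment
        // into the in-progress model response, steering the model away from generating thoughts
        thoughts: "discourage"
    })
});

console.log(await session.prompt("What is the capital of France?"));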