feat: save and restore a context sequence state #460

Merged
38 commits merged on May 17, 2025
Changes from 1 commit
Commits (38):
11b5404
fix: adapt to breaking `llama.cpp` changes
giladgd May 11, 2025
8b98cf0
fix: improve GPU backend loading error description
giladgd May 11, 2025
1e8111c
chore: update template dependencies
giladgd May 11, 2025
2f9858a
test: Qwen 3 template
giladgd May 11, 2025
4c6e2b1
feat: configure Hugging Face remote endpoint for resolving URIs
giladgd May 11, 2025
d39d261
fix: race condition when reading extremely long gguf metadata
giladgd May 11, 2025
e740078
docs: typo
giladgd May 11, 2025
d6e852e
fix: update gguf types
giladgd May 11, 2025
9ab3c6d
fix: capture multi-token segment separators
giladgd May 11, 2025
656f2be
docs: solutions to more CUDA issues
giladgd May 11, 2025
6926425
feat: stream function call parameters
giladgd May 11, 2025
b369eaf
docs: update the awesome list
giladgd May 11, 2025
72c30dc
chore: update modules
giladgd May 11, 2025
df05d70
docs: more clear default values for custom cmake options
giladgd May 11, 2025
b3d510e
chore: reorder Vitepress config keys
giladgd May 11, 2025
3233603
fix: update gguf types
giladgd May 11, 2025
96c78da
docs: document new env vars
giladgd May 11, 2025
f7063d8
chore: module versions
giladgd May 12, 2025
123e524
chore: update GitHub issue templates
giladgd May 12, 2025
53a5206
test: check recommended model URIs
giladgd May 13, 2025
2e1a7ce
test: fix tests
giladgd May 14, 2025
9463ccc
feat(`QwenChatWrapper`): support discouraging the generation of thoughts
giladgd May 15, 2025
631a7e7
test: fix tests
giladgd May 15, 2025
a0cc198
feat: save and restore context sequence state
giladgd May 15, 2025
185b734
docs: save and restore context sequence state
giladgd May 15, 2025
d36670c
fix: adapt memory estimation to new added model architectures
giladgd May 15, 2025
a68590a
feat(`getLlama`): `dryRun` option
giladgd May 16, 2025
8c6134d
feat: `getLlamaGpuTypes` to get the list of available GPU types for t…
giladgd May 16, 2025
71babfa
fix: skip binary testing on certain problematic conditions
giladgd May 16, 2025
12cec69
docs: fix dead link
giladgd May 16, 2025
de3a360
fix: Paperspace tests setup script nodejs version
giladgd May 16, 2025
8eff306
fix: Windows build
giladgd May 17, 2025
f76e899
fix: types
giladgd May 17, 2025
0cbb572
test: fix tests
giladgd May 17, 2025
2c01084
fix: performance improvements
giladgd May 17, 2025
5d4c8c3
fix: remove unused files from the build dir
giladgd May 17, 2025
69d30cd
fix: remove unused line
giladgd May 17, 2025
62c8020
fix: performance improvements
giladgd May 17, 2025
feat: stream function call parameters
giladgd committed May 11, 2025
commit 6926425c555778f68c08ac119dab8a99d334bc57
80 changes: 76 additions & 4 deletions src/evaluator/LlamaChat/LlamaChat.ts
@@ -105,6 +105,37 @@ export type LlamaChatResponseSegmentChunk = {
segmentEndTime?: Date
};

export type LlamaChatResponseFunctionCallParamsChunk = {
/**
* Each different function call has a different `callIndex`.
*
* When the previous function call has finished being generated, the `callIndex` of the next one will increment.
*
* Use this value to distinguish between different function calls.
*/
callIndex: number,

/**
* The name of the function being called
*/
functionName: string,

/**
* A chunk of the generated text used for the function call parameters.
*
* Collect all the chunks together to construct the full function call parameters.
*
* After the function call is finished, the entire constructed params text can be parsed as a JSON object,
* according to the function parameters schema.
*/
paramsChunk: string,

/**
* When this is `true`, the current chunk is the last chunk in the generation of the current function call parameters.
*/
done: boolean
};

export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* Called as the model generates the main response with the generated text chunk.
@@ -253,15 +284,32 @@ export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunction
functions?: never,
documentFunctionParams?: never,
maxParallelFunctionCalls?: never,
onFunctionCall?: never
onFunctionCall?: never,
onFunctionCallParamsChunk?: never
} | {
grammar?: never,
functions?: Functions | ChatModelFunctions,
documentFunctionParams?: boolean,
maxParallelFunctionCalls?: number,
onFunctionCall?: (
functionCall: LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions ? Functions : ChatModelFunctions>
) => void
) => void,

/**
* Called as the model generates function calls with the generated parameters chunk for each function call.
*
* Useful for streaming the generated function call parameters as they're being generated.
* Only useful in specific use cases,
* such as showing the textual content of a generated file as it's being generated (note that doing this requires parsing incomplete JSON).
*
* The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
* according to the function parameters schema.
*
* Each function call has its own `callIndex` you can use to distinguish between them.
*
* Only relevant when using function calling (via passing the `functions` option).
*/
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void
});

export type LLamaChatLoadAndCompleteUserMessageOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
@@ -465,6 +513,7 @@ export class LlamaChat {
onTextChunk,
onToken,
onResponseChunk,
onFunctionCallParamsChunk,
signal,
stopOnAbortSignal = false,
maxTokens,
@@ -501,6 +550,7 @@
onTextChunk,
onToken,
onResponseChunk,
onFunctionCallParamsChunk,
signal,
stopOnAbortSignal,
maxTokens,
@@ -1433,6 +1483,7 @@ class GenerateResponseState<const Functions extends ChatModelFunctions | undefin
private readonly onTextChunk: LLamaChatGenerateResponseOptions<Functions>["onTextChunk"];
private readonly onToken: LLamaChatGenerateResponseOptions<Functions>["onToken"];
private readonly onResponseChunk: LLamaChatGenerateResponseOptions<Functions>["onResponseChunk"];
private readonly onFunctionCallParamsChunk: LLamaChatGenerateResponseOptions<Functions>["onFunctionCallParamsChunk"];
private readonly signal: LLamaChatGenerateResponseOptions<Functions>["signal"];
private readonly stopOnAbortSignal: LLamaChatGenerateResponseOptions<Functions>["stopOnAbortSignal"];
public readonly maxTokens: LLamaChatGenerateResponseOptions<Functions>["maxTokens"];
@@ -1531,6 +1582,7 @@ class GenerateResponseState<const Functions extends ChatModelFunctions | undefin
onTextChunk,
onToken,
onResponseChunk,
onFunctionCallParamsChunk,
signal,
stopOnAbortSignal = false,
maxTokens,
@@ -1563,6 +1615,7 @@ class GenerateResponseState<const Functions extends ChatModelFunctions | undefin
this.onTextChunk = safeEventCallback(onTextChunk);
this.onToken = safeEventCallback(onToken);
this.onResponseChunk = safeEventCallback(onResponseChunk);
this.onFunctionCallParamsChunk = safeEventCallback(onFunctionCallParamsChunk);
this.signal = signal;
this.stopOnAbortSignal = stopOnAbortSignal;
this.maxTokens = maxTokens;
@@ -2238,14 +2291,33 @@ class GenerateResponseState<const Functions extends ChatModelFunctions | undefin
StopGenerationDetector.resolveStopTriggers(this.functionsGrammar.stopGenerationTriggers, this.llamaChat.model.tokenizer)
.map((stopTrigger) => functionParamsGenerationDoneDetector.addStopTrigger(stopTrigger));

for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
pushAll(this.currentFunctionCallCurrentPartTokens, tokens);
if (this.currentFunctionCallCurrentPartTokens.length > 0)
this.onFunctionCallParamsChunk?.({
callIndex: this.resFunctionCalls.length,
functionName: this.functionEvaluationFunctionName,
paramsChunk: this.llamaChat.model.detokenize(this.currentFunctionCallCurrentPartTokens, false, lastPartTokens),
done: false
});

for await (const tokens of this.evaluateWithContextShift(loadContextWindow)) {
functionParamsGenerationDoneDetector.recordGeneration({
text: this.currentText,
tokens: this.currentTokens
});

this.onFunctionCallParamsChunk?.({
callIndex: this.resFunctionCalls.length,
functionName: this.functionEvaluationFunctionName,
paramsChunk: this.llamaChat.model.detokenize(
tokens,
false,
resolveLastTokens([lastPartTokens, this.currentFunctionCallCurrentPartTokens])
),
done: functionParamsGenerationDoneDetector.hasTriggeredStops
});

pushAll(this.currentFunctionCallCurrentPartTokens, tokens);

if (functionParamsGenerationDoneDetector.hasTriggeredStops)
break;
}
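The docblocks above describe the intended consumption pattern: collect the `paramsChunk` pieces of each `callIndex` and parse the accumulated text as JSON once `done` is `true`. Below is a minimal consumer sketch (not part of this diff), assuming `LlamaChatResponseFunctionCallParamsChunk` is imported from the package's public exports (added in `src/index.ts` further down):

```typescript
import type {LlamaChatResponseFunctionCallParamsChunk} from "node-llama-cpp";

// accumulated params text per function call, keyed by `callIndex`
const paramsTextByCall = new Map<number, string>();

function handleParamsChunk(chunk: LlamaChatResponseFunctionCallParamsChunk) {
    const accumulated = (paramsTextByCall.get(chunk.callIndex) ?? "") + chunk.paramsChunk;
    paramsTextByCall.set(chunk.callIndex, accumulated);

    if (chunk.done) {
        // once `done` is `true`, the accumulated text is complete JSON
        // that conforms to the called function's parameters schema
        const params = JSON.parse(accumulated);
        console.log(`call #${chunk.callIndex} ${chunk.functionName}:`, params);
    }
}
```

Passing `handleParamsChunk` as the `onFunctionCallParamsChunk` option would then print each call's full parameters once its generation completes.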
40 changes: 36 additions & 4 deletions src/evaluator/LlamaChatSession/LlamaChatSession.ts
@@ -8,7 +8,8 @@ import {appendUserMessageToChatHistory} from "../../utils/appendUserMessageToCha
import {LlamaContextSequence} from "../LlamaContext/LlamaContext.js";
import {LlamaGrammar} from "../LlamaGrammar.js";
import {
LlamaChat, LLamaChatContextShiftOptions, LlamaChatResponse, LlamaChatResponseFunctionCall, LlamaChatResponseChunk
LlamaChat, LLamaChatContextShiftOptions, LlamaChatResponse, LlamaChatResponseFunctionCall, LlamaChatResponseChunk,
LlamaChatResponseFunctionCallParamsChunk
} from "../LlamaChat/LlamaChat.js";
import {EvaluationPriority} from "../LlamaContext/types.js";
import {TokenBias} from "../TokenBias.js";
@@ -197,12 +198,29 @@ export type LLamaChatPromptOptions<Functions extends ChatSessionModelFunctions |
grammar?: LlamaGrammar,
functions?: never,
documentFunctionParams?: never,
maxParallelFunctionCalls?: never
maxParallelFunctionCalls?: never,
onFunctionCallParamsChunk?: never
} | {
grammar?: never,
functions?: Functions | ChatSessionModelFunctions,
documentFunctionParams?: boolean,
maxParallelFunctionCalls?: number
maxParallelFunctionCalls?: number,

/**
* Called as the model generates function calls with the generated parameters chunk for each function call.
*
* Useful for streaming the generated function call parameters as they're being generated.
* Only useful in specific use cases,
* such as showing the textual content of a generated file as it's being generated (note that doing this requires parsing incomplete JSON).
*
* The constructed text from all the params chunks of a given function call can be parsed as a JSON object,
* according to the function parameters schema.
*
* Each function call has its own `callIndex` you can use to distinguish between them.
*
* Only relevant when using function calling (via passing the `functions` option).
*/
onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void
});

export type LLamaChatCompletePromptOptions = {
@@ -424,6 +442,7 @@ export class LlamaChatSession {
onTextChunk,
onToken,
onResponseChunk,
onFunctionCallParamsChunk,
signal,
stopOnAbortSignal = false,
maxTokens,
@@ -445,8 +464,10 @@
functions: functions as undefined,
documentFunctionParams: documentFunctionParams as undefined,
maxParallelFunctionCalls: maxParallelFunctionCalls as undefined,
onFunctionCallParamsChunk: onFunctionCallParamsChunk as undefined,

onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, grammar,
onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal, maxTokens,
temperature, minP, topK, topP, seed, grammar,
trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers
});

@@ -464,6 +485,7 @@
onTextChunk,
onToken,
onResponseChunk,
onFunctionCallParamsChunk,
signal,
stopOnAbortSignal = false,
maxTokens,
@@ -500,6 +522,7 @@
let newContextWindowChatHistory = lastEvaluation?.contextWindow == null
? undefined
: appendUserMessageToChatHistory(lastEvaluation?.contextWindow, prompt);
let previousFunctionCalls: number = 0;

const resolvedResponsePrefix = (responsePrefix != null && responsePrefix !== "")
? responsePrefix
@@ -553,6 +576,14 @@
onTextChunk: safeEventCallback(onTextChunk),
onToken: safeEventCallback(onToken),
onResponseChunk: safeEventCallback(onResponseChunk),
onFunctionCallParamsChunk: onFunctionCallParamsChunk == null
? undefined
: safeEventCallback((chunk) => onFunctionCallParamsChunk?.({
callIndex: previousFunctionCalls + chunk.callIndex,
functionName: chunk.functionName,
paramsChunk: chunk.paramsChunk,
done: chunk.done
})),
signal: abortController.signal,
stopOnAbortSignal,
repeatPenalty,
@@ -675,6 +706,7 @@
});

startNewChunk = false;
previousFunctionCalls++;
}

lastEvaluation.cleanHistory = newChatHistory;
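At the session level, `onFunctionCallParamsChunk` is forwarded from `prompt()`, with `callIndex` offset by `previousFunctionCalls` so indices stay unique across follow-up evaluation rounds. An illustrative usage sketch follows; the model path and the `getWeather` function are placeholders, not part of this PR:

```typescript
import {getLlama, LlamaChatSession, defineChatSessionFunction} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

const functions = {
    // placeholder function, used here only to trigger function calling
    getWeather: defineChatSessionFunction({
        description: "Get the current weather in a city",
        params: {
            type: "object",
            properties: {
                city: {type: "string"}
            }
        },
        handler({city}) {
            return {city, temperatureCelsius: 21};
        }
    })
};

const response = await session.prompt("What's the weather in Paris?", {
    functions,
    onFunctionCallParamsChunk(chunk) {
        // stream the partial params JSON of each call as it's generated
        process.stdout.write(`[call ${chunk.callIndex} ${chunk.functionName}] ${chunk.paramsChunk}`);

        if (chunk.done)
            process.stdout.write("\n");
    }
});

console.log(response);
```

Since each `paramsChunk` is a fragment of JSON, anything beyond plain display (such as rendering a file's content as it's generated) requires incrementally parsing incomplete JSON.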
5 changes: 3 additions & 2 deletions src/index.ts
@@ -32,7 +32,7 @@ import {
LlamaChat, type LlamaChatOptions, type LLamaChatGenerateResponseOptions, type LLamaChatLoadAndCompleteUserMessageOptions,
type LLamaChatContextShiftOptions, type LlamaChatResponse, type LlamaChatResponseFunctionCall,
type LlamaChatLoadAndCompleteUserResponse, type LlamaChatResponseChunk, type LlamaChatResponseTextChunk,
type LlamaChatResponseSegmentChunk, type LlamaChatResponseSegment
type LlamaChatResponseSegmentChunk, type LlamaChatResponseFunctionCallParamsChunk, type LlamaChatResponseSegment
} from "./evaluator/LlamaChat/LlamaChat.js";
import {
LlamaChatSessionPromptCompletionEngine, type LLamaChatPromptCompletionEngineOptions
@@ -109,7 +109,7 @@ import {
type GgufMetadataBloom, type GgufMetadataFalcon, type GgufMetadataMamba, isGgufMetadataOfArchitectureType
} from "./gguf/types/GgufMetadataTypes.js";
import {GgmlType, type GgufTensorInfo} from "./gguf/types/GgufTensorInfoTypes.js";
import {type ModelFileAccessTokens} from "./utils/modelFileAccesTokens.js";
import {type ModelFileAccessTokens} from "./utils/modelFileAccessTokens.js";
import {type OverridesObject} from "./utils/OverridesObject.js";
import type {LlamaClasses} from "./utils/getLlamaClasses.js";
import type {ChatHistoryFunctionCallMessageTemplate} from "./chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js";
@@ -183,6 +183,7 @@ export {
type LlamaChatResponseChunk,
type LlamaChatResponseTextChunk,
type LlamaChatResponseSegmentChunk,
type LlamaChatResponseFunctionCallParamsChunk,
type LlamaChatResponseSegment,
LlamaChatSessionPromptCompletionEngine,
type LLamaChatPromptCompletionEngineOptions,