models : move build_inp_out_ids outside loop (#17151)

CISC · web-flow · commit 7bef684118cc · 2025-11-10T22:55:30.000+01:00
* move build_inp_out_ids outside loop

* realign ggml_get_rows assignments in ernie4-5.cpp
diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp
@@ -1,7 +1,5 @@
 #include "models.h"
 
-
-
 llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -19,6 +17,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
 
     auto * inp_attn = build_attn_inp_kv();
 
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
@@ -67,9 +67,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
         }
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            ggml_tensor * inp_out_ids = build_inp_out_ids();
-            cur                       = ggml_get_rows(ctx0, cur, inp_out_ids);
-            inpSA                     = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
         cb(ffn_inp, "ffn_inp", il);
diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe-iswa.cpp
@@ -11,6 +11,8 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
 
     auto * inp_attn = build_attn_inp_kv_iswa();
 
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
@@ -69,7 +71,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
         }
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }

Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,8 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,`
`11`	`11`
`12`	`12`	`auto * inp_attn = build_attn_inp_kv_iswa();`
`13`	`13`
	`14`	`+ ggml_tensor * inp_out_ids = build_inp_out_ids();`
	`15`	`+`
`14`	`16`	`for (int il = 0; il < n_layer; ++il) {`
`15`	`17`	`ggml_tensor * inpSA = inpL;`
`16`	`18`
`@@ -69,7 +71,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,`
`69`	`71`	`}`
`70`	`72`	`if (il == n_layer - 1) {`
`71`	`73`	`// skip computing output for unused tokens`
`72`		`- ggml_tensor * inp_out_ids = build_inp_out_ids();`
`73`	`74`	`cur = ggml_get_rows(ctx0, cur, inp_out_ids);`
`74`	`75`	`inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);`
`75`	`76`	`}`