llama : Support llama 4 text-only #12791

Merged · 21 commits · Apr 7, 2025
add chunk attn mask
ngxson committed Apr 7, 2025
commit e6a2809c2d42042cb5e64052117be1e36af53b83
12 changes: 10 additions & 2 deletions src/llama-graph.cpp
@@ -474,9 +474,17 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
             }
 
             // may need to cut off old tokens for sliding window
+            // TODO @ngxson : the check for n_attn_chunk is temporary, need to optimize it
             if (data_swa) {
-                if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                    f = -INFINITY;
+                if (hparams.n_attn_chunk) {
+                    llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
+                    if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
+                        f = -INFINITY;
+                    }
Comment on lines +480 to +483

Member:
@ngxson Here in this check, I think that the second condition is always false. So we can simplify to:

...
if (kv_self->cells[i].pos < pos_chunk_start) {
...
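
(Side note, not from the original thread: the redundancy follows from integer division. Since pos_chunk_start is computed as (pos / n_attn_chunk) * n_attn_chunk, it rounds down and can never exceed pos, so pos < pos_chunk_start can never hold. A minimal standalone check is sketched below; the value 8192 mirrors the chunk size hard-coded later in this PR.)

// illustration only, not llama.cpp code
#include <cassert>
#include <cstdint>

int main() {
    const int32_t n_attn_chunk = 8192;
    for (int32_t pos = 0; pos < 4 * n_attn_chunk; ++pos) {
        // same computation as in the diff above
        const int32_t pos_chunk_start = (pos / n_attn_chunk) * n_attn_chunk;
        assert(pos_chunk_start <= pos); // hence `pos < pos_chunk_start` is always false
    }
    return 0;
}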

@ngxson (Collaborator, Author) · May 11, 2025:

Ah yeah thanks for noticing, yes it is redundant.

Btw I'm thinking about how to refactor the mask generation in a way that makes the code easier to understand (i.e. so it reads closer to English). My idea looks like this:

for (int j = 0; j < n_seq_tokens; ++j) {
    const llama_pos batch_pos = ubatch->pos[s*n_seq_tokens + j];
    for (int i = 0; i < n_kv; ++i) {
        const llama_pos cache_pos = kv_self->cells[i].pos;
        bool masked = false; // masked tokens will not be attended to

        if (causal) {
            // mask future tokens outside of the batch
            masked = masked || (cache_pos > batch_pos);
        }

        if (hparams.n_attn_chunk) {
            // mask past tokens outside of the chunk
            llama_pos pos_chunk_start = (batch_pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
            masked = masked || (cache_pos < pos_chunk_start);
        }

        if (!kv_self->cells[i].has_seq_id(seq_id)) {
            // mask tokens that are not in the same sequence
            masked = true;
        }
    }
}

WDYT?

Member:

Yes, we can improve - I'm already doing some improvements in this regard in #13194

+                } else {
+                    if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
+                        f = -INFINITY;
+                    }
                 }
                 data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
             }
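
(Illustration only, not part of the diff: the tiny standalone program below, with a hypothetical chunk size of 4 instead of 8192, prints which cache positions each query position may attend to under the chunked + causal rule implemented above; 'x' marks attended positions, '.' marks masked ones.)

#include <cstdint>
#include <cstdio>

int main() {
    const int32_t n_attn_chunk = 4; // small chunk purely for readability
    const int32_t n_pos        = 8;

    for (int32_t pos = 0; pos < n_pos; ++pos) {              // query position
        const int32_t pos_chunk_start = (pos / n_attn_chunk) * n_attn_chunk;
        for (int32_t kv = 0; kv < n_pos; ++kv) {             // cached position
            // masked if before the current chunk or in the future (causal)
            const bool masked = kv < pos_chunk_start || kv > pos;
            putchar(masked ? '.' : 'x');
        }
        putchar('\n');
    }
    return 0;
}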
1 change: 1 addition & 0 deletions src/llama-hparams.h
@@ -114,6 +114,7 @@ struct llama_hparams {
 
     uint32_t n_moe_layer_step = 0;
     bool use_kq_norm = true;
+    uint32_t n_attn_chunk = 0;
     // values below seems to be fixed on llama4
     uint32_t n_no_rope_layer_step = 4;
     uint32_t n_attn_temp_floor_scale = 8192;
5 changes: 5 additions & 0 deletions src/llama-model.cpp
@@ -557,6 +557,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
             ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
             ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
+            // hack: we use SWA to store the chunked attn mask
Member:
Yes, SWA -> AUX makes sense.

Btw, we should soon implement actual SWA / chunked attention that uses less memory. It shouldn't be a big change and will improve memory usage significantly for such models.

Collaborator Author:
The rename requires quite a few more changes than I expected, so I think I'll do it in another PR so it can be tested more thoroughly. Here I'll only edit my comment to make it clearer that I'm using the "swa" variable to store the chunked mask.

Collaborator Author:
I'll do this after we have the automated logits-matching test. The problem is that renaming n_swa to n_pattern_aux makes the existing "if (n_swa) then use SWA" check invalid; it would have to become "if (n_pattern_aux && is_swa) then use SWA".

I think it's better to add an enum called llama_mask_aux with 3 values: none, swa, chunked, so that the code becomes clearer.
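
(A rough sketch of what that enum could look like, illustration only: the value names follow the comment above, and the field hparams.mask_aux is a hypothetical placeholder, not part of this PR.)

// illustrative sketch only -- not actual llama.cpp code
enum llama_mask_aux {
    LLAMA_MASK_AUX_NONE,    // no auxiliary mask
    LLAMA_MASK_AUX_SWA,     // sliding-window attention mask
    LLAMA_MASK_AUX_CHUNKED, // chunked attention mask (llama 4)
};

// the old "if (n_swa) then use SWA" check would then become explicit, e.g.:
// if (hparams.mask_aux == LLAMA_MASK_AUX_SWA) { /* use SWA */ }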

+            // luckily, the n_swa_pattern is the same as chunked layer pattern: 3 chunked - 1 full
+            hparams.n_swa_pattern = 4;
+            hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+            hparams.n_swa = 1; // unused, added to trigger the SWA
 
             switch (hparams.n_expert) {
                 case 16: type = LLM_TYPE_17B_16E; break;