server : disallow use cases involving partial SWA context

ggml-ci
ggml-org · ggerganov · May 20, 2025 · Apr 28, 2025 · May 11, 2025 · May 12, 2025
commit 84742efdd6a2f5f48dd88bae19fac9c7878269a4
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
@@ -1807,10 +1807,7 @@ llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
 }
 
 bool llama_kv_cache_unified_iswa::get_can_shift() const {
-    // TODO: for now allow this, eventhough it's not mathematically correct
-    //       but some initial tests indicate that the results are not bad
-    return true;
-    //return kv_base->get_size() == kv_swa->get_size();
+    return kv_base->get_size() == kv_swa->get_size(); // shifting is reported as possible only when the base and SWA caches have the same size; previously this always returned true
 }
 
 void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {

@@ -3198,7 +3198,14 @@ struct server_context {
                                 // if we don't cache the prompt, we have to remove the entire KV cache
                                 llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                                 slot.n_past = 0;
-                                slot.cache_tokens.clear();
+                                slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
+                            }
+
+                            if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
+                                if (llama_kv_self_seq_pos_min(ctx, slot.id) > 0) {
+                                    SLT_WRN(slot, "%s", "forcing full prompt re-processing due to lack of cache data\n");
+                                    slot.n_past = 0;
+                                }
                             }
                         }