kv-cache : update warning logs when no space for the batch is available

ggml-ci
ggml-org · ggerganov · May 20, 2025 · Apr 28, 2025 · May 11, 2025 · May 12, 2025
commit 00731579358d37d6bd28286cad9cdd0991984039
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -948,8 +948,6 @@ int llama_context::decode(llama_batch & inp_batch) {
 
         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
             return 1;
         }
 
@@ -2640,9 +2638,17 @@ int32_t llama_decode(
           llama_batch   batch) {
     int ret = ctx->decode(batch);
 
+    // defrag and try again
+    // TODO: distinguish return code when we are sure that even after defrag there is no space available
     if (ret == 1) {
         llama_kv_self_defrag(ctx);
         ret = ctx->decode(batch);
+
+        if (ret == 1) {
+            LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+            return ret;
+        }
     }
 
     if (ret != 0) {