Skip to content

llama : save and restore kv cache for single seq id #6341

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 34 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
662aaea
llama : save and restore kv cache for single seq id
kaetemi Mar 27, 2024
5462817
remove trailing whitespace
kaetemi Mar 27, 2024
ab1c46a
respond error in case there's no space in the kv cache
kaetemi Mar 27, 2024
02a1840
add kv seq save restore to test case
kaetemi Mar 27, 2024
b8e8fac
add --slot-save-path arg to enable save restore and restrict save loc…
kaetemi Mar 27, 2024
b182f8f
Returning 0 for some cases, instead of asserting.
martindevans Mar 27, 2024
a2b48b9
cleanup error cases
kaetemi Mar 27, 2024
c4443d7
rename sequence state functions
kaetemi Mar 28, 2024
4d5356b
rename state get set functions
kaetemi Mar 28, 2024
bbcbf47
add previous function names back in with DEPRECATED notice
kaetemi Mar 29, 2024
8b5ae29
update doc
kaetemi Mar 29, 2024
a71ec3d
adjust endpoints to preferred style
kaetemi Mar 29, 2024
bf1d493
fix restoring zero cell count
kaetemi Mar 29, 2024
8ab1a17
handle seq rm return value
kaetemi Mar 29, 2024
0d22136
unused param
kaetemi Mar 29, 2024
29f18c2
keep in the size check
kaetemi Mar 29, 2024
f2e41b3
fix return types
kaetemi Mar 29, 2024
92c4681
add server test case for slot save restore
kaetemi Mar 29, 2024
60f685f
cleanup
kaetemi Mar 29, 2024
d38eef4
add cake
kaetemi Mar 30, 2024
ea717f7
cleanup style
kaetemi Mar 30, 2024
b509b8b
add special
kaetemi Mar 30, 2024
129b6ff
removing a whole sequence never fails
kaetemi Mar 30, 2024
8af7211
move sequence state file functionality from server to llama to match …
kaetemi Mar 30, 2024
3d6fa5b
catch exceptions on save as well
kaetemi Apr 1, 2024
b3f6da3
error log messages
kaetemi Apr 1, 2024
be714a0
check types for stricter restore
kaetemi Apr 1, 2024
0ccfbf2
update server doc
kaetemi Apr 1, 2024
205c44c
readme : update API changes date
ggerganov Apr 4, 2024
d9fd0d7
Merge branch 'master' into feature/save-restore-seq
kaetemi Apr 4, 2024
f2a4777
strict filename validation
kaetemi Apr 5, 2024
4a4f399
move include, reject bom as well
kaetemi Apr 5, 2024
2fbf0c3
also reject empty filename
kaetemi Apr 5, 2024
bf94e9f
reject whitespace and trailing dot
kaetemi Apr 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
cleanup error cases
  • Loading branch information
kaetemi committed Mar 27, 2024
commit a2b48b95f59fd96007fd5a59c52744671a0f7c49
2 changes: 1 addition & 1 deletion examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1693,7 +1693,7 @@ struct server_context {

size_t nread = llama_set_seq_data(ctx, state_data.data(), slot->id + 1);
if (nread == 0) {
send_error(task, "Unable to restore slot, no available space in KV cache", ERROR_TYPE_INVALID_REQUEST);
send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
break;
}
GGML_ASSERT(nread <= state_data.size());
Expand Down
12 changes: 8 additions & 4 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15290,8 +15290,6 @@ size_t llama_set_seq_data(struct llama_context * ctx, const uint8_t * src, llama

const uint32_t kv_size = kv_self.size;
const uint32_t kv_head = kv_self.head;
GGML_ASSERT(n_layer == n_layer_ref);
GGML_ASSERT(n_embd_v_gqa == n_embd_v_gqa_ref);

// For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
for (int il = 0; il < (int)n_layer; ++il) {
Expand All @@ -15300,7 +15298,10 @@ size_t llama_set_seq_data(struct llama_context * ctx, const uint8_t * src, llama
memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
inp += sizeof(k_size_row_ref);
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
GGML_ASSERT(k_size_row == k_size_row_ref);
if (k_size_row != k_size_row_ref) {
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
return 0;
}

// Read and set the keys for the whole cell range
ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
Expand All @@ -15315,7 +15316,10 @@ size_t llama_set_seq_data(struct llama_context * ctx, const uint8_t * src, llama
inp += sizeof(v_size_el_ref);

const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
GGML_ASSERT(v_size_el == v_size_el_ref);
if (v_size_el != v_size_el_ref) {
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
return 0;
}

// For each row in the transposed matrix, read the values for the whole cell range
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
Expand Down