From 2c7c0f5ae9cf19a5c2f2eea2d6c2f75fdbce5bc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 10 May 2025 13:03:19 +0200 Subject: [PATCH 1/3] add seed-coder vocab --- include/llama.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/llama.h b/include/llama.h index a18f365bff6f2..7d5f9d559816d 100644 --- a/include/llama.h +++ b/include/llama.h @@ -112,6 +112,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, + LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, }; enum llama_rope_type { From bf33e5a22243d66c7473cc11ace587ef69549b0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 10 May 2025 13:03:54 +0200 Subject: [PATCH 2/3] add seed-coder vocab --- src/llama-vocab.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 50ded286f3f5f..e6b96ecc8b78b 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -415,6 +415,13 @@ struct llm_tokenizer_bpe : llm_tokenizer { "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_SEED_CODER: + regex_exprs = { + // original regex from tokenizer.json + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+" + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -1634,6 +1641,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "bailingmoe") { pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; clean_spaces = false; + } else if ( + tokenizer_pre == "seed-coder") { + pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; + clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } From 5732ffebd6b56eb931b6a6efb83522b94d62f9df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 10 May 2025 13:04:55 +0200 Subject: [PATCH 3/3] add seed-coder vocab --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + 2 files changed, 4 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bf6bc68380b19..44b1f8f42a8d5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -794,6 +794,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3": # ref: https://huggingface.co/mistral-community/pixtral-12b res = "pixtral" + if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec": + # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base + res = "seed-coder" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 03a1d8d8c9b42..5993a4c9836b5 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -116,6 +116,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", }, {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", }, {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", }, + {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, ]