
Update llama_cpp: Sync LLAMA_API names with llama.cpp mainline. Needs more testing #1901

Closed · wants to merge 23 commits
Commits
786fb42
Submodule vendor/llama.cpp f7cd133..6171c9d
JamePeng Jan 21, 2025
ff3c9e1
Update llama_cpp: Sync LLAMA_API names with llama.cpp mainline. Needs…
JamePeng Jan 21, 2025
6eb78ec
fix the llama_model_load_from_splits function name
JamePeng Jan 21, 2025
1e09d92
correct the llama_tokenize param0 from self.model to self.vocab
JamePeng Jan 21, 2025
60e6b15
fix more params
JamePeng Jan 21, 2025
d1dbd0c
fix typo
JamePeng Jan 21, 2025
1f3096a
Updated deprecated llama_token_is_eog -> llama_vocab_is_eog
davidmroth Jan 26, 2025
84bcb2c
Correct some typos
JamePeng Jan 26, 2025
7ddf097
correct llama_chat_apply_template function params
JamePeng Jan 26, 2025
4dc2609
correct var type
JamePeng Jan 26, 2025
2fba9d8
Merge branch 'main' into fix-deprecated
davidmroth Jan 26, 2025
3ffc680
Move self._vocab to llama.py
JamePeng Jan 26, 2025
017e2a6
Merge pull request #1 from davidmroth/fix-deprecated
JamePeng Jan 27, 2025
1162207
Update submodule vendor/llama.cpp 6171c9d..df984e0
JamePeng Jan 27, 2025
baec8ff
add missing params in llama_tokenizer.py
JamePeng Jan 27, 2025
52327de
Rename _vocab to vocab in _internals.py
JamePeng Jan 27, 2025
e4d0d97
Fix the missing vocab params in llama.py
JamePeng Jan 27, 2025
12c3bf8
Add more vocab params in file :>
JamePeng Jan 27, 2025
559fb33
fix llama-cpp-python[server] breaks
ljm625 Jan 27, 2025
e9edb78
Merge pull request #2 from ljm625/main
JamePeng Jan 27, 2025
db2a845
Add the vocab params fix patch
JamePeng Jan 28, 2025
2b7d2df
[FIX] llama_chat_format.py: Update llama.llama_model_get_vocab -> lla…
davidmroth Jan 28, 2025
114b76b
Merge pull request #3 from davidmroth/main
JamePeng Jan 28, 2025
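
Taken together, the commits above make two kinds of changes: the LLAMA_API bindings are renamed to match llama.cpp mainline, and the vocab-related helpers now take an explicit llama_vocab handle instead of the model. A minimal sketch of the pattern at the ctypes level (illustrative only; the model path is a placeholder, and llama_model_get_vocab is assumed to be exposed by the synced bindings, as it is in llama.cpp mainline):

```python
import llama_cpp

# Renamed loader (was llama_load_model_from_file).
params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_model_load_from_file(b"models/example.gguf", params)  # placeholder path

# Vocab helpers now take a llama_vocab handle rather than the model.
vocab = llama_cpp.llama_model_get_vocab(model)       # assumed accessor from the synced headers
n_vocab = llama_cpp.llama_vocab_n_tokens(vocab)      # was llama_n_vocab(model)
bos_token = llama_cpp.llama_vocab_bos(vocab)         # was llama_token_bos(model)

# Renamed free (was llama_free_model).
llama_cpp.llama_model_free(model)
```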
101 changes: 51 additions & 50 deletions llama_cpp/_internals.py
@@ -48,7 +48,7 @@ def __init__(
raise ValueError(f"Model path does not exist: {path_model}")

with suppress_stdout_stderr(disable=verbose):
model = llama_cpp.llama_load_model_from_file(
model = llama_cpp.llama_model_load_from_file(
self.path_model.encode("utf-8"), self.params
)

@@ -60,7 +60,7 @@ def __init__(
def free_model():
if self.model is None:
return
llama_cpp.llama_free_model(self.model)
llama_cpp.llama_model_free(self.model)
self.model = None

self._exit_stack.callback(free_model)
@@ -71,20 +71,20 @@ def close(self):
def __del__(self):
self.close()

def vocab_type(self) -> int:
return llama_cpp.llama_vocab_type(self.model)
def vocab_type(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_type(vocab)

def n_vocab(self) -> int:
return llama_cpp.llama_n_vocab(self.model)
def n_vocab(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_n_tokens(vocab)

def n_ctx_train(self) -> int:
return llama_cpp.llama_n_ctx_train(self.model)
return llama_cpp.llama_model_n_ctx_train(self.model)

def n_embd(self) -> int:
return llama_cpp.llama_n_embd(self.model)
return llama_cpp.llama_model_n_embd(self.model)

def rope_freq_scale_train(self) -> float:
return llama_cpp.llama_rope_freq_scale_train(self.model)
return llama_cpp.llama_model_rope_freq_scale_train(self.model)

def desc(self) -> str:
buf = ctypes.create_string_buffer(1024)
@@ -97,95 +97,95 @@ def size(self) -> int:
def n_params(self) -> int:
return llama_cpp.llama_model_n_params(self.model)

def get_tensor(self, name: str) -> ctypes.c_void_p:
return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8"))

# Vocab

def token_get_text(self, token: int) -> str:
return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8")
def token_get_text(self, vocab:llama_cpp.llama_vocab_p, token: int) -> str:
return llama_cpp.llama_vocab_get_text(vocab, token).decode("utf-8")

def token_get_score(self, token: int) -> float:
return llama_cpp.llama_token_get_score(self.model, token)
def token_get_score(self, vocab:llama_cpp.llama_vocab_p, token: int) -> float:
return llama_cpp.llama_vocab_get_score(vocab, token)

def token_get_attr(self, token: int) -> int:
return llama_cpp.llama_token_get_attr(self.model, token)
def token_get_attr(self, vocab:llama_cpp.llama_vocab_p, token: int) -> int:
return llama_cpp.llama_vocab_get_attr(vocab, token)

# Special tokens

def token_bos(self) -> int:
return llama_cpp.llama_token_bos(self.model)
def token_bos(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_bos(vocab)

def token_eos(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_eos(vocab)

def token_eos(self) -> int:
return llama_cpp.llama_token_eos(self.model)
def token_eot(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_eot(vocab)

def token_cls(self) -> int:
return llama_cpp.llama_token_cls(self.model)
def token_cls(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_cls(vocab)

def token_sep(self) -> int:
return llama_cpp.llama_token_sep(self.model)
def token_sep(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_sep(vocab)

def token_nl(self) -> int:
return llama_cpp.llama_token_nl(self.model)
def token_nl(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_nl(vocab)

def token_prefix(self) -> int:
return llama_cpp.llama_token_prefix(self.model)
def token_pad(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_pad(vocab)

def token_middle(self) -> int:
return llama_cpp.llama_token_middle(self.model)
def token_prefix(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_fim_pre(vocab)

def token_suffix(self) -> int:
return llama_cpp.llama_token_suffix(self.model)
def token_middle(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_fim_mid(vocab)

def token_eot(self) -> int:
return llama_cpp.llama_token_eot(self.model)
def token_suffix(self, vocab:llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_fim_suf(vocab)

def add_bos_token(self) -> bool:
return llama_cpp.llama_add_bos_token(self.model)
def add_bos_token(self, vocab:llama_cpp.llama_vocab_p) -> bool:
return llama_cpp.llama_vocab_get_add_bos(vocab)

def add_eos_token(self) -> bool:
return llama_cpp.llama_add_eos_token(self.model)
def add_eos_token(self, vocab:llama_cpp.llama_vocab_p) -> bool:
return llama_cpp.llama_vocab_get_add_eos(vocab)

# Tokenization

def tokenize(self, text: bytes, add_bos: bool, special: bool):
def tokenize(self, vocab:llama_cpp.llama_vocab_p, text: bytes, add_bos: bool, special: bool):
n_ctx = self.n_ctx_train()
tokens = (llama_cpp.llama_token * n_ctx)()
n_tokens = llama_cpp.llama_tokenize(
self.model, text, len(text), tokens, n_ctx, add_bos, special
vocab, text, len(text), tokens, n_ctx, add_bos, special
)
if n_tokens < 0:
n_tokens = abs(n_tokens)
tokens = (llama_cpp.llama_token * n_tokens)()
n_tokens = llama_cpp.llama_tokenize(
self.model, text, len(text), tokens, n_tokens, add_bos, special
vocab, text, len(text), tokens, n_tokens, add_bos, special
)
if n_tokens < 0:
raise RuntimeError(
f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
)
return list(tokens[:n_tokens])

def token_to_piece(self, token: int, special: bool = False) -> bytes:
def token_to_piece(self, vocab:llama_cpp.llama_vocab_p, token: int, special: bool = False) -> bytes:
buf = ctypes.create_string_buffer(32)
llama_cpp.llama_token_to_piece(self.model, token, buf, 32, 0, special)
llama_cpp.llama_token_to_piece(vocab, token, buf, 32, 0, special)
return bytes(buf)

def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
def detokenize(self, vocab:llama_cpp.llama_vocab_p, tokens: List[int], special: bool = False) -> bytes:
output = b""
size = 32
buffer = (ctypes.c_char * size)()
for token in tokens:
n = llama_cpp.llama_token_to_piece(
self.model, llama_cpp.llama_token(token), buffer, size, 0, special
vocab, llama_cpp.llama_token(token), buffer, size, 0, special
)
assert n <= size
output += bytes(buffer[:n])
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return (
output[1:]
if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b" "
if len(tokens) > 0 and tokens[0] == self.token_bos(vocab) and output[0:1] == b" "
else output
)

@@ -605,10 +605,11 @@ def prev_str(self, ctx_main: LlamaContext, n: int) -> str:
def sample(
self,
ctx_main: LlamaContext,
vocab:llama_cpp.llama_vocab_p,
idx: int = 0,
logits_array: Optional[npt.NDArray[np.single]] = None,
):
n_vocab = ctx_main.model.n_vocab()
n_vocab = ctx_main.model.n_vocab(vocab)
id: int = 0

if logits_array is None:
@@ -629,7 +630,7 @@

# apply penalties
if len(self.prev) > 0:
nl_token = ctx_main.model.token_nl()
nl_token = ctx_main.model.token_nl(vocab)
nl_logit = logits_array[nl_token]
last_tokens = self.prev[-self.params.penalty_last_n :]
last_tokens_size = min(len(last_tokens), self.params.penalty_last_n)
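
The net effect inside _internals.py is that every vocab-touching helper (tokenize, detokenize, the token_* accessors, and the sampling context's sample()) now receives the vocab handle explicitly. A hedged usage sketch of the wrapper after this change; the LlamaModel class name, its keyword constructor, and the llama_model_get_vocab accessor are assumptions based on the surrounding code rather than part of this diff:

```python
import llama_cpp
from llama_cpp._internals import LlamaModel  # assumed wrapper class in _internals.py

params = llama_cpp.llama_model_default_params()
model = LlamaModel(path_model="models/example.gguf", params=params, verbose=False)

# Fetch the vocab handle once and thread it through the new signatures.
vocab = llama_cpp.llama_model_get_vocab(model.model)  # assumed accessor

tokens = model.tokenize(vocab, b"Hello, world!", add_bos=True, special=False)
print(model.n_vocab(vocab), model.token_bos(vocab))
print(model.detokenize(vocab, tokens))
```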