@@ -547,7 +547,7 @@ bool llama_eval(
     static void * buf = malloc(buf_size);
 
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead
         // fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
 
         // reallocate
@@ -747,6 +747,49 @@ bool llama_eval(
     return true;
 }
 
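+// softmax in double precision: convert a vector of raw logits into normalized probabilities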
+std::vector<double> softmax(const std::vector<float>& logits) {
+    std::vector<double> probs(logits.size());
+    float max_logit = logits[0];
+    for (float v : logits) max_logit = std::max(max_logit, v);
+    double sum_exp = 0.0;
+    for (size_t i = 0; i < logits.size(); i++) {
+        // Subtract the maximum logit value from the current logit value for numerical stability
+        float logit = logits[i] - max_logit;
+        double exp_logit = std::exp(logit);
+        sum_exp += exp_logit;
+        probs[i] = exp_logit;
+    }
+    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
+    return probs;
+}
+
+void perplexity(const gpt_vocab &vocab, const llama_model &model, const gpt_params &params, size_t mem_per_token) {
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    std::vector<gpt_vocab::id> tokens = ::llama_tokenize(vocab, params.prompt, true);
+
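+    // Score the text in non-overlapping windows of n_ctx tokens; any trailing partial window is dropped by the integer division below.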
+    double nll = 0.0;
+    int seq_count = tokens.size() / params.n_ctx;
+    for (int i = 0; i < seq_count; ++i) {
+        int start = i * params.n_ctx;
+        int end = start + params.n_ctx - 1;
+        std::vector<gpt_vocab::id> embd(tokens.begin() + start, tokens.begin() + end);
+        std::vector<float> logits;
+        if (!llama_eval(model, params.n_threads, 0, embd, logits, mem_per_token)) {
+            fprintf(stderr, "Failed to predict\n");
+            return;
+        }
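+        // Each window contributes a single term to the NLL: the probability assigned to tokens[end] after evaluating the preceding n_ctx - 1 tokens.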
+        // Calculate probability of next token, given the previous ones.
+        double prob = softmax(logits)[tokens[end]];
+        nll += -std::log(prob);
+        // perplexity is e^(average negative log-likelihood)
+        printf("perplexity: %.4lf [%d/%d]    \r", std::exp(nll / (i + 1)), i + 1, seq_count);
+        fflush(stdout);
+    }
+    printf("\n");
+}
+
 static bool is_interacting = false;
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
@@ -815,7 +858,7 @@ int main(int argc, char ** argv) {
     // load the model
     {
         const int64_t t_start_us = ggml_time_us();
-        if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
+        if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
             return 1;
         }
@@ -830,13 +873,22 @@ int main(int argc, char ** argv) {
             params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }
 
+    std::vector<float> logits;
+
+    // determine the required inference memory per token:
+    size_t mem_per_token = 0;
+    llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
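+    // mem_per_token is measured here, ahead of the perplexity branch below which needs it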
+
+    if (params.perplexity) {
+        perplexity(vocab, model, params, mem_per_token);
+        exit(0);
+    }
+
     int n_past = 0;
 
     int64_t t_sample_us  = 0;
     int64_t t_predict_us = 0;
 
-    std::vector<float> logits;
-
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
@@ -881,10 +933,6 @@ int main(int argc, char ** argv) {
 
     std::vector<gpt_vocab::id> embd;
 
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
     int last_n_size = params.repeat_last_n;
     std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);