diff options
Diffstat (limited to 'utils.h')
-rw-r--r-- | utils.h | 61 |
1 files changed, 3 insertions, 58 deletions
@@ -2,8 +2,9 @@ #pragma once +#include "llama.h" + #include <string> -#include <unordered_map> #include <vector> #include <random> #include <thread> @@ -50,63 +51,7 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params); std::string gpt_random_prompt(std::mt19937 & rng); // -// Model file parsing -// - -#define FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files -#define FILE_MAGIC 0x67676d66 // 'ggmf' in hex -#define FILE_VERSION 1 - -// // Vocab utils // -struct llama_vocab { - using id = int32_t; - using token = std::string; - - struct token_score { - token tok; - float score; - }; - - std::unordered_map<token, id> token_to_id; - std::vector<token_score> id_to_token; -}; - -void replace(std::string & str, const std::string & needle, const std::string & replacement); - -// poor-man's JSON parsing -std::unordered_map<std::string, int32_t> json_parse(const std::string & fname); - -// TODO: temporary until #77 is merged, need this now for some tokenizer tests -bool llama_vocab_load(const std::string & fname, llama_vocab & vocab); - -// TODO: this is probably wrong, but I cannot figure out how this tokenizer works .. -// ref: https://github.com/google/sentencepiece -std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos); - -// sample next token given probabilities for each embedding -// -// - consider only the top K tokens -// - from them, consider only the top tokens with cumulative probability > P -// -llama_vocab::id llama_sample_top_p_top_k( - const llama_vocab & vocab, - const float * logits, - std::vector<llama_vocab::id> & last_n_tokens, - double repeat_penalty, - int top_k, - double top_p, - double temp, - std::mt19937 & rng); - -// filer to top K tokens from list of logits -void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k); - -// -// Quantization -// - -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); +std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos); |