author    Mack Straight <eiz@users.noreply.github.com>  2023-03-20 03:17:23 -0700
committer GitHub <noreply@github.com>                   2023-03-20 03:17:23 -0700
commit    074bea2eb1f1349a0118239c4152914aecaa1be4
tree      41ce911ac28d858cabfeff650b10521b30838656 /utils.h
parent    5cb63e2493c49bc2c3b9b355696e8dc26cdd0380
sentencepiece bpe compatible tokenizer (#252)
* potential out of bounds read
* fix quantize
* style
* Update convert-pth-to-ggml.py
* mild cleanup
* don't need the space-prefixing here rn since main.cpp already does it
* new file magic + version header field
* readme notice
* missing newlines
Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
Diffstat (limited to 'utils.h')
-rw-r--r--  utils.h | 3 +-
1 file changed, 2 insertions(+), 1 deletion(-)
@@ -58,6 +58,7 @@ struct gpt_vocab {
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::map<id, float> score;
 };
 
 void replace(std::string & str, const std::string & needle, const std::string & replacement);
@@ -79,7 +80,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);
 
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);