author | eric8607242 <e0928021388@gmail.com> | 2023-07-29 02:10:05 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-28 21:10:05 +0300 |
commit | ee1b497c985f61d6ec519c39fcfed78a3c6f1d06 (patch) | |
tree | c98adfd2b7b1ce10d80fa3100194b0b5f3150ad2 | |
parent | d73b8d48b45d6e2c0ae9bb8c39623c4024adc275 (diff) | |
llama : support more diverse tokenizers? (#2420)
* supporting more diverse tokenizers
* Update llama.cpp
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
-rw-r--r-- | llama.cpp | 4 |
1 file changed, 3 insertions, 1 deletion
```diff
@@ -1924,7 +1924,9 @@ struct llama_tokenizer {
             if (token == vocab_.token_to_id.end()) {
                 // output any symbols that did not form tokens as bytes.
                 for (int j = 0; j < (int) symbol.n; ++j) {
-                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    // NOTE: old version, before #2420 - not sure what are the implications of this
+                    //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                     output.push_back(token_id);
                 }
             } else {
```
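For context: the removed line hard-codes the SentencePiece byte-fallback layout, where the token for byte value `b` sits at id `b + 3` (ids 0..2 being reserved special tokens), while the added line asks the model's own `token_to_id` map for the single-character string, so vocabularies that place their byte tokens elsewhere still resolve correctly. A minimal standalone sketch of the two strategies; the `Vocab` struct and its sample entry are hypothetical stand-ins, not the actual llama.cpp types, and only the lookup logic mirrors the diff above:

```cpp
// Sketch: byte fallback during tokenization, before and after this change.
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

struct Vocab {
    std::unordered_map<std::string, int> token_to_id;
};

// Old behaviour: assume the SentencePiece layout where the byte token for
// value b always has id b + 3 (ids 0..2 reserved for <unk>, <s>, </s>).
static int byte_to_id_fixed_offset(char c) {
    return static_cast<uint8_t>(c) + 3;
}

// New behaviour: ask the vocabulary itself, so models whose byte tokens
// live at different ids still map each byte to the right token.
static int byte_to_id_lookup(const Vocab & vocab, char c) {
    return vocab.token_to_id.at(std::string(1, c));
}

int main() {
    Vocab vocab;
    // Hypothetical vocabulary where the single-byte token is NOT at b + 3.
    vocab.token_to_id[std::string(1, 'a')] = 1000;

    const char c = 'a';
    printf("fixed offset id: %d\n", byte_to_id_fixed_offset(c));  // 'a' (0x61) + 3 = 100
    printf("vocab lookup id: %d\n", byte_to_id_lookup(vocab, c)); // 1000
    return 0;
}
```

One implication of the lookup approach, likely what the in-diff NOTE alludes to: `std::unordered_map::at` throws `std::out_of_range` if the vocabulary has no entry for that single-byte string, whereas the fixed offset always produced some id.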