From c9f670a17755311aa28c411f5c7f3c8c05434770 Mon Sep 17 00:00:00 2001
From: thement <40525767+thement@users.noreply.github.com>
Date: Fri, 17 Mar 2023 21:05:58 +0100
Subject: Implement non-greedy tokenizer that tries to maximize token lengths
 (#242)

* Implement non-greedy tokenizer that tries to maximize token lengths

* Insert single space in front of the prompt

- this is to match original llama tokenizer behavior

---------

Co-authored-by: Jakub Horak
---
 main.cpp  |  2 ++
 utils.cpp | 68 ++++++++++++++++++++++++++++++++++++++++--------------------------
 2 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/main.cpp b/main.cpp
index ca0fca8..39c5d7b 100644
--- a/main.cpp
+++ b/main.cpp
@@ -845,6 +845,8 @@ int main(int argc, char ** argv) {
 
     std::vector<float> logits;
 
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
     std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
 
diff --git a/utils.cpp b/utils.cpp
index 9e50487..22ef593 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -287,41 +287,57 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
-
     std::vector<gpt_vocab::id> res;
-
-    if (bos) {
-        res.push_back(1); // TODO: replace with vocab.bos
-    }
-
-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
             }
         }
+    }
 
-        if (l == 0) {
-            break;
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+            // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+            break;
         }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }
 
-        res.push_back(t);
-        pos += l;
+    if (bos) {
+        res.push_back(1); // TODO: replace with vocab.bos
     }
 
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());
+
     return res;
 }
-- 
cgit v1.2.3
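
For readers who want to try the algorithm outside the llama.cpp tree, the sketch below re-implements the patch's forward/backward dynamic programming as a standalone program. It is an illustrative rendition under stated assumptions, not code from the patch: the names toy_tokenize and kMaxTokenLen, and the plain std::map vocabulary, stand in for llama_tokenize, MAX_TOKEN_LEN, and gpt_vocab. Scoring each piece by the square of its length makes one long token outweigh any split of the same span into shorter pieces, which is what makes the tokenizer "non-greedy": it can back off from a long prefix match when that match leaves the rest of the string uncoverable.

// Standalone sketch of the scoring tokenizer introduced by this patch.
// Hypothetical names (toy_tokenize, kMaxTokenLen) and a toy std::map
// vocabulary stand in for the real llama_tokenize / gpt_vocab.
#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

static const int kMaxTokenLen = 18; // mirrors MAX_TOKEN_LEN in the patch

// Segment `text` into vocabulary tokens, maximizing the sum of squared
// piece lengths. Token id 0 is reserved to mean "no token ends here".
static std::vector<int> toy_tokenize(const std::map<std::string, int> & token_to_id,
                                     const std::map<int, std::string> & id_to_token,
                                     const std::string & text) {
    const int len = (int) text.length();
    std::vector<int> score(len + 1, 0); // score[i]: best score covering text[0..i)
    std::vector<int> prev(len + 1, 0);  // prev[i]: id of the token ending at i

    // Forward pass: relax every in-vocabulary substring starting at i.
    for (int i = 0; i < len; i++) {
        const int max_len = std::min(len - i, kMaxTokenLen);
        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
            auto it = token_to_id.find(text.substr(i, sub_len));
            if (it == token_to_id.end()) continue;
            const int local_score = score[i] + sub_len * sub_len;
            if (score[i + sub_len] < local_score) {
                score[i + sub_len] = local_score;
                prev[i + sub_len] = it->second;
            }
        }
    }

    // Backward pass: follow prev[] from the end of the string, then reverse.
    std::vector<int> res;
    int i = len;
    while (i > 0) {
        const int token_id = prev[i];
        if (token_id == 0) { // no segmentation reaches this position
            std::printf("failed to tokenize string!\n");
            break;
        }
        res.push_back(token_id);
        i -= (int) id_to_token.at(token_id).length();
    }
    std::reverse(res.begin(), res.end());
    return res;
}

int main() {
    const std::map<std::string, int> token_to_id = { {"ab", 1}, {"abc", 2}, {"cd", 3} };
    std::map<int, std::string> id_to_token;
    for (const auto & kv : token_to_id) id_to_token[kv.second] = kv.first;

    // A greedy longest-prefix tokenizer would take "abc" here and then fail
    // on the leftover "d"; the dynamic program settles on ["ab", "cd"].
    for (const int id : toy_tokenize(token_to_id, id_to_token, "abcd")) {
        std::printf("%d (%s)\n", id, id_to_token.at(id).c_str());
    }
    return 0;
}

Compiled with g++ -std=c++11, this prints ids 1 and 3 ("ab" then "cd"). The old tokenizer removed by this patch always took the longest token matching at the current position, so on this toy vocabulary it would commit to "abc" and then fail on the leftover "d"; the dynamic program avoids exactly that failure mode.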