| Field | Value | Date |
|---|---|---|
| author | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-10 21:50:46 +0200 |
| committer | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-10 21:50:46 +0200 |
| commit | 319cdb3e1ffe263cf5b08249c9559e011396c1de | |
| tree | 90c02a60d3e381ebd882c5c52d9dca114714ce43 | |
| parent | 775328064e69db1ebd7e19ccb59d2a7fa6142470 | |
Final touches
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | README.md | 3 |
| -rw-r--r-- | main.cpp | 1 |
| -rw-r--r-- | models/.gitignore | 0 |
| -rw-r--r-- | utils.cpp | 54 |
| -rw-r--r-- | utils.h | 6 |
5 files changed, 32 insertions, 32 deletions
```diff
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -114,6 +114,5 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that
 - I don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
-- No Windows support
 - x86 quantization support [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this on Apple Silicon
-
+
diff --git a/main.cpp b/main.cpp
--- a/main.cpp
+++ b/main.cpp
@@ -728,6 +728,7 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (embd.back() == 2) {
+            printf(" [end of text]\n");
             break;
         }
     }
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/models/.gitignore
diff --git a/utils.cpp b/utils.cpp
--- a/utils.cpp
+++ b/utils.cpp
@@ -231,39 +231,39 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 }
 
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    auto res = gpt_tokenize(vocab, text);
+    //auto res = gpt_tokenize(vocab, text);
+
+    //if (bos) {
+    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+    //}
+
+    std::vector<gpt_vocab::id> res;
 
     if (bos) {
-        res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+        res.push_back(1); // TODO: replace with vocab.bos
     }
 
-    //std::vector<gpt_vocab::id> res;
+    //find the longest token that matches the text
+    int pos = 0;
+    while (true) {
+        int l = 0;
+        int t = 0;
+        for (const auto & kv : vocab.id_to_token) {
+            if (kv.second.size() < l) continue;
+            if (kv.second.size() > text.size() - pos) continue;
+            if (text.substr(pos, kv.second.size()) == kv.second) {
+                l = kv.second.size();
+                t = kv.first;
+            }
+        }
 
-    //if (bos) {
-    //    res.push_back(1); // TODO: replace with vocab.bos
-    //}
+        if (l == 0 && t != 13) {
+            break;
+        }
 
-    // find the longest token that matches the text
-    //int pos = 0;
-    //while (true) {
-    //    int l = 0;
-    //    int t = 0;
-    //    for (const auto & kv : vocab.id_to_token) {
-    //        if (kv.second.size() < l) continue;
-    //        if (kv.second.size() > text.size() - pos) continue;
-    //        if (text.substr(pos, kv.second.size()) == kv.second) {
-    //            l = kv.second.size();
-    //            t = kv.first;
-    //        }
-    //    }
-
-    //    if (l == 0 && t != 13) {
-    //        break;
-    //    }
-
-    //    res.push_back(t);
-    //    pos += l;
-    //}
+        res.push_back(t);
+        pos += l;
+    }
 
     return res;
 }
diff --git a/utils.h b/utils.h
--- a/utils.h
+++ b/utils.h
@@ -15,12 +15,12 @@ struct gpt_params {
     int32_t seed      = -1; // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
+    int32_t n_predict = 128; // new tokens to predict
 
     // sampling parameters
-    int32_t top_k = 100;
+    int32_t top_k = 40;
     float   top_p = 0.95f;
-    float   temp  = 0.8f;
+    float   temp  = 0.80f;
 
     int32_t n_batch = 8; // batch size for prompt processing
```
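The `utils.cpp` hunk flips `llama_tokenize` from delegating to `gpt_tokenize` to a greedy longest-match scan over the vocabulary. Below is a minimal standalone sketch of that strategy, assuming a toy `id_to_token` map; `tokenize_greedy`, `token_id`, and the sample vocabulary are illustrative names for this sketch, not part of the repo.

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Toy stand-in for the vocab mapping in utils.h; real ids come from the model file.
using token_id = int;

// Greedy longest-match tokenization: at each position, scan the whole
// vocabulary and consume the longest entry that matches the remaining text.
std::vector<token_id> tokenize_greedy(const std::map<token_id, std::string> & id_to_token,
                                      const std::string & text) {
    std::vector<token_id> res;
    size_t pos = 0;
    while (pos < text.size()) {
        size_t   best_len = 0;
        token_id best_id  = -1;
        for (const auto & kv : id_to_token) {
            const std::string & tok = kv.second;
            if (tok.size() <= best_len)         continue; // not longer than current best
            if (tok.size() > text.size() - pos) continue; // would run past the end
            if (text.compare(pos, tok.size(), tok) == 0) {
                best_len = tok.size();
                best_id  = kv.first;
            }
        }
        if (best_len == 0) break; // no vocab entry matches the remaining text
        res.push_back(best_id);
        pos += best_len;
    }
    return res;
}

int main() {
    // Hypothetical vocabulary; the real one is loaded from the ggml model file.
    const std::map<token_id, std::string> vocab = {
        {1, "he"}, {2, "hello"}, {3, " "}, {4, "world"},
    };
    for (token_id id : tokenize_greedy(vocab, "hello world")) {
        printf("%d ", id); // prints: 2 3 4
    }
    printf("\n");
    return 0;
}
```

Note that the committed loop breaks only when `l == 0 && t != 13`, which looks like a special case around token 13 (the newline token in the LLaMA vocabulary); the sketch simply stops at the first unmatched position.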
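The `utils.h` hunk tightens the default sampling knobs (`top_k` 100 → 40, `temp` 0.80, `n_predict` 128). As context for what those defaults govern, here is a hedged sketch of top-k sampling with temperature; it is a plausible reading of the parameters, not the repo's actual sampler (which also applies `top_p`), and `sample_top_k` is a hypothetical name.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

// Top-k + temperature sampling: keep the k highest temperature-scaled
// logits, softmax over the survivors, then draw a token id.
int sample_top_k(const std::vector<float> & logits, int top_k, float temp, std::mt19937 & rng) {
    // Pair each temperature-scaled logit with its token id.
    std::vector<std::pair<float, int>> cand;
    cand.reserve(logits.size());
    for (int i = 0; i < (int) logits.size(); ++i) {
        cand.emplace_back(logits[i] / temp, i);
    }

    // Keep only the top_k highest-scoring candidates.
    if (top_k > (int) cand.size()) top_k = (int) cand.size();
    std::partial_sort(cand.begin(), cand.begin() + top_k, cand.end(),
                      [](const auto & a, const auto & b) { return a.first > b.first; });
    cand.resize(top_k);

    // Softmax over the survivors (subtract the max for numerical stability).
    const float max_l = cand[0].first;
    std::vector<float> probs;
    probs.reserve(top_k);
    float sum = 0.0f;
    for (const auto & c : cand) {
        probs.push_back(std::exp(c.first - max_l));
        sum += probs.back();
    }
    for (auto & p : probs) p /= sum;

    // Draw a token id according to the resulting distribution.
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cand[dist(rng)].second;
}

int main() {
    std::mt19937 rng(42);
    const std::vector<float> logits = {2.0f, 1.0f, 0.5f, -1.0f}; // made-up logits
    printf("sampled token id: %d\n", sample_top_k(logits, /*top_k=*/40, /*temp=*/0.80f, rng));
    return 0;
}
```

A smaller `top_k` narrows the candidate pool and a temperature below 1.0 sharpens the softmax, both trading diversity for more predictable text.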