| field | value | date |
|---|---|---|
| author | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-10 21:50:46 +0200 |
| committer | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-10 21:50:46 +0200 |
| commit | 319cdb3e1ffe263cf5b08249c9559e011396c1de (patch) | |
| tree | 90c02a60d3e381ebd882c5c52d9dca114714ce43 | |
| parent | 775328064e69db1ebd7e19ccb59d2a7fa6142470 (diff) | |
Final touches
| mode | file | changes |
|---|---|---|
| -rw-r--r-- | README.md | 3 |
| -rw-r--r-- | main.cpp | 1 |
| -rw-r--r-- | models/.gitignore | 0 |
| -rw-r--r-- | utils.cpp | 54 |
| -rw-r--r-- | utils.h | 6 |

5 files changed, 32 insertions, 32 deletions
```diff
diff --git a/README.md b/README.md
@@ -114,6 +114,5 @@ python3 convert-pth-to-ggml.py models/7B/ 1
   In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that
 - I don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
-- No Windows support
 - x86 quantization support [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this on Apple Silicon
-  
+
diff --git a/main.cpp b/main.cpp
@@ -728,6 +728,7 @@ int main(int argc, char ** argv) {
         // end of text token
         if (embd.back() == 2) {
+            printf(" [end of text]\n");
             break;
         }
     }
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/models/.gitignore
diff --git a/utils.cpp b/utils.cpp
@@ -231,39 +231,39 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 }
 
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    auto res = gpt_tokenize(vocab, text);
+    //auto res = gpt_tokenize(vocab, text);
+
+    //if (bos) {
+    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+    //}
+
+    std::vector<gpt_vocab::id> res;
 
     if (bos) {
-        res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+        res.push_back(1); // TODO: replace with vocab.bos
     }
 
-    //std::vector<gpt_vocab::id> res;
+    //find the longest token that matches the text
+    int pos = 0;
+    while (true) {
+        int l = 0;
+        int t = 0;
+        for (const auto & kv : vocab.id_to_token) {
+            if (kv.second.size() < l) continue;
+            if (kv.second.size() > text.size() - pos) continue;
+            if (text.substr(pos, kv.second.size()) == kv.second) {
+                l = kv.second.size();
+                t = kv.first;
+            }
+        }
 
-    //if (bos) {
-    //    res.push_back(1); // TODO: replace with vocab.bos
-    //}
+        if (l == 0 && t != 13) {
+            break;
+        }
 
-    // find the longest token that matches the text
-    //int pos = 0;
-    //while (true) {
-    //    int l = 0;
-    //    int t = 0;
-    //    for (const auto & kv : vocab.id_to_token) {
-    //        if (kv.second.size() < l) continue;
-    //        if (kv.second.size() > text.size() - pos) continue;
-    //        if (text.substr(pos, kv.second.size()) == kv.second) {
-    //            l = kv.second.size();
-    //            t = kv.first;
-    //        }
-    //    }
-
-    //    if (l == 0 && t != 13) {
-    //        break;
-    //    }
-
-    //    res.push_back(t);
-    //    pos += l;
-    //}
+        res.push_back(t);
+        pos += l;
+    }
 
     return res;
 }
diff --git a/utils.h b/utils.h
@@ -15,12 +15,12 @@
 struct gpt_params {
     int32_t seed      = -1; // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
+    int32_t n_predict = 128; // new tokens to predict
 
     // sampling parameters
-    int32_t top_k = 100;
+    int32_t top_k = 40;
     float   top_p = 0.95f;
-    float   temp  = 0.8f;
+    float   temp  = 0.80f;
 
     int32_t n_batch = 8; // batch size for prompt processing
```
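The utils.cpp hunk switches `llama_tokenize` from the `gpt_tokenize` call to a greedy longest-match scan over `vocab.id_to_token`. Below is a minimal, self-contained sketch of that same idea; the toy vocabulary, the `tokenize` wrapper, and the `main` driver are illustrative stand-ins rather than code from the repository, and the sketch drops the commit's `t != 13` newline special case for simplicity.

```cpp
// Standalone sketch of the greedy longest-match tokenizer introduced above.
// The vocabulary here is made up; in llama.cpp it is loaded from the model file.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

using id = int32_t; // stands in for gpt_vocab::id

std::vector<id> tokenize(const std::map<id, std::string> & id_to_token,
                         const std::string & text, bool bos) {
    std::vector<id> res;
    if (bos) {
        res.push_back(1); // BOS token id, hard-coded as in the commit
    }

    // At each position, scan the whole vocabulary and take the longest token
    // string that matches the remaining text; stop when nothing matches.
    size_t pos = 0;
    while (pos < text.size()) {
        size_t l = 0;
        id t = 0;
        for (const auto & kv : id_to_token) {
            if (kv.second.size() < l) continue;                  // shorter than current best
            if (kv.second.size() > text.size() - pos) continue;  // longer than what is left
            if (text.compare(pos, kv.second.size(), kv.second) == 0) {
                l = kv.second.size();
                t = kv.first;
            }
        }
        if (l == 0) break; // no vocabulary entry matches at this position
        res.push_back(t);
        pos += l;
    }
    return res;
}

int main() {
    // Hypothetical vocabulary: ids and strings are invented for illustration.
    std::map<id, std::string> id_to_token = {
        {2, "he"}, {3, "hello"}, {4, " "}, {5, "world"}, {6, "wor"},
    };
    for (id t : tokenize(id_to_token, "hello world", true)) {
        printf("%d ", t); // expected: 1 3 4 5 (BOS, "hello", " ", "world")
    }
    printf("\n");
    return 0;
}
```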

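The utils.h hunk only retunes defaults, but those fields (`top_k`, `top_p`, `temp`, `n_predict`) are the knobs for how the next token is drawn from the model's logits. Assuming the usual top-k plus temperature scheme (and leaving `top_p` out for brevity), a rough illustrative sketch, not the repository's sampling code, might look like this:

```cpp
// Illustrative top-k + temperature sampling over a logit vector.
// Parameter names mirror gpt_params; logits in main() are made up.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

int sample_top_k(const std::vector<float> & logits, int top_k, float temp, std::mt19937 & rng) {
    // Pair each logit with its token id and keep only the top_k highest.
    std::vector<std::pair<float, int>> cand;
    for (int i = 0; i < (int) logits.size(); ++i) cand.push_back({logits[i], i});
    const int k = std::min(top_k, (int) cand.size());
    std::partial_sort(cand.begin(), cand.begin() + k, cand.end(),
                      [](const auto & a, const auto & b) { return a.first > b.first; });
    cand.resize(k);

    // Softmax over the surviving logits, scaled by temperature.
    std::vector<float> probs;
    const float max_l = cand[0].first;
    float sum = 0.0f;
    for (const auto & c : cand) {
        const float p = std::exp((c.first - max_l) / temp);
        probs.push_back(p);
        sum += p;
    }
    for (auto & p : probs) p /= sum;

    // Draw one token id according to the resulting distribution.
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cand[dist(rng)].second;
}

int main() {
    std::mt19937 rng(std::random_device{}()); // seed = -1 in gpt_params means "random seed"
    std::vector<float> logits = {0.1f, 2.0f, 1.5f, -0.3f}; // invented logits for 4 tokens
    printf("sampled token id: %d\n", sample_top_k(logits, /*top_k=*/40, /*temp=*/0.80f, rng));
    return 0;
}
```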