diff options
author | Fabio R. Sluzala <Fabio3rs@users.noreply.github.com> | 2023-03-21 14:21:50 -0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-03-21 19:21:50 +0200 |
commit | 353ec251a42491f5192c48561da4b444ef67f23c (patch) | |
tree | 95783d81ab8be8a6875ec708e2e2a7882222250a /utils.h | |
parent | 89d5d90f3b6d25f134da7a8e252c3432bffcf674 (diff) |
We could use std::unordered_map over std::map (#305)
* Improve performance by changing std::map to std::unordered_map, and by changing std::map<id, token> id_to_token to std::vector<token> id_to_token
* Fix the previous commit: in gpt_vocab_init, add vocab.id_to_token.resize(vocab.token_to_id.size());
* Removed include <map>
* Nest struct token_score inside the vocab struct (gpt_vocab; llama_vocab in utils.h)
* renamed token to tok
Diffstat (limited to 'utils.h')
-rw-r--r-- | utils.h | 14 |
1 files changed, 9 insertions, 5 deletions
@@ -3,7 +3,7 @@
 #pragma once

 #include <string>
-#include <map>
+#include <unordered_map>
 #include <vector>
 #include <random>
 #include <thread>
@@ -65,15 +65,19 @@ struct llama_vocab {
     using id = int32_t;
     using token = std::string;

-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-    std::map<id, float> score;
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
 };

 void replace(std::string & str, const std::string & needle, const std::string & replacement);

 // poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
+std::unordered_map<std::string, int32_t> json_parse(const std::string & fname);

 // TODO: temporary until #77 is merged, need this now for some tokenizer tests
 bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);