aboutsummaryrefslogtreecommitdiff
path: root/utils.h
diff options
context:
space:
mode:
author: Fabio R. Sluzala <Fabio3rs@users.noreply.github.com> 2023-03-21 14:21:50 -0300
committer: GitHub <noreply@github.com> 2023-03-21 19:21:50 +0200
commit: 353ec251a42491f5192c48561da4b444ef67f23c (patch)
tree: 95783d81ab8be8a6875ec708e2e2a7882222250a /utils.h
parent: 89d5d90f3b6d25f134da7a8e252c3432bffcf674 (diff)
We could use std::unordered_map over std::map (#305)
* Improve performance by changing std::map to std::unordered_map, and std::map<id, token> id_to_token to std::vector<token> id_to_token
* Fix the previous commit: in gpt_vocab_init, add vocab.id_to_token.resize(vocab.token_to_id.size());
* Remove #include <map>
* Nest struct token_score inside gpt_vocab
* Rename token to tok
Diffstat (limited to 'utils.h')
-rw-r--r-- utils.h 14
1 file changed, 9 insertions, 5 deletions
diff --git a/utils.h b/utils.h
index 6693775..3129038 100644
--- a/utils.h
+++ b/utils.h
@@ -3,7 +3,7 @@
#pragma once
#include <string>
-#include <map>
+#include <unordered_map>
#include <vector>
#include <random>
#include <thread>
@@ -65,15 +65,19 @@ struct llama_vocab {
using id = int32_t;
using token = std::string;
- std::map<token, id> token_to_id;
- std::map<id, token> id_to_token;
- std::map<id, float> score;
+ struct token_score {
+ token tok;
+ float score;
+ };
+
+ std::unordered_map<token, id> token_to_id;
+ std::vector<token_score> id_to_token;
};
void replace(std::string & str, const std::string & needle, const std::string & replacement);
// poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
+std::unordered_map<std::string, int32_t> json_parse(const std::string & fname);
// TODO: temporary until #77 is merged, need this now for some tokenizer tests
bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);