Reduce memory usage and allocate enough memory for largest context (#473)

* Reduce memory usage and allocate enough memory for large contexts
* Simpler scratch buffer usage
* Reenable BLAS for quantized mul_mat
* Fix number of layers in 30B and 65B
* Fix KV cache size for F32
author: Georgi Gerganov <ggerganov@gmail.com> 2023-03-24 23:17:37 +0200
committer: GitHub <noreply@github.com> 2023-03-24 23:17:37 +0200
commit: 7a9b6c3a8bdc1cb75fefc826dfaa7331eb63695d (patch)
tree: 339815189c912e9a759a0259613621f6a2adcbf4 /utils.h
parent: 31572d966531f7d768eb773322016ab78eb6e835 (diff)
1 files changed, 8 insertions, 8 deletions
diff --git a/utils.h b/utils.h
index cf91499..d469bc6 100644
--- a/utils.h
+++ b/utils.h
@@ -14,12 +14,13 @@
 //
 
 struct gpt_params {
-    int32_t seed          = -1;  // RNG seed
+    int32_t seed          = -1;   // RNG seed
     int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict     = 128; // new tokens to predict
-    int32_t repeat_last_n = 64;  // last n tokens to penalize
-    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512; //context size
+    int32_t n_predict     = 128;  // new tokens to predict
+    int32_t repeat_last_n = 64;   // last n tokens to penalize
+    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512;  // context size
+    int32_t n_batch       = 8;    // batch size for prompt processing
 
     // sampling parameters
     int32_t top_k = 40;
@@ -27,15 +28,13 @@ struct gpt_params {
     float   temp  = 0.80f;
     float   repeat_penalty  = 1.10f;
 
-    int32_t n_batch = 8; // batch size for prompt processing
-
     std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
 
 
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
-    bool memory_f16        = false; // use f16 instead of f32 for memory kv
+    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs
     bool interactive       = false; // interactive mode
@@ -47,6 +46,7 @@ struct gpt_params {
     bool ignore_eos        = false; // do not stop generating after eos
     bool perplexity        = false; // compute perplexity over the prompt
     bool use_mlock         = false; // use mlock to keep model in memory
+    bool mem_test          = false; // compute maximum memory usage
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
author	Georgi Gerganov <ggerganov@gmail.com>	2023-03-24 23:17:37 +0200
committer	GitHub <noreply@github.com>	2023-03-24 23:17:37 +0200
commit	7a9b6c3a8bdc1cb75fefc826dfaa7331eb63695d (patch)
tree	339815189c912e9a759a0259613621f6a2adcbf4 /utils.h
parent	31572d966531f7d768eb773322016ab78eb6e835 (diff)