llama : optimize memory buffers (#2325)

author: Georgi Gerganov <ggerganov@gmail.com> 2023-07-22 21:17:57 +0300
committer: GitHub <noreply@github.com> 2023-07-22 21:17:57 +0300
commit: b47b8a9cfeb439d271bf997fb985fd6d82b3af5e (patch)
tree: e5e2c0b5fc8839d2497e14b4c073964bc541707e /examples/main/main.cpp
parent: b5fe67f8c69113bd9354bc1adcfe2df6be323740 (diff)
1 file changed, 4 insertions, 7 deletions
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 656382f..4b4cd1d 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -139,17 +139,14 @@ int main(int argc, char ** argv) {
                 params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }
 
-    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
+    // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
     // uncomment the "used_mem" line in llama.cpp to see the results
     if (params.mem_test) {
         {
-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
-            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
-        }
+            fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
 
-        {
-            const std::vector<llama_token> tmp = { 0, };
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
+            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
         }
 
         llama_print_timings(ctx);
author	Georgi Gerganov <ggerganov@gmail.com>	2023-07-22 21:17:57 +0300
committer	GitHub <noreply@github.com>	2023-07-22 21:17:57 +0300
commit	b47b8a9cfeb439d271bf997fb985fd6d82b3af5e (patch)
tree	e5e2c0b5fc8839d2497e14b4c073964bc541707e /examples/main/main.cpp
parent	b5fe67f8c69113bd9354bc1adcfe2df6be323740 (diff)