author     Georgi Gerganov <ggerganov@gmail.com>    2023-07-21 13:10:51 +0300
committer  GitHub <noreply@github.com>              2023-07-21 13:10:51 +0300
commit     ae178ab46bfd6ecb2422da5dad441a4e2fef8b7e
tree       064a13d048ecd596bbd57bd081c9615aa91ebbf6 /llama.h
parent     54e3bc76fed914f8d4a30a7a50c19867cccb1338
llama : make tensor_split ptr instead of array (#2272)
Diffstat (limited to 'llama.h')
-rw-r--r--  llama.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llama.h b/llama.h
index b676a38..c565f6a 100644
--- a/llama.h
+++ b/llama.h
@@ -88,7 +88,8 @@ extern "C" {
int32_t n_batch; // prompt processing batch size
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors
- float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+ const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency
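
With this change, tensor_split becomes a caller-owned pointer instead of a fixed-size array embedded in llama_context_params: callers now pass their own buffer of LLAMA_MAX_DEVICES floats, or leave the field NULL. Below is a minimal sketch of how a caller might fill the field after this patch; it assumes a multi-GPU build where LLAMA_MAX_DEVICES >= 2, and the 60/40 split is purely illustrative:

    #include "llama.h"

    int main(void) {
        // Caller-owned buffer; per the header comment it must hold
        // LLAMA_MAX_DEVICES entries and outlive any use of the params.
        static float splits[LLAMA_MAX_DEVICES] = { 0.6f, 0.4f }; // illustrative 60/40 split

        struct llama_context_params params = llama_context_default_params();
        params.tensor_split = splits; // now a pointer into caller memory, not an in-struct copy

        // ... pass params to llama_load_model_from_file() / context creation as usual ...
        (void) params;
        return 0;
    }

Note the lifetime implication of the switch: since the struct no longer stores a copy of the values, the array behind tensor_split must remain valid for as long as the params, and anything initialized from them, may read it.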