author    | Georgi Gerganov <ggerganov@gmail.com> | 2023-07-23 15:09:47 +0300
committer | GitHub <noreply@github.com>           | 2023-07-23 15:09:47 +0300
commit    | e76d630df17e235e6b9ef416c45996765d2e36fb (patch)
tree      | 15e0e9648f9b0e398b43e888216a73f84098ff3a /llama.h
parent    | 1d0824b2476e7fda09751a0235c9e571b76d6f2c (diff)
llama : grouped-query attention + LLaMAv2 70B support (#2276)
* CUDA: GQA implementation
* llama : support for GQA and LLaMAv2 70B
ggml-ci
* py : fix hparams parsing (if-else blocks)
ggml-ci
* py : oh boy ..
ggml-ci
* help : fix gqa value for 70B
ggml-ci
---------
Co-authored-by: JohannesGaessler <johannesg@5d6.de>
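For context on the change below: in grouped-query attention, each key/value head is shared by a fixed-size group of query heads, and the new `n_gqa` field is that group size (`n_head / n_head_kv`). For LLaMAv2 70B, 64 query heads share 8 KV heads, so `n_gqa = 64 / 8 = 8`, matching the "fix gqa value for 70B" note above. A minimal sketch of the head mapping follows; `kv_head_for` is an illustrative helper, not a llama.cpp function:

```c
#include <stdint.h>
#include <stdio.h>

// Grouped-query attention: each group of n_gqa consecutive query heads
// reads the same key/value head. Illustrative only.
static int32_t kv_head_for(int32_t q_head, int32_t n_gqa) {
    return q_head / n_gqa;
}

int main(void) {
    const int32_t n_head = 64; // query heads in LLaMAv2 70B
    const int32_t n_gqa  = 8;  // query heads per KV head -> 64/8 = 8 KV heads

    // print one representative query head per group
    for (int32_t h = 0; h < n_head; h += n_gqa) {
        printf("query head %2d -> kv head %d\n", h, kv_head_for(h, n_gqa));
    }
    return 0;
}
```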
Diffstat (limited to 'llama.h')
-rw-r--r-- | llama.h | 11
1 file changed, 6 insertions, 5 deletions
@@ -83,11 +83,12 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {
-        uint32_t seed;         // RNG seed, -1 for random
-        int32_t  n_ctx;        // text context
-        int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gpu_layers; // number of layers to store in VRAM
-        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
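Since `n_gqa` is a context parameter here (rather than a model hparam, per the TEMP note), callers loading a 70B model must set it before creating the context. A minimal sketch, assuming the model/context API present at this revision (`llama_context_default_params`, `llama_load_model_from_file`, `llama_new_context_with_model`) and a placeholder model path:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init(false); // false = no NUMA

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx = 2048;
    params.n_gqa = 8; // LLaMAv2 70B: 64 query heads / 8 KV heads

    // "models/70B/ggml-model-q4_0.bin" is a placeholder path
    struct llama_model * model =
        llama_load_model_from_file("models/70B/ggml-model-q4_0.bin", params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    struct llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

For models that do not use grouped-query attention, the default `n_gqa = 1` leaves the head count unchanged, which is why only 70B setups need to touch this field.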