From e76d630df17e235e6b9ef416c45996765d2e36fb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 23 Jul 2023 15:09:47 +0300
Subject: llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler
---
 examples/main/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'examples/main')

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 4b4cd1d..3bd8ba2 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
     }
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
-                " you are on your own\n", __func__, params.n_ctx);
+        // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
+        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
-- 
cgit v1.2.3