Diffstat (limited to 'examples')
-rw-r--r--  examples/common.cpp | 25
-rw-r--r--  examples/common.h   | 11
2 files changed, 23 insertions, 13 deletions
diff --git a/examples/common.cpp b/examples/common.cpp
index 80e35d2..86c1eef 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -277,6 +277,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_color = true;
         } else if (arg == "--mlock") {
             params.use_mlock = true;
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_gpu_layers = std::stoi(argv[i]);
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
@@ -421,6 +427,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stderr, "                        number of layers to store in VRAM\n");
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
@@ -463,14 +471,15 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
-    lparams.n_ctx      = params.n_ctx;
-    lparams.n_parts    = params.n_parts;
-    lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
-    lparams.use_mmap   = params.use_mmap;
-    lparams.use_mlock  = params.use_mlock;
-    lparams.logits_all = params.perplexity;
-    lparams.embedding  = params.embedding;
+    lparams.n_ctx        = params.n_ctx;
+    lparams.n_parts      = params.n_parts;
+    lparams.n_gpu_layers = params.n_gpu_layers;
+    lparams.seed         = params.seed;
+    lparams.f16_kv       = params.memory_f16;
+    lparams.use_mmap     = params.use_mmap;
+    lparams.use_mlock    = params.use_mlock;
+    lparams.logits_all   = params.perplexity;
+    lparams.embedding    = params.embedding;
 
     llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
diff --git a/examples/common.h b/examples/common.h
index 499671b..717838f 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -21,13 +21,14 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed          = -1;   // RNG seed
+    int32_t seed          = -1;  // RNG seed
     int32_t n_threads     = get_num_physical_cores();
     int32_t n_predict     = -1;  // new tokens to predict
-    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
+    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
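For reference, a minimal sketch of how the new option flows through the helpers touched by this diff, assuming the examples/common.h and llama.h headers from this tree; the driver program and its error handling are illustrative, not part of the commit.

// Sketch only: parse CLI flags, including the new -ngl / --n-gpu-layers option,
// and pass the resulting gpt_params through to context creation.
#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;                              // n_gpu_layers defaults to 0
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;                                   // bad or missing argument
    }
    // llama_init_from_gpt_params copies params.n_gpu_layers into
    // llama_context_params before calling llama_init_from_file
    llama_context * ctx = llama_init_from_gpt_params(params);
    if (ctx == NULL) {
        return 1;
    }
    llama_free(ctx);
    return 0;
}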
