Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- | examples/server/server.cpp | 9 |
1 file changed, 9 insertions, 0 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 31d8087..8727500 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -405,6 +405,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
     fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
@@ -539,6 +540,14 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
             fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
         }
+        else if (arg == "--low-vram" || arg == "-lv")
+        {
+#ifdef GGML_USE_CUBLAS
+            params.low_vram = true;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+#endif // GGML_USE_CUBLAS
+        }
         else if (arg == "--main-gpu" || arg == "-mg")
         {
             if (++i >= argc)
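The change itself is mechanical: the flag only sets params.low_vram, and that value still has to be forwarded when the server builds its llama context. The sketch below is a hypothetical illustration, not part of this diff: it assumes the llama.cpp C API of the same era, where llama_context_default_params() returns a llama_context_params carrying a low_vram member and llama_init_from_file() loads the model; the my_params struct is a stand-in for the real gpt_params.

// Hypothetical sketch (not from this commit): forwarding the parsed flag to
// the context. Assumes the era's llama.cpp C API with a low_vram member on
// llama_context_params; my_params stands in for the real gpt_params struct.
#include <cstdio>
#include <string>

#include "llama.h"

struct my_params {
    std::string model    = "models/7B/ggml-model.bin";
    bool        low_vram = false; // set by "-lv" / "--low-vram" above
};

static llama_context * init_low_vram_context(const my_params & params) {
    llama_context_params lparams = llama_context_default_params();
    lparams.low_vram = params.low_vram; // true => skip the VRAM scratch buffer

    llama_context * ctx = llama_init_from_file(params.model.c_str(), lparams);
    if (ctx == nullptr) {
        fprintf(stderr, "error: unable to load model %s\n", params.model.c_str());
    }
    return ctx;
}

Guarding the assignment with #ifdef GGML_USE_CUBLAS, as the diff does, keeps non-CUDA builds honest: rather than silently accepting a flag that can have no effect, the server prints a warning that the option requires a cuBLAS build.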