Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r--  examples/server/server.cpp | 9
1 file changed, 9 insertions, 0 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 31d8087..8727500 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -405,6 +405,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
+ fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
#endif
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -539,6 +540,14 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
}
+ else if (arg == "--low-vram" || arg == "-lv")
+ {
+#ifdef GGML_USE_CUBLAS
+ params.low_vram = true;
+#else
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+#endif // GGML_USE_CUBLAS
+ }
else if (arg == "--main-gpu" || arg == "-mg")
{
if (++i >= argc)
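
(All nine insertions recorded in the diffstat appear above; within server.cpp the commit only parses the flag. Presumably low_vram then reaches the model load the same way the other GPU options do, through llama_context_params, which in llama.cpp of this era exposed a bool low_vram field. A hypothetical wiring sketch under that assumption follows; load_model_sketch is not part of this commit.)

// Hypothetical wiring, not part of this commit: copy the parsed flag into
// llama_context_params before loading. Assumes the llama.h of this era,
// where llama_context_params has a `bool low_vram` field and contexts are
// created with llama_init_from_file().
#include "common.h" // gpt_params lives in examples/common.h
#include "llama.h"

static llama_context * load_model_sketch(const gpt_params & params)
{
    llama_context_params lparams = llama_context_default_params();
    lparams.n_ctx        = params.n_ctx;
    lparams.n_gpu_layers = params.n_gpu_layers;
    lparams.main_gpu     = params.main_gpu;
    lparams.low_vram     = params.low_vram; // skip the VRAM scratch buffer
    return llama_init_from_file(params.model.c_str(), lparams);
}

(Per the llama.h comment of the time, low_vram reduces VRAM usage at the cost of performance, which matches the "don't allocate VRAM scratch buffer" usage text above.)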