-rw-r--r--  ggml.c      76
-rw-r--r--  ggml.h       3
-rw-r--r--  llama.cpp   14
-rw-r--r--  llama.h      1
-rw-r--r--  main.cpp     1
-rw-r--r--  utils.cpp    7
-rw-r--r--  utils.h      1
7 files changed, 91 insertions, 12 deletions
diff --git a/ggml.c b/ggml.c
index 0e4b146..800390a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define _POSIX_C_SOURCE 199309L
+// Defines CLOCK_MONOTONIC and asprintf on Linux
+#define _GNU_SOURCE
#include "ggml.h"
@@ -10,6 +10,7 @@
#endif
#include <assert.h>
+#include <errno.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
@@ -31,7 +32,6 @@
#else
// ref: https://github.com/ggerganov/whisper.cpp/issues/168
#include <windows.h>
-#include <errno.h>
#endif
typedef volatile LONG atomic_int;
@@ -83,6 +83,17 @@ typedef void* thread_ret_t;
#define static_assert(cond, msg) _Static_assert(cond, msg)
#endif
+#define GGML_MLOCK_SUPPORT 0
+
+#ifdef __has_include
+ #if __has_include(<sys/mman.h>)
+ #undef GGML_MLOCK_SUPPORT
+ #define GGML_MLOCK_SUPPORT 1
+ #include <sys/mman.h>
+ #endif
+#endif
+
+
/*#define GGML_PERF*/
#define GGML_DEBUG 0
#define GGML_GELU_FP16
@@ -2344,6 +2355,7 @@ struct ggml_context {
size_t mem_size;
void * mem_buffer;
bool mem_buffer_owned;
+ bool mem_buffer_mlocked;
int n_objects;
@@ -2619,16 +2631,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
}
*ctx = (struct ggml_context) {
- /*.mem_size =*/ params.mem_size,
- /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
- /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
- /*.n_objects =*/ 0,
- /*.objects_begin =*/ NULL,
- /*.objects_end =*/ NULL,
- /*.scratch =*/ { 0, 0, NULL, },
- /*.scratch_save =*/ { 0, 0, NULL, },
+ /*.mem_size =*/ params.mem_size,
+ /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+ /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
+ /*.mem_buffer_mlocked =*/ false,
+ /*.n_objects =*/ 0,
+ /*.objects_begin =*/ NULL,
+ /*.objects_end =*/ NULL,
+ /*.scratch =*/ { 0, 0, NULL, },
+ /*.scratch_save =*/ { 0, 0, NULL, },
};
+ GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
+
ggml_assert_aligned(ctx->mem_buffer);
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
@@ -2651,6 +2666,14 @@ void ggml_free(struct ggml_context * ctx) {
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
+#if GGML_MLOCK_SUPPORT
+ if (ctx->mem_buffer_mlocked) {
+ if (munlock(ctx->mem_buffer, ctx->mem_size)) {
+ fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
+ }
+ }
+#endif
+
if (ctx->mem_buffer_owned) {
free(ctx->mem_buffer);
}
@@ -2679,6 +2702,37 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
return result;
}
+bool ggml_mlock_supported(void) {
+ return GGML_MLOCK_SUPPORT;
+}
+
+#if GGML_MLOCK_SUPPORT
+#ifdef __APPLE__
+ #define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
+ "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
+#else
+ #define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
+#endif
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+ if (ctx->mem_buffer_mlocked) {
+ return true;
+ }
+ if (mlock(ctx->mem_buffer, ctx->mem_size)) {
+ int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+ ctx->mem_size, strerror(errno));
+ GGML_ASSERT(ret >= 0);
+ return false;
+ }
+ ctx->mem_buffer_mlocked = true;
+ return true;
+}
+#else // GGML_MLOCK_SUPPORT
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+ *err_p = strdup("can't mlock because it's not supported on this system");
+ return false;
+}
+#endif // GGML_MLOCK_SUPPORT
+
////////////////////////////////////////////////////////////////////////////////
struct ggml_tensor * ggml_new_tensor_impl(
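
For reference, a minimal caller-side sketch of the ggml_mlock()/ggml_mlock_supported() API introduced above. Only the two ggml calls and their signatures come from this commit; the wrapper function name and surrounding setup are illustrative:

    // sketch: lock a ggml context's buffer into RAM (mirrors the usage added to llama_init_from_file below)
    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"

    static bool lock_ctx_in_ram(struct ggml_context * ctx) {
        if (!ggml_mlock_supported()) {
            fprintf(stderr, "mlock is not supported on this system\n");
            return false;
        }
        char * err = NULL;
        if (!ggml_mlock(ctx, &err)) {
            fprintf(stderr, "%s\n", err);
            free(err); // ggml_mlock allocates the error string (asprintf/strdup)
            return false;
        }
        return true; // buffer stays resident; munlock happens later in ggml_free()
    }
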
diff --git a/ggml.h b/ggml.h
index c7e6814..ddb9731 100644
--- a/ggml.h
+++ b/ggml.h
@@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+bool ggml_mlock_supported(void);
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
+
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
diff --git a/llama.cpp b/llama.cpp
index d8c7715..5d56cc9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -115,6 +115,7 @@ struct llama_context_params llama_context_default_params() {
/*.f16_kv =*/ false,
/*.logits_all =*/ false,
/*.vocab_only =*/ false,
+ /*.use_mlock =*/ false,
/*.embedding =*/ false,
};
@@ -1428,11 +1429,22 @@ struct llama_context * llama_init_from_file(
ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) {
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory,
+ params.vocab_only)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
delete ctx;
return nullptr;
}
+
+ if (params.use_mlock) {
+ char *err;
+ if (!ggml_mlock(ctx->model.ctx, &err)) {
+ fprintf(stderr, "%s\n", err);
+ free(err);
+ delete ctx;
+ return nullptr;
+ }
+ }
// reserve memory for context buffers
{
diff --git a/llama.h b/llama.h
index 209b4db..9943d96 100644
--- a/llama.h
+++ b/llama.h
@@ -53,6 +53,7 @@ extern "C" {
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
+ bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
};
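
A hedged sketch of the new use_mlock flag as seen through the public llama.h API. The flag, llama_context_default_params() and llama_init_from_file() appear in this commit; the model path is a placeholder and llama_free() is the assumed cleanup call from llama.h:

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        struct llama_context_params lparams = llama_context_default_params();
        lparams.use_mlock = true; // new flag: ask ggml to mlock() the model buffer

        // llama_init_from_file returns NULL if loading fails or mlock is refused
        struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", lparams);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model (or to mlock its buffer)\n");
            return 1;
        }

        // ... run inference ...

        llama_free(ctx); // assumed cleanup call
        return 0;
    }
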
diff --git a/main.cpp b/main.cpp
index 46a80ff..39dfc57 100644
--- a/main.cpp
+++ b/main.cpp
@@ -199,6 +199,7 @@ int main(int argc, char ** argv) {
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
+ lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
diff --git a/utils.cpp b/utils.cpp
index 0df89af..10673fb 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -1,3 +1,5 @@
+#include "ggml.h"
+
#include "utils.h"
#include <cassert>
@@ -127,6 +129,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.instruct = true;
} else if (arg == "--color") {
params.use_color = true;
+ } else if (arg == "--mlock") {
+ params.use_mlock = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -194,6 +198,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
+ if (ggml_mlock_supported()) {
+ fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
+ }
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
diff --git a/utils.h b/utils.h
index 8120c12..cf91499 100644
--- a/utils.h
+++ b/utils.h
@@ -46,6 +46,7 @@ struct gpt_params {
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
bool perplexity = false; // compute perplexity over the prompt
+ bool use_mlock = false; // use mlock to keep model in memory
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
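
An illustrative invocation of the new --mlock flag (model path and prompt are placeholders):

    ./main -m models/7B/ggml-model.bin --mlock -p "Hello"

If mlock is unsupported or fails (for example, RLIMIT_MLOCK is too low), llama_init_from_file prints the suggestion string from ggml.c and returns NULL, so loading is aborted rather than continuing without the lock.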