about summary refs log tree commit diff
diff options
context:
space:
mode:
author    kiltyj <kiltyj@gmail.com>  2023-06-05 13:24:04 -0700
committer GitHub <noreply@github.com>  2023-06-05 23:24:04 +0300
commit    9d0693bce38013364b1042568d9083353bfff48f (patch)
tree      8311cb168defca62e9b2689571c4b641ea7654b9
parent    efe05076323f5c6bafece109e21cce046f5e4b07 (diff)
metal : use shared buffers between CPU and GPU (#1696)
* Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU
* Page-align buffers used by Metal
* Remove trailing whitespace
* Only import unistd.h for Metal builds
* metal : remove unnecessary copies

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
-rw-r--r--  ggml-metal.m  | 17
-rw-r--r--  ggml.c        |  8
-rw-r--r--  llama-util.h  | 16
-rw-r--r--  llama.cpp     | 13
4 files changed, 38 insertions(+), 16 deletions(-)
diff --git a/ggml-metal.m b/ggml-metal.m
index 3cb423a..82c6596 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -195,14 +195,25 @@ bool ggml_metal_add_buffer(
}
}
+ size_t page_size = getpagesize();
+ size_t aligned_size = size;
+ if ((aligned_size % page_size) != 0) {
+ aligned_size += (page_size - (aligned_size % page_size));
+ }
+
ctx->buffers[ctx->n_buffers].name = name;
ctx->buffers[ctx->n_buffers].data = data;
ctx->buffers[ctx->n_buffers].size = size;
- ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
+ ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
- ++ctx->n_buffers;
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
+ fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+ return false;
+ } else {
+ fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+ }
- fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
+ ++ctx->n_buffers;
}
return true;
diff --git a/ggml.c b/ggml.c
index 24f0d2f..4e3e7ed 100644
--- a/ggml.c
+++ b/ggml.c
@@ -22,6 +22,10 @@
#include <float.h>
#include <limits.h>
+#ifdef GGML_USE_METAL
+#include <unistd.h>
+#endif
+
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef static_assert
@@ -122,7 +126,11 @@ typedef void* thread_ret_t;
#else
inline static void* ggml_aligned_malloc(size_t size) {
void* aligned_memory = NULL;
+#ifdef GGML_USE_METAL
+ int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+#endif
if (result != 0) {
// Handle allocation failure
return NULL;
diff --git a/llama-util.h b/llama-util.h
index 3cac9f6..4f8a429 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -405,13 +405,29 @@ struct llama_buffer {
llama_buffer() = default;
void resize(size_t len) {
+#ifdef GGML_USE_METAL
+ free(addr);
+ int result = posix_memalign((void **) &addr, getpagesize(), len);
+ if (result == 0) {
+ memset(addr, 0, len);
+ }
+ else {
+ addr = NULL;
+ }
+#else
delete[] addr;
addr = new uint8_t[len];
+#endif
size = len;
}
~llama_buffer() {
+#ifdef GGML_USE_METAL
+ free(addr);
+#else
delete[] addr;
+#endif
+ addr = NULL;
}
// disable copy and move
diff --git a/llama.cpp b/llama.cpp
index e2511e5..d0e7151 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -53,7 +53,6 @@ enum e_model {
MODEL_65B,
};
-
static const size_t MB = 1024*1024;
// computed for n_ctx == 2048
@@ -1281,12 +1280,6 @@ static bool llama_eval_internal(
ggml_set_name(embd, "embd");
memcpy(embd->data, tokens, N*ggml_element_size(embd));
-#ifdef GGML_USE_METAL
- if (lctx.ctx_metal && N == 1) {
- ggml_metal_set_tensor(lctx.ctx_metal, embd);
- }
-#endif
-
struct ggml_tensor * cur;
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1484,12 +1477,6 @@ static bool llama_eval_internal(
}
ggml_graph_compute(ctx0, &gf);
-
- if (lctx.ctx_metal) {
- // We need to sync the CPU KV cache with the GPU KV cache
- ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
- ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
- }
}
#else
ggml_graph_compute(ctx0, &gf);