 Makefile     |  2
 ggml-metal.h |  5
 ggml-metal.m | 98
 ggml.c       | 24
 ggml.h       |  5
 llama.cpp    | 26
 6 files changed, 125 insertions(+), 35 deletions(-)
diff --git a/Makefile b/Makefile
index cf59086..afd06e0 100644
--- a/Makefile
+++ b/Makefile
@@ -252,7 +252,7 @@ $(info )
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
-llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h
diff --git a/ggml-metal.h b/ggml-metal.h
index 033c4d8..b9e50ac 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+// that it is guaranteed that the tensor will fit in at least one of the views
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
- size_t size);
+ size_t size,
+ size_t max_size);
// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
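
Note on the new max_size argument: the caller is expected to pass the byte size of the largest tensor living in the buffer, so that the backend can size the overlap between views when the buffer has to be split. A minimal sketch of the intended call pattern (register_model_buffer is an illustrative helper name, not part of this change):

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-metal.h"

    // register a whole ggml context's memory with the Metal backend
    static bool register_model_buffer(struct ggml_metal_context * ctx_metal,
                                      struct ggml_context       * ctx_data) {
        void * data = ggml_get_mem_buffer(ctx_data);
        size_t size = ggml_get_mem_size (ctx_data);

        // largest single tensor; used by ggml_metal_add_buffer to size the view overlap
        const size_t max_size = ggml_get_max_tensor_size(ctx_data);

        if (!ggml_metal_add_buffer(ctx_metal, "data", data, size, max_size)) {
            fprintf(stderr, "failed to add Metal buffer 'data'\n");
            return false;
        }
        return true;
    }
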
diff --git a/ggml-metal.m b/ggml-metal.m
index 07da62a..a7e104d 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -183,6 +183,14 @@ struct ggml_metal_context * ggml_metal_init(void) {
#undef GGML_METAL_ADD_KERNEL
}
+ fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+ fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+ if (ctx->device.maxTransferRate != 0) {
+ fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+ } else {
+ fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
+ }
+
return ctx;
}
@@ -199,10 +207,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+ const int64_t tsize = ggml_nbytes(t);
+
+ // find the view that contains the tensor fully
for (int i = 0; i < ctx->n_buffers; ++i) {
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
- if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+ if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs;
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
@@ -220,7 +231,8 @@ bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
- size_t size) {
+ size_t size,
+ size_t max_size) {
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
fprintf(stderr, "%s: too many buffers\n", __func__);
return false;
@@ -237,30 +249,68 @@ bool ggml_metal_add_buffer(
}
}
- size_t page_size = getpagesize();
- size_t aligned_size = size;
- if ((aligned_size % page_size) != 0) {
- aligned_size += (page_size - (aligned_size % page_size));
+ const size_t size_page = getpagesize();
+
+ size_t size_aligned = size;
+ if ((size_aligned % size_page) != 0) {
+ size_aligned += (size_page - (size_aligned % size_page));
}
- ctx->buffers[ctx->n_buffers].name = name;
- ctx->buffers[ctx->n_buffers].data = data;
- ctx->buffers[ctx->n_buffers].size = size;
+ // the buffer fits into the max buffer size allowed by the device
+ if (size_aligned <= ctx->device.maxBufferLength) {
+ ctx->buffers[ctx->n_buffers].name = name;
+ ctx->buffers[ctx->n_buffers].data = data;
+ ctx->buffers[ctx->n_buffers].size = size;
- if (ctx->device.maxBufferLength < aligned_size) {
- fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
- return false;
- }
- ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
+ ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
+ fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+ return false;
+ }
+
+ fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+
+ ++ctx->n_buffers;
+ } else {
+ // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+ // one of the views
+ const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+ const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
+ const size_t size_view = ctx->device.maxBufferLength;
+
+ for (size_t i = 0; i < size; i += size_step) {
+ const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
- if (ctx->buffers[ctx->n_buffers].metal == nil) {
- fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
- return false;
+ ctx->buffers[ctx->n_buffers].name = name;
+ ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+ ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+ ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
+ fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+ return false;
+ }
+
+ fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+ if (i + size_step < size) {
+ fprintf(stderr, "\n");
+ }
+
+ ++ctx->n_buffers;
+ }
}
- fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+ fprintf(stderr, ", (%8.2f / %8.2f)",
+ ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
+ ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
- ++ctx->n_buffers;
+ if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
+ fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
+ } else {
+ fprintf(stderr, "\n");
+ }
}
return true;
@@ -909,4 +959,14 @@ void ggml_metal_graph_compute(
dispatch_barrier_sync(queue, ^{});
[command_buffers[n_cb - 1] waitUntilCompleted];
+
+ // check status of command buffers
+ // needed to detect if the device ran out-of-memory for example (#1881)
+ for (int i = 0; i < n_cb; i++) {
+ MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+ if (status != MTLCommandBufferStatusCompleted) {
+ fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
+ GGML_ASSERT(false);
+ }
+ }
}
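
The view-splitting branch above (taken when the buffer exceeds the device's maxBufferLength) is easiest to read in isolation. Below is a minimal sketch of the same arithmetic with the newBufferWithBytesNoCopy call replaced by a printf; plan_views and max_buf are illustrative names, with max_buf standing in for ctx->device.maxBufferLength:

    #include <stddef.h>
    #include <stdio.h>

    static void plan_views(size_t size, size_t max_size, size_t size_page, size_t max_buf) {
        // page-align the total size, as the real code does before mapping
        size_t size_aligned = size;
        if ((size_aligned % size_page) != 0) {
            size_aligned += size_page - (size_aligned % size_page);
        }

        // each view overlaps the next by max_size rounded up to a page, plus one extra
        // page, so any tensor of at most max_size bytes lies fully inside some view
        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page;
        const size_t size_step = max_buf - size_ovlp;
        const size_t size_view = max_buf;

        for (size_t i = 0; i < size; i += size_step) {
            const size_t len = (i + size_view <= size) ? size_view : (size_aligned - i);
            printf("view: offs = %zu, len = %zu\n", i, len);
        }
    }

For a buffer only a few times larger than maxBufferLength this yields a handful of overlapping views, and ggml_metal_get_buffer later returns the first view that contains a tensor's full byte range.
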
diff --git a/ggml.c b/ggml.c
index 0eda7f3..78c3653 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4154,14 +4154,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
ctx->no_alloc = no_alloc;
}
-void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
return ctx->mem_buffer;
}
-size_t ggml_get_mem_size(struct ggml_context * ctx) {
+size_t ggml_get_mem_size(const struct ggml_context * ctx) {
return ctx->mem_size;
}
+size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
+ size_t max_size = 0;
+
+ struct ggml_object * obj = ctx->objects_begin;
+
+ while (obj != NULL) {
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+
+ const size_t size = ggml_nbytes(tensor);
+
+ if (max_size < size) {
+ max_size = size;
+ }
+
+ obj = obj->next;
+ }
+
+ return max_size;
+}
+
// IMPORTANT:
// when creating "opt" tensors, always save and load the scratch buffer
// this is an error prone process, but it is necessary to support inplace
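
For context, ggml_get_max_tensor_size simply walks the context's object list and returns the byte size of the largest tensor. A small self-contained usage sketch (the sizes in the comments assume 4-byte GGML_TYPE_F32 elements):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);      // 4 KB tensor
        ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1024, 64);  // 256 KB tensor

        // the largest tensor wins: expect 262144 bytes here
        printf("max tensor size = %zu bytes\n", ggml_get_max_tensor_size(ctx));

        ggml_free(ctx);
        return 0;
    }
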
diff --git a/ggml.h b/ggml.h
index 9b0c846..1380c53 100644
--- a/ggml.h
+++ b/ggml.h
@@ -500,8 +500,9 @@ extern "C" {
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
- GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
- GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
diff --git a/llama.cpp b/llama.cpp
index a2916b3..c165d32 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2696,16 +2696,21 @@ struct llama_context * llama_init_from_file(
// this allocates all Metal resources and memory buffers
ctx->ctx_metal = ggml_metal_init();
- void *data_ptr = NULL;
+ void * data_ptr = NULL;
size_t data_size = 0;
+
if (params.use_mmap) {
- data_ptr = ctx->model.mapping->addr;
- data_size= ctx->model.mapping->size;
+ data_ptr = ctx->model.mapping->addr;
+ data_size = ctx->model.mapping->size;
} else {
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
- data_size= ggml_get_mem_size(ctx->model.ctx);
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size = ggml_get_mem_size (ctx->model.ctx);
}
+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+ printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
#define LLAMA_METAL_CHECK_BUF(result) \
if (!(result)) { \
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2713,12 +2718,13 @@ struct llama_context * llama_init_from_file(
return NULL; \
}
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
#undef LLAMA_METAL_CHECK_BUF
}
#endif