author | kiltyj <kiltyj@gmail.com> | 2023-06-05 13:24:04 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-05 23:24:04 +0300 |
commit | 9d0693bce38013364b1042568d9083353bfff48f (patch) | |
tree | 8311cb168defca62e9b2689571c4b641ea7654b9 /llama.cpp | |
parent | efe05076323f5c6bafece109e21cce046f5e4b07 (diff) | |
metal : use shared buffers between CPU and GPU (#1696)
* Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU (see the sketch after the commit message)
* Page-align buffers used by Metal
* Remove trailing whitespace
* Only import unistd.h for Metal builds
* metal : remove unnecessary copies
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
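The first two bullets are really one pattern: `newBufferWithBytesNoCopy:length:options:deallocator:` requires a page-aligned pointer (and, on macOS, a length that is a multiple of the page size), which is why the buffers Metal touches had to be page-aligned in the first place. Below is a minimal Objective-C sketch of that pattern, not the code from this commit; the helper name `wrap_host_buffer` and its error handling are illustrative only.

```objc
#import <Metal/Metal.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h> // getpagesize(); per the commit, only include this for Metal builds

// Wrap a fresh page-aligned host allocation in an MTLBuffer without copying.
// With MTLResourceStorageModeShared, CPU and GPU see the same memory, so no
// explicit upload (e.g. ggml_metal_set_tensor) is required after CPU writes.
static id<MTLBuffer> wrap_host_buffer(id<MTLDevice> device, size_t size) {
    const size_t page    = (size_t) getpagesize();
    const size_t aligned = (size + page - 1) & ~(page - 1); // round up to a page multiple

    void * data = NULL;
    if (posix_memalign(&data, page, aligned) != 0) {
        return nil; // allocation failed
    }
    memset(data, 0, aligned);

    return [device newBufferWithBytesNoCopy:data
                                     length:aligned
                                    options:MTLResourceStorageModeShared
                                deallocator:^(void * ptr, NSUInteger len) { free(ptr); }];
}
```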
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 13 |
1 file changed, 0 insertions, 13 deletions
```diff
--- a/llama.cpp
+++ b/llama.cpp
@@ -53,7 +53,6 @@ enum e_model {
     MODEL_65B,
 };
 
-static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -1281,12 +1280,6 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
-#ifdef GGML_USE_METAL
-    if (lctx.ctx_metal && N == 1) {
-        ggml_metal_set_tensor(lctx.ctx_metal, embd);
-    }
-#endif
-
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1484,12 +1477,6 @@ static bool llama_eval_internal(
         }
 
         ggml_graph_compute(ctx0, &gf);
-
-        if (lctx.ctx_metal) {
-            // We need to sync the CPU KV cache with the GPU KV cache
-            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
-        }
     }
 #else
     ggml_graph_compute(ctx0, &gf);
```
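The deleted hunks are the direct payoff of the shared buffers: with the tensors and KV cache now living in page-aligned host memory wrapped by shared `MTLBuffer`s, CPU-side writes such as the `memcpy` into `embd->data` are immediately visible to the GPU, so the explicit `ggml_metal_set_tensor` synchronization of `embd`, `kv_self.k`, and `kv_self.v` is no longer needed.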