Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  132
1 file changed, 104 insertions, 28 deletions
diff --git a/llama.cpp b/llama.cpp
index f70b26c..bc58ad9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -16,6 +16,10 @@
#include "ggml-opencl.h"
#endif
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
#include <array>
#include <ctime>
#include <cinttypes>
@@ -243,6 +247,10 @@ struct llama_context {
llama_ctx_buffer buf_compute;
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+#ifdef GGML_USE_METAL
+ ggml_metal_context * ctx_metal = NULL;
+#endif
+
int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -1088,7 +1096,7 @@ static void llama_model_load_internal(
mmapped_size - vram_total + // weights in VRAM not in memory
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at(model.type);
+ MEM_REQ_EVAL().at (model.type);
// this is the memory required by one llama_state
const size_t mem_required_state =
@@ -1195,17 +1203,19 @@ static bool llama_model_load(
// evaluate the transformer
//
-// - lctx: llama context
-// - tokens: new batch of tokens to process
-// - n_past: the context size so far
-// - n_threads: number of threads to use
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - n_past: the context size so far
+// - n_threads: number of threads to use
+// - cgraph_fname: filename of the exported computation graph
//
static bool llama_eval_internal(
- llama_context & lctx,
- const llama_token * tokens,
- const int n_tokens,
- const int n_past,
- const int n_threads) {
+ llama_context & lctx,
+ const llama_token * tokens,
+ const int n_tokens,
+ const int n_past,
+ const int n_threads,
+ const char * cgraph_fname) {
// enforce that the first token is BOS
if (n_past == 0 && tokens[0] != llama_token_bos()) {
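(Note on the new parameter: cgraph_fname is only acted on at the end of this function, where the graph is optionally exported. Both public wrappers added later in this patch forward it; the two call shapes, taken from those wrappers, look like this:

    // regular evaluation, no graph export
    llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr);

    // single-token eval whose computation graph is written to disk (see llama_eval_export() below)
    llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname);
)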
@@ -1251,13 +1261,18 @@ static bool llama_eval_internal(
ggml_set_name(embd, "embd");
memcpy(embd->data, tokens, N*ggml_element_size(embd));
+#ifdef GGML_USE_METAL
+ if (lctx.ctx_metal && N == 1) {
+ ggml_metal_set_tensor(lctx.ctx_metal, embd);
+ }
+#endif
+
+ struct ggml_tensor * cur;
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
- struct ggml_tensor * cur;
-
lctx.use_buf(ctx0, 0);
// norm
@@ -1271,6 +1286,7 @@ static bool llama_eval_internal(
// self-attention
{
// compute Q and K and RoPE them
+
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
ggml_set_name(Qcur, "Qcur");
@@ -1280,6 +1296,7 @@ static bool llama_eval_internal(
{
// compute the transposed [N, n_embd] V matrix
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+ ggml_set_name(Vcur, "Vcur");
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
@@ -1325,7 +1342,6 @@ static bool llama_eval_internal(
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
-
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
@@ -1407,26 +1423,53 @@ static bool llama_eval_internal(
// norm
{
- inpL = ggml_rms_norm(ctx0, inpL);
+ cur = ggml_rms_norm(ctx0, inpL);
- // inpL = inpL*norm(broadcasted)
- inpL = ggml_mul(ctx0, inpL, model.norm);
-
- embeddings = inpL;
+ // cur = cur*norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.norm);
+ embeddings = cur;
}
// lm_head
- inpL = ggml_mul_mat(ctx0, model.output, inpL);
+ cur = ggml_mul_mat(ctx0, model.output, cur);
lctx.use_buf(ctx0, -1);
// logits -> probs
- //inpL = ggml_soft_max_inplace(ctx0, inpL);
+ //cur = ggml_soft_max_inplace(ctx0, cur);
// run the computation
- ggml_build_forward_expand(&gf, inpL);
- ggml_graph_compute (ctx0, &gf);
+ ggml_build_forward_expand(&gf, cur);
+
+#ifdef GGML_USE_METAL
+ if (lctx.ctx_metal && N == 1) {
+ ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+ ggml_metal_get_tensor (lctx.ctx_metal, cur);
+ } else {
+ // IMPORTANT:
+ // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
+ // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
+ // coprocessor.
+ //
+ // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+ // But for now, we have focused only on Matrix x Vector Metal multiplication.
+ //
+ ggml_graph_compute(ctx0, &gf);
+
+ if (lctx.ctx_metal) {
+ // We need to sync the CPU KV cache with the GPU KV cache
+ ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
+ ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
+ }
+ }
+#else
+ ggml_graph_compute(ctx0, &gf);
+#endif
+
+ if (cgraph_fname) {
+ ggml_graph_export(&gf, cgraph_fname);
+ }
#ifdef GGML_PERF
// print timing information per ggml operation (for debugging purposes)
@@ -1440,7 +1483,7 @@ static bool llama_eval_internal(
//}
//embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+ //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
// update kv token count
lctx.model.kv_self.n = n_past + N;
@@ -1451,11 +1494,11 @@ static bool llama_eval_internal(
if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
- memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
}
}
@@ -2251,8 +2294,8 @@ struct llama_context * llama_init_from_file(
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
- params.use_mmap, params.use_mlock, params.vocab_only,
- params.progress_callback, params.progress_callback_user_data)) {
+ params.use_mmap, params.use_mlock, params.vocab_only,
+ params.progress_callback, params.progress_callback_user_data)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
llama_free(ctx);
return nullptr;
@@ -2290,6 +2333,25 @@ struct llama_context * llama_init_from_file(
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
}
+#ifdef GGML_USE_METAL
+ if (params.n_gpu_layers > 0) {
+ // this allocates all Metal resources and memory buffers
+ ctx->ctx_metal = ggml_metal_init();
+
+ if (params.use_mmap) {
+ ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
+ ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+ } else {
+ ggml_metal_add_buffer(ctx->ctx_metal, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
+ ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+ }
+
+ ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
+ ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
+ ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
+ }
+#endif
+
return ctx;
}
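(The Metal context above is only created when params.n_gpu_layers > 0, so existing callers opt in through the usual context parameters. A rough usage sketch, assuming a GGML_USE_METAL build and a placeholder model path, neither of which is defined by this patch:

    struct llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 1; // any value > 0 triggers ggml_metal_init() and the buffer registration above

    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
)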
@@ -2905,7 +2967,7 @@ int llama_eval(
int n_tokens,
int n_past,
int n_threads) {
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+ if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}
@@ -2920,6 +2982,20 @@ int llama_eval(
return 0;
}
+int llama_eval_export(struct llama_context * ctx, const char * fname) {
+ const int n_batch = 1;
+ const int n_ctx = 512 - n_batch;
+
+ const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+ if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+ fprintf(stderr, "%s: failed to eval\n", __func__);
+ return 1;
+ }
+
+ return 0;
+}
+
int llama_tokenize(
struct llama_context * ctx,
const char * text,
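(Finally, a rough usage sketch for the new llama_eval_export() entry point added above: it evaluates a single BOS token near the end of a 512-token context and writes the resulting computation graph to the given file via ggml_graph_export(). The output file name below is a placeholder, not something defined by this patch:

    // ctx obtained from llama_init_from_file() as usual
    if (llama_eval_export(ctx, "llama.ggml") != 0) {
        fprintf(stderr, "graph export failed\n");
    }
)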