author     Qingyou Meng <meng.qingyou@gmail.com>      2023-07-08 00:24:01 +0800
committer  GitHub <noreply@github.com>                2023-07-07 19:24:01 +0300
commit     1d656d6360359cfdaaf5d64ed9690047b600dbcb (patch)
tree       ea41daf563633ab0552f24fd0bacce51833e04eb /llama.cpp
parent     72421402834141df6cbdcf595fe46dbd11874dce (diff)
ggml : change ggml_graph_compute() API to not require context (#1999)
* ggml_graph_compute: deprecate using ggml_context, try to resolve issue #287
* rewrite: no longer consider backward compatibility; plan and make_plan
* minor: rename ctx as plan; const
* remove ggml_graph_compute from tests/test-grad0.c, but current change breaks backward
* add static ggml_graph_compute_sugar()
* minor: update comments
* reusable buffers
* ggml : more consistent naming + metal fixes
* ggml : fix docs
* tests : disable grad / opt + minor naming changes
* ggml : add ggml_graph_compute_with_ctx()
  - backwards compatible API
  - deduplicates a lot of copy-paste
* ci : enable test-grad0
* examples : factor out plan allocation into a helper function
* llama : factor out plan stuff into a helper function
* ci : fix env
* llama : fix duplicate symbols + refactor example benchmark
* ggml : remove obsolete assert + refactor n_tasks section
* ggml : fix indentation in switch
* llama : avoid unnecessary bool
* ggml : remove comments from source file and match order in header

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  54
1 file changed, 39 insertions(+), 15 deletions(-)
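The core of the change is visible in the first hunk below: ggml_graph_compute() no longer takes a ggml_context. Instead the caller asks ggml_graph_plan() for a ggml_cplan, supplies the work buffer itself, and passes the plan to ggml_graph_compute(). The following minimal sketch mirrors the ggml_graph_compute_helper() added by this patch; the wrapper name run_graph is illustrative and not part of the commit.

    #include "ggml.h"

    #include <cstdint>
    #include <vector>

    // plan-based execution as introduced by this commit: plan first, then compute
    static void run_graph(struct ggml_cgraph * graph, int n_threads) {
        // ask ggml how much scratch memory this graph needs for n_threads
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

        // the caller now owns the work buffer; a vector keeps it reusable across calls
        std::vector<uint8_t> work;
        if (plan.work_size > 0) {
            work.resize(plan.work_size);
            plan.work_data = work.data();
        }

        // execute the graph; no ggml_context is involved anymore
        ggml_graph_compute(graph, &plan);
    }
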
diff --git a/llama.cpp b/llama.cpp
index 02afdeb..ee6ec09 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -79,6 +79,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
(void) tensor;
}
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+ if (plan.work_size > 0) {
+ buf.resize(plan.work_size);
+ plan.work_data = buf.data();
+ }
+
+ ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{
static std::map<e_model, size_t> k_sizes = {
@@ -321,6 +340,9 @@ struct llama_context {
// input embedding (1-dimensional array: [n_embd])
std::vector<float> embedding;
+ // reusable buffer for `struct ggml_graph_plan.work_data`
+ std::vector<uint8_t> work_buffer;
+
// memory buffers used to evaluate the model
// TODO: move in llama_state
llama_ctx_buffer buf_compute;
@@ -758,7 +780,6 @@ struct llama_model_loader {
};
-
//
// kv cache
//
@@ -1265,7 +1286,7 @@ static bool llama_eval_internal(
const float * embd,
const int n_tokens,
const int n_past,
- const int n_threads,
+ int n_threads,
const char * cgraph_fname) {
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
@@ -1306,10 +1327,11 @@ static bool llama_eval_internal(
struct ggml_context * ctx0 = ggml_init(params);
+ ggml_cgraph gf = {};
+
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -1593,6 +1615,7 @@ static bool llama_eval_internal(
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
+ ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
ggml_metal_get_tensor (lctx.ctx_metal, cur);
} else {
@@ -1612,10 +1635,10 @@ static bool llama_eval_internal(
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
}
#else
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
#endif
if (cgraph_fname) {
@@ -2575,8 +2598,8 @@ void llama_free_model(struct llama_model * model) {
}
struct llama_context * llama_new_context_with_model(
- struct llama_model * model,
- struct llama_context_params params) {
+ struct llama_model * model,
+ struct llama_context_params params) {
if (!model) {
return nullptr;
@@ -2645,7 +2668,7 @@ struct llama_context * llama_new_context_with_model(
#ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) {
// this allocates all Metal resources and memory buffers
- ctx->ctx_metal = ggml_metal_init();
+ ctx->ctx_metal = ggml_metal_init(1);
void * data_ptr = NULL;
size_t data_size = 0;
@@ -2802,6 +2825,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
// read tensors and apply
bool warned = false;
int n_tensors = 0;
+
+ std::vector<uint8_t> work_buffer;
+
while (true) {
int32_t n_dims;
int32_t length;
@@ -2966,8 +2992,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
}
struct ggml_cgraph gf = ggml_build_forward(r);
- gf.n_threads = n_threads;
- ggml_graph_compute(lora_ctx, &gf);
+
+ ggml_graph_compute_helper(work_buffer, &gf, n_threads);
// we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx);
@@ -3120,7 +3146,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
ggml_cgraph gf{};
- gf.n_threads = 1;
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kout3d->data = out;
@@ -3140,7 +3165,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
ggml_free(cpy_ctx);
}
@@ -3226,7 +3251,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
ggml_cgraph gf{};
- gf.n_threads = 1;
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kin3d->data = (void *) inp;
@@ -3246,7 +3270,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
ggml_free(cpy_ctx);
}
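
In short, callers migrate by dropping the gf.n_threads field and passing the thread count at compute time, which is why llama_copy_state_data() and llama_set_state_data() above lose their gf.n_threads = 1 lines. The hypothetical standalone example below shows both the removed pattern and the backwards-compatible ggml_graph_compute_with_ctx() wrapper named in the commit message; that wrapper is not part of this llama.cpp diff, so its argument order (context, graph, thread count) is an assumption here.

    #include "ggml.h"

    #include <cstdio>

    int main() {
        // small scratch context for a toy graph
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // c = a + b on 8 floats
        struct ggml_tensor * a = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 1.0f);
        struct ggml_tensor * b = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 2.0f);
        struct ggml_tensor * c = ggml_add(ctx, a, b);

        struct ggml_cgraph gf = ggml_build_forward(c);

        // old API, removed by this commit:
        //   gf.n_threads = 4;
        //   ggml_graph_compute(ctx, &gf);

        // backwards-compatible path: the wrapper allocates the work buffer inside ctx
        // (argument order assumed: context, graph, thread count)
        ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads =*/ 4);

        printf("c[0] = %f\n", ggml_get_f32_1d(c, 0)); // expect 3.0

        ggml_free(ctx);
        return 0;
    }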