about summary refs log tree commit diff
path: root/llama.cpp
diff options
context:
space:
mode:
authorEvan Miller <emmiller@gmail.com>2023-07-10 11:49:56 -0400
committerGitHub <noreply@github.com>2023-07-10 18:49:56 +0300
commit5656d10599bd756dc0f17284e418e704200b43f3 (patch)
treea9aba6c867a268d0bcb90bd9174912774a67ed65 /llama.cpp
parent1d1630996920f889cdc08de26cebf2415958540e (diff)
mpi : add support for distributed inference via MPI (#2099)
* MPI support, first cut * fix warnings, update README * fixes * wrap includes * PR comments * Update CMakeLists.txt * Add GH workflow, fix test * Add info to README * mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099) * mpi : add names for layer inputs + prep ggml_mpi_graph_compute() * mpi : move all MPI logic into ggml-mpi Not tested yet * mpi : various fixes - communication now works but results are wrong * mpi : fix output tensor after MPI compute (still not working) * mpi : fix inference * mpi : minor * Add OpenMPI to GH action * [mpi] continue-on-error: true * mpi : fix after master merge * [mpi] Link MPI C++ libraries to fix OpenMPI * tests : fix new llama_backend API * [mpi] use MPI_INT32_T * mpi : factor out recv / send in functions and reuse * mpi : extend API to allow usage with outer backends (e.g. Metal) --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  98
1 file changed, 71 insertions, 27 deletions
diff --git a/llama.cpp b/llama.cpp
index a491f1c..ad7283f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -19,6 +19,9 @@
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
+#ifdef GGML_USE_MPI
+#include "ggml-mpi.h"
+#endif
#ifdef GGML_USE_K_QUANTS
#ifndef QK_K
#ifdef GGML_QKK_64
@@ -352,6 +355,10 @@ struct llama_context {
ggml_metal_context * ctx_metal = NULL;
#endif
+#ifdef GGML_USE_MPI
+ ggml_mpi_context * ctx_mpi = NULL;
+#endif
+
int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -870,7 +877,7 @@ bool llama_mlock_supported() {
return llama_mlock::SUPPORTED;
}
-void llama_init_backend(bool numa) {
+void llama_backend_init(bool numa) {
ggml_time_init();
// needed to initialize f16 tables
@@ -883,6 +890,16 @@ void llama_init_backend(bool numa) {
if (numa) {
ggml_numa_init();
}
+
+#ifdef GGML_USE_MPI
+ ggml_mpi_backend_init();
+#endif
+}
+
+void llama_backend_free() {
+#ifdef GGML_USE_MPI
+ ggml_mpi_backend_free();
+#endif
}
int64_t llama_time_us() {
@@ -1284,13 +1301,17 @@ static bool llama_eval_internal(
llama_context & lctx,
const llama_token * tokens,
const float * embd,
- const int n_tokens,
- const int n_past,
+ int n_tokens,
+ int n_past,
int n_threads,
const char * cgraph_fname) {
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+#ifdef GGML_USE_MPI
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
+
const int64_t t_start_us = ggml_time_us();
const int N = n_tokens;
@@ -1331,11 +1352,16 @@ static bool llama_eval_internal(
struct ggml_tensor * inpL;
if (tokens) {
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
} else {
+#ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+#endif
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
}
@@ -1353,18 +1379,20 @@ static bool llama_eval_internal(
offload_func_t offload_func_v = llama_nop;
#ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers;
- }
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers;
+ }
#endif // GGML_USE_CUBLAS
for (int il = 0; il < n_layer; ++il) {
+ ggml_format_name(inpL, "layer_inp_%d", il);
+
offload_func_t offload_func = llama_nop;
#ifdef GGML_USE_CUBLAS
@@ -1571,7 +1599,6 @@ static bool llama_eval_internal(
// input for next layer
inpL = cur;
-
}
lctx.use_buf(ctx0, 0);
@@ -1579,7 +1606,6 @@ static bool llama_eval_internal(
// used at the end to optionally extract the embeddings
struct ggml_tensor * embeddings = NULL;
-
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
@@ -1594,7 +1620,6 @@ static bool llama_eval_internal(
embeddings = cur;
}
-
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
ggml_set_name(cur, "result_output");
@@ -1607,6 +1632,10 @@ static bool llama_eval_internal(
// run the computation
ggml_build_forward_expand(&gf, cur);
+#if GGML_USE_MPI
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
@@ -1635,6 +1664,15 @@ static bool llama_eval_internal(
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
#endif
+#if GGML_USE_MPI
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
+ // update kv token count
+ lctx.kv_self.n = n_past + N;
+
+ struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
if (cgraph_fname) {
ggml_graph_export(&gf, cgraph_fname);
}
@@ -1650,23 +1688,17 @@ static bool llama_eval_internal(
// ggml_graph_dump_dot(&gf, NULL, "llama.dot");
//}
- //embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
- // update kv token count
- lctx.kv_self.n = n_past + N;
-
// extract logits
{
auto & logits_out = lctx.logits;
if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
}
}
@@ -2697,6 +2729,18 @@ struct llama_context * llama_new_context_with_model(
}
#endif
+#ifdef GGML_USE_MPI
+ ctx->ctx_mpi = ggml_mpi_init();
+
+ if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+ // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+ const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+ while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+ llama_backend_free();
+ exit(1);
+ }
+#endif
+
return ctx;
}