-rw-r--r--  examples/CMakeLists.txt                 1
-rw-r--r--  examples/baby-llama/CMakeLists.txt      4
-rw-r--r--  examples/baby-llama/baby-llama.cpp   1687
-rw-r--r--  ggml.c                               3158
-rw-r--r--  ggml.h                                200
-rw-r--r--  llama.cpp                              16
-rw-r--r--  tests/CMakeLists.txt                    2
-rw-r--r--  tests/test-grad0.c                   1131
-rw-r--r--  tests/test-opt.c                      205
9 files changed, 6150 insertions, 254 deletions
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0973a3f..74d0350 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -36,4 +36,5 @@ else()
add_subdirectory(embedding)
add_subdirectory(save-load-state)
add_subdirectory(benchmark)
+ add_subdirectory(baby-llama)
endif()
diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt
new file mode 100644
index 0000000..d2ce363
--- /dev/null
+++ b/examples/baby-llama/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET baby-llama)
+add_executable(${TARGET} baby-llama.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
new file mode 100644
index 0000000..5573c15
--- /dev/null
+++ b/examples/baby-llama/baby-llama.cpp
@@ -0,0 +1,1687 @@
+#include "ggml.h"
+#include <vector>
+#include <cassert>
+#include <random>
+#include <cstring>
+
+float frand() {
+ return (float)rand()/(float)RAND_MAX;
+}
+
+struct random_normal_distribution {
+ std::mt19937 gen;
+ std::normal_distribution<float> nd;
+ float min;
+ float max;
+};
+
+void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
+ rnd->gen = std::mt19937(seed);
+ rnd->nd = std::normal_distribution<float>{mean, std};
+ rnd->min = min;
+ rnd->max = max;
+}
+
+float frand_normal(struct random_normal_distribution * rnd) {
+ const float r = rnd->nd(rnd->gen);
+ return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
+}
+
+struct ggml_tensor * randomize_tensor(
+ struct ggml_tensor * tensor,
+ int ndims,
+ const int64_t ne[],
+ float fmin,
+ float fmax) {
+
+ switch (ndims) {
+ case 1:
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin;
+ }
+ break;
+ case 2:
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+ }
+ }
+ break;
+ case 3:
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+ }
+ }
+ }
+ break;
+ case 4:
+ for (int i3 = 0; i3 < ne[3]; i3++) {
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+ }
+ }
+ }
+ }
+ break;
+ default:
+ assert(false);
+ };
+
+ return tensor;
+}
+
+struct ggml_tensor * randomize_tensor_normal(
+ struct ggml_tensor * tensor,
+ int ndims,
+ const int64_t ne[],
+ struct random_normal_distribution * rnd) {
+ switch (ndims) {
+ case 1:
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)tensor->data)[i0] = frand_normal(rnd);
+ }
+ break;
+ case 2:
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd);
+ }
+ }
+ break;
+ case 3:
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+ }
+ }
+ }
+ break;
+ case 4:
+ for (int i3 = 0; i3 < ne[3]; i3++) {
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+ }
+ }
+ }
+ }
+ break;
+ default:
+ assert(false);
+ };
+
+ return tensor;
+}
+
+struct llama_hparams {
+ uint32_t n_vocab = 32000;
+ uint32_t n_ctx = 512; // this is provided as user input?
+ uint32_t n_embd = 4096;
+ uint32_t n_mult = 4;
+ uint32_t n_head = 32;
+ uint32_t n_layer = 32;
+ uint32_t n_rot = 64;
+
+ bool operator!=(const llama_hparams & other) const {
+ return memcmp(this, &other, sizeof(llama_hparams));
+ }
+};
+
+uint32_t get_n_ff(const struct llama_hparams* hparams) {
+ const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
+ return n_ff;
+}
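For reference, get_n_ff rounds 2/3 of 4*n_embd up to the next multiple of n_mult. A minimal standalone sketch of that arithmetic (n_ff_for is a hypothetical mirror of get_n_ff, not part of the patch):

    // standalone check of the rounding in get_n_ff
    #include <cassert>
    #include <cstdint>
    static uint32_t n_ff_for(uint32_t n_embd, uint32_t n_mult) {
        return ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; // same integer arithmetic as get_n_ff
    }
    int main() {
        assert(n_ff_for(4096, 4) == 10924); // default hparams above
        assert(n_ff_for(  32, 2) ==    86); // toy model configured in main() further down
        return 0;
    }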
+
+struct llama_hparams_lora {
+ uint32_t n_vocab = 32000;
+ uint32_t n_ctx = 512; // this is provided as user input?
+ uint32_t n_embd = 4096;
+ uint32_t n_mult = 4;
+ uint32_t n_head = 32;
+ uint32_t n_layer = 32;
+ uint32_t n_rot = 64;
+ uint32_t n_lora = 64;
+
+ bool operator!=(const llama_hparams & other) const {
+ return memcmp(this, &other, sizeof(llama_hparams));
+ }
+};
+
+struct llama_layer {
+ // normalization
+ struct ggml_tensor * attention_norm;
+
+ // attention
+ struct ggml_tensor * wq;
+ struct ggml_tensor * wk;
+ struct ggml_tensor * wv;
+ struct ggml_tensor * wo;
+
+ // normalization
+ struct ggml_tensor * ffn_norm;
+
+ // ff
+ struct ggml_tensor * w1;
+ struct ggml_tensor * w2;
+ struct ggml_tensor * w3;
+};
+
+struct llama_layer_lora {
+ // normalization
+ struct ggml_tensor * attention_norm;
+
+ // attention
+ struct ggml_tensor * wqa;
+ struct ggml_tensor * wqb;
+ struct ggml_tensor * wka;
+ struct ggml_tensor * wkb;
+ struct ggml_tensor * wva;
+ struct ggml_tensor * wvb;
+ struct ggml_tensor * woa;
+ struct ggml_tensor * wob;
+
+ // normalization
+ struct ggml_tensor * ffn_norm;
+
+ // ff
+ struct ggml_tensor * w1;
+ struct ggml_tensor * w2;
+ struct ggml_tensor * w3;
+};
+
+
+struct llama_kv_cache {
+ struct ggml_context * ctx = NULL;
+
+ struct ggml_tensor * k;
+ struct ggml_tensor * v;
+
+ // llama_ctx_buffer buf;
+
+ int n; // number of tokens currently in the cache
+};
+
+struct llama_model {
+ struct ggml_context * ctx = NULL;
+
+ llama_hparams hparams;
+
+ struct ggml_tensor * tok_embeddings;
+
+ struct ggml_tensor * norm;
+ struct ggml_tensor * output;
+
+ std::vector<llama_layer> layers;
+};
+
+struct llama_model_lora {
+ struct ggml_context * ctx = NULL;
+
+ llama_hparams_lora hparams;
+
+ struct ggml_tensor * tok_embeddings;
+
+ struct ggml_tensor * norm;
+ struct ggml_tensor * outputa;
+ struct ggml_tensor * outputb;
+
+ std::vector<llama_layer_lora> layers;
+};
+
+void init_model(struct llama_model * model) {
+ const auto & hparams = model->hparams;
+
+ const uint32_t n_embd = hparams.n_embd;
+ const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_vocab = hparams.n_vocab;
+
+ const uint32_t n_ff = get_n_ff(&hparams);
+
+ struct ggml_context * ctx = model->ctx;
+
+ model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
+ model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd});
+ model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab});
+
+ model->layers.resize(n_layer);
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ auto & layer = model->layers[i];
+
+ // std::string layers_i = "layers." + std::to_string(i);
+
+ layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
+
+ layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
+ layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
+ layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
+ layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
+
+ layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});
+
+ layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
+ layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
+ layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+ }
+}
+
+
+void init_model_lora(struct llama_model_lora * model) {
+ const auto & hparams = model->hparams;
+
+ const uint32_t n_embd = hparams.n_embd;
+ const uint32_t n_mult = hparams.n_mult;
+ const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_vocab = hparams.n_vocab;
+ const uint32_t n_lora = hparams.n_lora;
+
+ const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
+
+ struct ggml_context * ctx = model->ctx;
+
+ model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
+ model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd});
+ model->outputa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight", {n_embd, n_vocab});
+ model->outputb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // ("output.weight", {n_embd, n_vocab});
+
+ model->layers.resize(n_layer);
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ auto & layer = model->layers[i];
+
+ // std::string layers_i = "layers." + std::to_string(i);
+
+ layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
+
+ layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
+ layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
+ layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
+ layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
+ layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
+ layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
+ layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
+ layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
+
+ layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});
+
+ layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
+ layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
+ layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+ }
+}
+
+void set_param_model(struct llama_model * model) {
+ const auto& hparams = model->hparams;
+
+ const uint32_t n_layer = hparams.n_layer;
+
+ struct ggml_context* ctx = model->ctx;
+
+ ggml_set_param(ctx, model->tok_embeddings);
+ ggml_set_param(ctx, model->norm);
+ ggml_set_param(ctx, model->output);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ auto & layer = model->layers[i];
+
+ ggml_set_param(ctx, layer.attention_norm);
+ ggml_set_param(ctx, layer.wq);
+ ggml_set_param(ctx, layer.wk);
+ ggml_set_param(ctx, layer.wv);
+ ggml_set_param(ctx, layer.wo);
+ ggml_set_param(ctx, layer.ffn_norm);
+ ggml_set_param(ctx, layer.w1);
+ ggml_set_param(ctx, layer.w2);
+ ggml_set_param(ctx, layer.w3);
+ }
+}
+
+void set_param_model_lora(struct llama_model_lora * model) {
+ const auto& hparams = model->hparams;
+
+ const uint32_t n_layer = hparams.n_layer;
+
+ struct ggml_context* ctx = model->ctx;
+
+ ggml_set_param(ctx, model->tok_embeddings);
+ ggml_set_param(ctx, model->norm);
+ ggml_set_param(ctx, model->outputa);
+ ggml_set_param(ctx, model->outputb);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ auto & layer = model->layers[i];
+
+ ggml_set_param(ctx, layer.attention_norm);
+ ggml_set_param(ctx, layer.wqa);
+ ggml_set_param(ctx, layer.wqb);
+ ggml_set_param(ctx, layer.wka);
+ ggml_set_param(ctx, layer.wkb);
+ ggml_set_param(ctx, layer.wva);
+ ggml_set_param(ctx, layer.wvb);
+ ggml_set_param(ctx, layer.woa);
+ ggml_set_param(ctx, layer.wob);
+ ggml_set_param(ctx, layer.ffn_norm);
+ ggml_set_param(ctx, layer.w1);
+ ggml_set_param(ctx, layer.w2);
+ ggml_set_param(ctx, layer.w3);
+ }
+}
+
+void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
+ const auto & hparams = model->hparams;
+
+ const uint32_t n_layer = hparams.n_layer;
+
+ struct random_normal_distribution rnd;
+ init_random_normal_distribution(&rnd, seed, mean, std, min, max);
+ randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
+ randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
+ randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ auto & layer = model->layers[i];
+ randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+
+ randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
+ randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
+ randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
+ randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);
+
+ randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+
+ randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
+ randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
+ randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+ }
+}
+
+
+void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
+ const auto & hparams = model->hparams;
+
+ const uint32_t n_layer = hparams.n_layer;
+
+ struct random_normal_distribution rnd;
+ init_random_normal_distribution(&rnd, seed, mean, std, min, max);
+ randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
+ randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
+ randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd);
+ randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ auto & layer = model->layers[i];
+ randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+
+ randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
+ randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
+ randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
+ randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
+ randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
+ randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
+ randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
+ randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
+
+ randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+
+ randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
+ randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
+ randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+ }
+}
+
+bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
+ const auto & hparams = model->hparams;
+
+ const uint32_t n_ctx = hparams.n_ctx;
+ const uint32_t n_embd = hparams.n_embd;
+ const uint32_t n_layer = hparams.n_layer;
+
+ const int64_t n_mem = n_layer*n_ctx*n_batch;
+ const int64_t n_elements = n_embd*n_mem;
+
+ // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+
+ // struct ggml_init_params params;
+ // params.mem_size = cache.buf.size;
+ // params.mem_buffer = cache.buf.addr;
+ // params.no_alloc = false;
+ if (!cache->ctx) {
+ struct ggml_init_params params;
+ params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
+ params.mem_buffer = NULL;
+ params.no_alloc = false;
+
+ cache->ctx = ggml_init(params);
+
+ if (!cache->ctx) {
+ fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+ return false;
+ }
+ }
+
+ cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
+ cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
+
+ return true;
+}
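The mem_size above reserves room for the two f32 cache tensors (K and V) of n_embd*n_ctx*n_batch*n_layer elements each, plus roughly 2 MiB of headroom for ggml bookkeeping. A hypothetical helper (not part of the patch) that makes the sizing explicit:

    #include <cstddef>
    #include <cstdint>
    static size_t kv_cache_mem_size(uint32_t n_embd, uint32_t n_ctx, uint32_t n_layer, int n_batch) {
        const int64_t n_elements = (int64_t)n_embd*n_layer*n_ctx*n_batch;
        // toy model in main(): 32*8*1*8 = 2048 elements -> ~2 MiB + 16 KiB total
        return (size_t)(2*n_elements*(int64_t)sizeof(float)) + 2u*1024*1024;
    }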
+
+bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
+ const auto & hparams = model->hparams;
+
+ const uint32_t n_ctx = hparams.n_ctx;
+ const uint32_t n_embd = hparams.n_embd;
+ const uint32_t n_layer = hparams.n_layer;
+
+ const int64_t n_mem = n_layer*n_ctx*n_batch;
+ const int64_t n_elements = n_embd*n_mem;
+
+ // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+
+ // struct ggml_init_params params;
+ // params.mem_size = cache.buf.size;
+ // params.mem_buffer = cache.buf.addr;
+ // params.no_alloc = false;
+ if (!cache->ctx) {
+ struct ggml_init_params params;
+ params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
+ params.mem_buffer = NULL;
+ params.no_alloc = false;
+
+ cache->ctx = ggml_init(params);
+
+ if (!cache->ctx) {
+ fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+ return false;
+ }
+ }
+
+ cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
+ cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
+
+ return true;
+}
+
+struct ggml_tensor * forward(
+ struct llama_model * model,
+ struct llama_kv_cache * cache,
+ struct ggml_context * ctx0,
+ struct ggml_cgraph * gf,
+ struct ggml_tensor * tokens_input,
+ const int n_tokens,
+ const int n_past) {
+
+ const int N = n_tokens;
+
+ struct llama_kv_cache& kv_self = *cache;
+ const auto & hparams = model->hparams;
+ const int n_ctx = hparams.n_ctx;
+ const int n_embd = hparams.n_embd;
+ const int n_layer = hparams.n_layer;
+ const int n_head = hparams.n_head;
+ const int n_rot = hparams.n_rot;
+
+ struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));
+
+ struct ggml_tensor * kc = kv_self.k;
+ struct ggml_tensor * vc = kv_self.v;
+
+ // inpL shape [n_embd,N,1,1]
+ struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ struct ggml_tensor * cur;
+
+ // lctx.use_buf(ctx0, 0);
+
+ // norm
+ {
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_rms_norm(ctx0, inpL);
+
+ // cur = attention_norm*cur
+ cur = ggml_mul(ctx0,
+ ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
+ cur);
+ }
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ // wq shape [n_embd, n_embd, 1, 1]
+ // wk shape [n_embd, n_embd, 1, 1]
+ // Qcur shape [n_embd/n_head, n_head, N, 1]
+ // Kcur shape [n_embd/n_head, n_head, N, 1]
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+
+ // store key and value to memory
+ {
+ // compute the transposed [N, n_embd] V matrix
+ // wv shape [n_embd, n_embd, 1, 1]
+ // Vcur shape [n_embd, N, 1, 1]
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));
+
+ // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+ // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+ // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
+ // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
+
+ /* {
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+
+ // important: storing RoPE-ed version of K in the KV cache!
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ } //*/
+
+ kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+ }
+
+ // Qcur shape [n_embd/n_head, n_head, N, 1]
+ // Q shape [n_embd/n_head, N, n_head, 1]
+ struct ggml_tensor * Q =
+ ggml_permute(ctx0,
+ Qcur,
+ 0, 2, 1, 3);
+
+ // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+ // K shape [n_embd/n_head, n_past + N, n_head, 1]
+ struct ggml_tensor * K =
+ ggml_permute(ctx0,
+ ggml_reshape_3d(ctx0,
+ ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
+ n_embd/n_head, n_head, n_past + N),
+ 0, 2, 1, 3);
+
+ // K * Q
+ // KQ shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+ // KQ_scaled = KQ / sqrt(n_embd/n_head)
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled =
+ ggml_scale(ctx0,
+ KQ,
+ ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+
+ // KQ_masked = mask_past(KQ_scaled)
+ // KQ_masked shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+
+ // KQ = soft_max(KQ_masked)
+ // KQ_soft_max shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+
+ // split cached V into n_head heads
+ //// V shape [n_past + N, n_embd/n_head, n_head, 1]
+ // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
+ struct ggml_tensor * V =
+ ggml_view_3d(ctx0, vc,
+ n_past + N, n_embd/n_head, n_head,
+ n_ctx*ggml_element_size(vc),
+ n_ctx*ggml_element_size(vc)*n_embd/n_head,
+ il*n_ctx*ggml_element_size(vc)*n_embd);
+
+ // KQV shape [n_embd/n_head, N, n_head, 1]
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
+ // KQV_merged shape [n_embd/n_head, n_head, N, 1]
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ // KQV_merged shape
+
+ // cur = KQV_merged.contiguous().view(n_embd, N)
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
+ // cur = ggml_cpy(ctx0,
+ // KQV_merged,
+ // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+
+ // projection (no bias)
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_mul_mat(ctx0,
+ model->layers[il].wo,
+ cur);
+ }
+
+ // lctx.use_buf(ctx0, 1);
+
+ // inpFF shape [n_embd,N,1,1]
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+
+ // feed-forward network
+ {
+ // norm
+ {
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_rms_norm(ctx0, inpFF);
+
+ // cur = ffn_norm*cur
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_mul(ctx0,
+ ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
+ cur);
+ }
+
+ // tmp shape [n_ff,N,1,1]
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+ model->layers[il].w3,
+ cur);
+
+ // cur shape [n_ff,N,1,1]
+ cur = ggml_mul_mat(ctx0,
+ model->layers[il].w1,
+ cur);
+
+ // SILU activation
+ // cur shape [n_ff,N,1,1]
+ cur = ggml_silu(ctx0, cur);
+
+ // cur shape [n_ff,N,1,1]
+ cur = ggml_mul(ctx0, cur, tmp);
+
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_mul_mat(ctx0,
+ model->layers[il].w2,
+ cur);
+ }
+
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_add(ctx0, cur, inpFF);
+
+ // input for next layer
+ // inpL shape [n_embd,N,1,1]
+ inpL = cur;
+ }
+
+ // norm
+ {
+
+ // inpL shape [n_embd,N,1,1]
+ inpL = ggml_rms_norm(ctx0, inpL);
+
+ // inpL = norm*inpL
+ // inpL shape [n_embd,N,1,1]
+ inpL = ggml_mul(ctx0,
+ ggml_repeat(ctx0, model->norm, inpL),
+ inpL);
+
+ //embeddings = inpL;
+ }
+
+ // lm_head
+ // inpL shape [n_vocab,N,1,1]
+ inpL = ggml_mul_mat(ctx0, model->output, inpL);
+
+ // run the computation
+ ggml_build_forward_expand(gf, inpL);
+
+ return inpL;
+}
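The single-sequence cache above is addressed as a flat [n_embd, n_ctx, n_layer] buffer: layer il starts at element il*n_ctx*n_embd and position pos adds another pos*n_embd. A hypothetical helper (not part of the patch) matching the offsets passed to ggml_set_1d and ggml_view_1d:

    #include <cstddef>
    // byte offset of position `pos` in layer `il` of the flat K cache
    static size_t k_cache_offset(size_t elem_size, int n_embd, int n_ctx, int il, int pos) {
        return elem_size * (size_t)n_embd * ((size_t)il*n_ctx + pos);
    }
    // ggml_set_1d writes Kcur at k_cache_offset(esz, n_embd, n_ctx, il, n_past);
    // the K view reads (n_past + N)*n_embd elements starting at k_cache_offset(esz, n_embd, n_ctx, il, 0).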
+
+void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
+ GGML_ASSERT(tensor->n_dims == 1);
+ GGML_ASSERT(tensor->ne[0] == ne0);
+}
+
+void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
+ GGML_ASSERT(tensor->n_dims == 2);
+ GGML_ASSERT(tensor->ne[0] == ne0);
+ GGML_ASSERT(tensor->ne[1] == ne1);
+}
+
+void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
+ GGML_ASSERT(tensor->n_dims == 3);
+ GGML_ASSERT(tensor->ne[0] == ne0);
+ GGML_ASSERT(tensor->ne[1] == ne1);
+ GGML_ASSERT(tensor->ne[2] == ne2);
+}
+
+void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+ GGML_ASSERT(tensor->n_dims == 4);
+ GGML_ASSERT(tensor->ne[0] == ne0);
+ GGML_ASSERT(tensor->ne[1] == ne1);
+ GGML_ASSERT(tensor->ne[2] == ne2);
+ GGML_ASSERT(tensor->ne[3] == ne3);
+}
+
+struct ggml_tensor * forward_batch(
+ struct llama_model * model,
+ struct llama_kv_cache * cache,
+ struct ggml_context * ctx0,
+ struct ggml_cgraph * gf,
+ struct ggml_tensor * tokens_input,
+ const int n_tokens,
+ const int n_past,
+ const int n_batch) {
+
+ const int N = n_tokens;
+
+ struct llama_kv_cache& kv_self = *cache;
+ const auto & hparams = model->hparams;
+ const int n_ctx = hparams.n_ctx;
+ const int n_vocab = hparams.n_vocab;
+ const int n_embd = hparams.n_embd;
+ const int n_layer = hparams.n_layer;
+ const int n_head = hparams.n_head;
+ const int n_rot = hparams.n_rot;
+ const int n_ff = get_n_ff(&hparams);
+
+ struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
+ memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);
+
+ struct ggml_tensor * kc = kv_self.k;
+ struct ggml_tensor * vc = kv_self.v;
+
+ // inpL shape [n_embd,N*n_batch,1]
+ struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
+ assert_shape_2d(inpL, n_embd, N*n_batch);
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ struct ggml_tensor * cur;
+
+ // lctx.use_buf(ctx0, 0);
+
+ // norm
+ {
+ // cur shape [n_embd,N*n_batch,1,1]
+ cur = ggml_rms_norm(ctx0, inpL);
+ assert_shape_2d(cur, n_embd, N*n_batch);
+
+ // cur = attention_norm*cur
+ cur = ggml_mul(ctx0,
+ ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
+ cur);
+ assert_shape_2d(cur, n_embd, N*n_batch);
+ }
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ // wq shape [n_embd, n_embd, 1, 1]
+ // wk shape [n_embd, n_embd, 1, 1]
+ // Qcur shape [n_embd/n_head, n_head, N, n_batch]
+ // Kcur shape [n_embd/n_head, n_head, N, n_batch]
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+ assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
+ assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
+
+ // store key and value to memory
+ {
+ // compute the transposed [N, n_embd] V matrix
+ // wv shape [n_embd, n_embd, 1, 1]
+ // Vcur shape [N, n_embd, n_batch, 1]
+ struct ggml_tensor * Vcur = ggml_cont(ctx0,
+ ggml_permute(ctx0,
+ ggml_reshape_3d(ctx0,
+ ggml_mul_mat(ctx0,
+ model->layers[il].wv,
+ cur),
+ n_embd, N, n_batch),
+ 1, 0, 2, 3));
+
+ assert_shape_3d(Vcur, N, n_embd, n_batch);
+
+ // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
+ // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+ // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il]
+ // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]
+
+ /* {
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+
+ // important: storing RoPE-ed version of K in the KV cache!
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ } //*/
+
+ kc = ggml_set_2d(ctx0, kc,
+ ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch),
+ ggml_element_size(kc)*n_embd*n_ctx,
+ (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past));
+ vc = ggml_set_2d(ctx0, vc,
+ ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch),
+ ggml_element_size(vc)*n_ctx*n_embd,
+ ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx));
+
+ assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer);
+ assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer);
+ }
+
+ // Qcur shape [n_embd/n_head, n_head, N, n_batch]
+ // Q shape [n_embd/n_head, N, n_head, n_batch]
+ struct ggml_tensor * Q =
+ ggml_permute(ctx0,
+ Qcur,
+ 0, 2, 1, 3);
+ assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);
+
+ // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
+ // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
+ struct ggml_tensor * K =
+ ggml_permute(ctx0,
+ ggml_reshape_4d(ctx0,
+ ggml_view_3d(ctx0,
+ kc,
+ n_embd,
+ (n_past + N),
+ n_batch,
+ n_embd*ggml_element_size(kc),
+ n_ctx*n_embd*ggml_element_size(kc),
+ il*n_batch*n_ctx*n_embd*ggml_element_size(kc)),
+ n_embd/n_head, n_head, n_past + N, n_batch),
+ 0, 2, 1, 3);
+ assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch);
+
+ // K * Q
+ // KQ shape [n_past + N, N, n_head, n_batch]
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ assert_shape_4d(KQ, n_past + N, N, n_head, n_batch);
+
+ // KQ_scaled = KQ / sqrt(n_embd/n_head)
+ // KQ_scaled shape [n_past + N, N, n_head, n_batch]
+ struct ggml_tensor * KQ_scaled =
+ ggml_scale(ctx0,
+ KQ,
+ ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+ assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);
+
+ // KQ_masked = mask_past(KQ_scaled)
+ // KQ_masked shape [n_past + N, N, n_head, n_batch]
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch);
+
+ // KQ = soft_max(KQ_masked)
+ // KQ_soft_max shape [n_past + N, N, n_head, n_batch]
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);
+
+ // split cached V into n_head heads
+ // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+ // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
+ struct ggml_tensor * V =
+ ggml_view_4d(ctx0, vc,
+ n_past + N, n_embd/n_head, n_head, n_batch,
+ ggml_element_size(vc)*n_ctx,
+ ggml_element_size(vc)*n_ctx*n_embd/n_head,
+ ggml_element_size(vc)*n_ctx*n_embd,
+ il*n_batch*n_ctx*n_embd*ggml_element_size(vc));
+ assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch);
+
+ // KQV shape [n_embd/n_head, N, n_head, n_batch]
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);
+
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
+ // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
+ // KQV_merged shape
+
+ // cur = KQV_merged.contiguous().view(n_embd, N)
+ // cur shape [n_embd,N*n_batch,1,1]
+ cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
+ assert_shape_2d(cur, n_embd, N*n_batch);
+ // cur = ggml_cpy(ctx0,
+ // KQV_merged,
+ // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+
+ // projection (no bias)
+ // cur shape [n_embd,N*n_batch,1,1]
+ cur = ggml_mul_mat(ctx0,
+ model->layers[il].wo,
+ cur);
+ assert_shape_2d(cur, n_embd, N*n_batch);
+ }
+
+ // lctx.use_buf(ctx0, 1);
+
+ // inpFF shape [n_embd,N*n_batch,1,1]
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+ assert_shape_2d(inpFF, n_embd, N*n_batch);
+
+ // feed-forward network
+ {
+ // norm
+ {
+ // cur shape [n_embd,N*n_batch,1,1]
+ cur = ggml_rms_norm(ctx0, inpFF);
+ assert_shape_2d(cur, n_embd, N*n_batch);
+
+ // cur = ffn_norm*cur
+ // cur shape [n_embd,N*n_batch,1,1]
+ cur = ggml_mul(ctx0,
+ ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
+ cur);
+ assert_shape_2d(cur, n_embd, N*n_batch);
+ }
+
+ // tmp shape [n_ff,N*n_batch,1,1]
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+ model->layers[il].w3,
+ cur);
+ assert_shape_2d(tmp, n_ff, N*n_batch);
+
+ // cur shape [n_ff,N*n_batch,1,1]
+ cur = ggml_mul_mat(ctx0,
+ model->layers[il].w1,
+ cur);
+ assert_shape_2d(cur, n_ff, N*n_batch);
+
+ // SILU activation
+ // cur shape [n_ff,N*n_batch,1,1]
+ cur = ggml_silu(ctx0, cur);
+ assert_shape_2d(cur, n_ff, N*n_batch);
+
+ // cur shape [n_ff,N*n_batch,1,1]
+ cur = ggml_mul(ctx0, cur, tmp);
+ assert_shape_2d(cur, n_ff, N*n_batch);
+
+ // cur shape [n_embd,N*n_batch,1,1]
+ cur = ggml_mul_mat(ctx0,
+ model->layers[il].w2,
+ cur);
+ assert_shape_2d(cur, n_embd, N*n_batch);
+ }
+
+ // cur shape [n_embd,N*n_batch,1,1]
+ cur = ggml_add(ctx0, cur, inpFF);
+ assert_shape_2d(cur, n_embd, N*n_batch);
+
+ // input for next layer
+ // inpL shape [n_embd,N*n_batch,1,1]
+ inpL = cur;
+ assert_shape_2d(inpL, n_embd, N*n_batch);
+ }
+
+ // norm
+ {
+
+ // inpL shape [n_embd,N*n_batch,1,1]
+ inpL = ggml_rms_norm(ctx0, inpL);
+ assert_shape_2d(inpL, n_embd, N*n_batch);
+
+ // inpL = norm*inpL
+ // inpL shape [n_embd,N*n_batch,1,1]
+ inpL = ggml_mul(ctx0,
+ ggml_repeat(ctx0, model->norm, inpL),
+ inpL);
+
+ assert_shape_2d(inpL, n_embd, N*n_batch);
+
+ //embeddings = inpL;
+ }
+
+ // lm_head
+ // inpL shape [n_vocab,N*n_batch,1,1]
+ inpL = ggml_mul_mat(ctx0, model->output, inpL);
+ assert_shape_2d(inpL, n_vocab, N*n_batch);
+
+ {
+ // inpL shape [n_vocab,N,n_batch,1]
+ inpL = ggml_reshape_3d(ctx0,
+ inpL,
+ n_vocab, N, n_batch);
+ assert_shape_3d(inpL, n_vocab, N, n_batch);
+ }
+
+ // run the computation
+ ggml_build_forward_expand(gf, inpL);
+
+ return inpL;
+}
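In the batched variant the K cache is addressed as [n_embd, n_ctx, n_batch, n_layer]: ggml_set_2d above writes one [n_embd*N, n_batch] slab per layer with a per-batch row stride of n_embd*n_ctx elements. A hypothetical helper (not part of the patch) with the same element arithmetic:

    #include <cstdint>
    // element offset of (layer il, position pos) in the batched K cache
    static int64_t k_cache_elem_offset(int n_embd, int n_ctx, int n_batch, int il, int pos) {
        return (int64_t)n_embd * ((int64_t)il*n_batch*n_ctx + pos);
    }
    // ggml_set_2d uses this offset times the element size; the batch index advances by n_embd*n_ctx elements.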
+
+
+struct ggml_tensor * forward_lora(
+ struct llama_model_lora * model,
+ struct llama_kv_cache * cache,
+ struct ggml_context * ctx0,
+ struct ggml_cgraph * gf,
+ struct ggml_tensor * tokens_input,
+ const int n_tokens,
+ const int n_past) {
+
+ const int N = n_tokens;
+
+ struct llama_kv_cache& kv_self = *cache;
+ const auto & hparams = model->hparams;
+
+ const int n_ctx = hparams.n_ctx;
+ const int n_embd = hparams.n_embd;
+ const int n_layer = hparams.n_layer;
+ const int n_head = hparams.n_head;
+ const int n_rot = hparams.n_rot;
+
+ struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));
+
+ struct ggml_tensor * kc = kv_self.k;
+ struct ggml_tensor * vc = kv_self.v;
+
+ // inpL shape [n_embd,N,1,1]
+ struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ struct ggml_tensor * cur;
+
+ // norm
+ {
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_rms_norm(ctx0, inpL);
+
+ // cur = attention_norm*cur
+ cur = ggml_mul(ctx0,
+ ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
+ cur);
+ }
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ // wq shape [n_embd, n_embd, 1, 1]
+ // wk shape [n_embd, n_embd, 1, 1]
+ // Qcur shape [n_embd/n_head, n_head, N, 1]
+ // Kcur shape [n_embd/n_head, n_head, N, 1]
+ struct ggml_tensor * Qcur = ggml_rope(ctx0,
+ ggml_reshape_3d(ctx0,
+ ggml_mul_mat(ctx0,
+ model->layers[il].wqa,
+ ggml_mul_mat(ctx0,
+ model->layers[il].wqb,
+ cur)),
+ n_embd/n_head, n_head, N),
+ n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope(ctx0,
+ ggml_reshape_3d(ctx0,
+ ggml_mul_mat(ctx0,
+ model->layers[il].wka,
+ ggml_mul_mat(ctx0,
+ model->layers[il].wkb,
+ cur)),
+ n_embd/n_head, n_head, N),
+ n_past, n_rot, 0);
+
+ // store key and value to memory
+ {
+ // compute the transposed [N, n_embd] V matrix
+ // wv shape [n_embd, n_embd, 1, 1]
+ // Vcur shape [n_embd, N, 1, 1]
+ struct ggml_tensor * Vcur = ggml_cont(ctx0,
+ ggml_transpose(ctx0,
+ ggml_reshape_2d(ctx0,
+ ggml_mul_mat(ctx0,
+ model->layers[il].wva,
+ ggml_mul_mat(ctx0,
+ model->layers[il].wvb,
+ cur)),
+ n_embd, N)));
+
+ // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+ // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+ // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
+ // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
+
+ /* {
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+
+ // important: storing RoPE-ed version of K in the KV cache!
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ } //*/
+
+ kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ vc = ggml_set_2d(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+ }
+
+ // Qcur shape [n_embd/n_head, n_head, N, 1]
+ // Q shape [n_embd/n_head, N, n_head, 1]
+ struct ggml_tensor * Q =
+ ggml_permute(ctx0,
+ Qcur,
+ 0, 2, 1, 3);
+
+ // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+ // K shape [n_embd/n_head, n_past + N, n_head, 1]
+ struct ggml_tensor * K =
+ ggml_permute(ctx0,
+ ggml_reshape_3d(ctx0,
+ ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
+ n_embd/n_head, n_head, n_past + N),
+ 0, 2, 1, 3);
+
+ // K * Q
+ // KQ shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+ // KQ_scaled = KQ / sqrt(n_embd/n_head)
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled =
+ ggml_scale(ctx0,
+ KQ,
+ ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+
+ // KQ_masked = mask_past(KQ_scaled)
+ // KQ_masked shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+
+ // KQ = soft_max(KQ_masked)
+ // KQ_soft_max shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+
+ // split cached V into n_head heads
+ //// V shape [n_past + N, n_embd/n_head, n_head, 1]
+ // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
+ struct ggml_tensor * V =
+ ggml_view_3d(ctx0, vc,
+ n_past + N, n_embd/n_head, n_head,
+ n_ctx*ggml_element_size(vc),
+ n_ctx*ggml_element_size(vc)*n_embd/n_head,
+ il*n_ctx*ggml_element_size(vc)*n_embd);
+
+ // KQV shape [n_embd/n_head, N, n_head, 1]
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
+ // KQV_merged shape [n_embd/n_head, n_head, N, 1]
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ // KQV_merged shape
+
+ // cur = KQV_merged.contiguous().view(n_embd, N)
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
+ // cur = ggml_cpy(ctx0,
+ // KQV_merged,
+ // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+
+ // projection (no bias)
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_mul_mat(ctx0,
+ model->layers[il].woa,
+ ggml_mul_mat(ctx0,
+ model->layers[il].wob,
+ cur));
+ }
+
+ // inpFF shape [n_embd,N,1,1]
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+
+ // feed-forward network
+ {
+ // norm
+ {
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_rms_norm(ctx0, inpFF);
+
+ // cur = ffn_norm*cur
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_mul(ctx0,
+ ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
+ cur);
+ }
+
+ // tmp shape [n_ff,N,1,1]
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+ model->layers[il].w3,
+ cur);
+
+ // cur shape [n_ff,N,1,1]
+ cur = ggml_mul_mat(ctx0,
+ model->layers[il].w1,
+ cur);
+
+ // SILU activation
+ // cur shape [n_ff,N,1,1]
+ cur = ggml_silu(ctx0, cur);
+
+ // cur shape [n_ff,N,1,1]
+ cur = ggml_mul(ctx0, cur, tmp);
+
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_mul_mat(ctx0,
+ model->layers[il].w2,
+ cur);
+ }
+
+ // cur shape [n_embd,N,1,1]
+ cur = ggml_add(ctx0, cur, inpFF);
+
+ // input for next layer
+ // inpL shape [n_embd,N,1,1]
+ inpL = cur;
+ }
+
+ // norm
+ {
+
+ // inpL shape [n_embd,N,1,1]
+ inpL = ggml_rms_norm(ctx0, inpL);
+
+ // inpL = norm*inpL
+ // inpL shape [n_embd,N,1,1]
+ inpL = ggml_mul(ctx0,
+ ggml_repeat(ctx0, model->norm, inpL),
+ inpL);
+
+ //embeddings = inpL;
+ }
+
+
+ // lm_head
+ // inpL shape [n_vocab,N,1,1]
+ inpL = ggml_mul_mat(ctx0,
+ model->outputa,
+ ggml_mul_mat(ctx0,
+ model->outputb,
+ inpL));
+
+ // ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+ // run the computation
+ ggml_build_forward_expand(gf, inpL);
+
+ return inpL;
+}
+
+void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+ assert(logits->n_dims == 2);
+ assert(probs->n_dims == 2);
+ assert(best_samples->n_dims == 1);
+ assert(logits->ne[1] == best_samples->ne[0]);
+ assert(logits->ne[0] == probs->ne[0]);
+ assert(logits->ne[1] == probs->ne[1]);
+ for (int i = 0; i < logits->ne[1]; ++i) {
+ float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]);
+ ggml_set_i32_1d(best_samples, i, 0);
+ for (int k = 0; k < logits->ne[0]; ++k) {
+ float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
+ if (logit > max_logit) {
+ max_logit = logit;
+ ggml_set_i32_1d(best_samples, i, k);
+ }
+ }
+ float psum = 0;
+ for (int k = 0; k < logits->ne[0]; ++k) {
+ float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
+ float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit);
+ psum += p;
+ ggml_set_f32_1d(probs, i * probs->ne[0] + k, p);
+ }
+ for (int k = 0; k < logits->ne[0]; ++k) {
+ float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
+ ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum);
+ }
+ }
+}
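sample_softmax is greedy decoding plus a per-row softmax. A standalone sketch of the same computation over a plain float row, without the ggml accessors (softmax_row is hypothetical, not part of the patch):

    #include <cmath>
    #include <vector>
    // greedy pick + normalized softmax for one row of logits
    static int softmax_row(const float * logits, int n_vocab, std::vector<float> & probs) {
        int   best = 0;
        float max_logit = logits[0];
        for (int k = 1; k < n_vocab; ++k) {
            if (logits[k] > max_logit) { max_logit = logits[k]; best = k; }
        }
        probs.resize(n_vocab);
        float psum = 0.0f;
        for (int k = 0; k < n_vocab; ++k) {
            probs[k] = (logits[k] == -INFINITY) ? 0.0f : expf(logits[k] - max_logit);
            psum += probs[k];
        }
        for (int k = 0; k < n_vocab; ++k) probs[k] /= psum;
        return best; // index of the most probable token
    }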
+
+void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+ GGML_ASSERT(best_samples->n_dims == 2);
+ GGML_ASSERT(logits->n_dims == 3);
+ GGML_ASSERT(probs->n_dims == 3);
+ int n_tokens = best_samples->ne[0];
+ int n_batch = best_samples->ne[1];
+ int n_vocab = logits->ne[0];
+ GGML_ASSERT(n_tokens == logits->ne[1]);
+ GGML_ASSERT(n_batch == logits->ne[2]);
+ GGML_ASSERT(n_vocab == probs->ne[0]);
+ GGML_ASSERT(n_tokens == probs->ne[1]);
+ GGML_ASSERT(n_batch == probs->ne[2]);
+
+ for (int k = 0; k < n_batch; ++k) {
+ struct ggml_tensor * best_samples_k = ggml_view_1d(ctx,
+ best_samples,
+ best_samples->ne[0],
+ k*best_samples->nb[1]);
+ struct ggml_tensor * logits_k = ggml_view_2d(ctx,
+ logits,
+ logits->ne[0],
+ logits->ne[1],
+ logits->nb[1],
+ k*logits->nb[2]);
+ struct ggml_tensor * probs_k = ggml_view_2d(ctx,
+ probs,
+ probs->ne[0],
+ probs->ne[1],
+ probs->nb[1],
+ k*probs->nb[2]);
+ sample_softmax(logits_k, probs_k, best_samples_k);
+ }
+}
+
+void print_row(struct ggml_tensor * probs, int i) {
+ for (int k = 0; k < probs->ne[0]; ++k) {
+ float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
+ printf(" %.2f", p);
+ }
+ printf("\n");
+}
+
+void print_matrix(struct ggml_tensor * probs) {
+ assert(probs->n_dims == 2);
+ for (int i = 0; i < probs->ne[1]; ++i) {
+ for (int k = 0; k < probs->ne[0]; ++k) {
+ float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
+ printf(" %.2f", p);
+ }
+ printf("\n");
+ }
+}
+
+void print_token(int token, int n_vocab) {
+ for (int k = 0; k < token; ++k) {
+ printf(" ");
+ }
+ printf("X");
+ for (int k = token+1; k < n_vocab; ++k) {
+ printf(" ");
+ }
+ printf("\n");
+}
+
+void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
+ for (int i=0; i<tokens->ne[0]; ++i) {
+ int token = ggml_get_i32_1d(tokens, i);
+ print_token(token, n_vocab);
+ }
+}
+
+void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+ int n_tokens = tokens_input->ne[0];
+ int n_vocab = targets->ne[0];
+ float randomness = 0.0f;
+ // ggml_set_zero(targets);
+ ggml_set_f32(targets, -1.0f);
+ ggml_set_i32_1d(tokens_input, 0, 0);
+ for (int i=1; i<n_tokens+1; ++i) {
+ float x = example_id + i * 3.14159f * 2.0f * 1.0f * 0.5f / n_tokens;
+ float y = sinf(x);//*cosf(x*1.1f+1.0f);
+ float z = (y+1.0f)*0.5f; // scale to [0..1]
+ z += (frand()-0.5f)*(randomness/n_vocab);
+ z = (z < 0.0f) ? 0.0f : (z > 1.0f) ? 1.0f : z; // clamp to [0..1]
+ int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1));
+ ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
+ if (i<n_tokens) {
+ ggml_set_i32_1d(tokens_input, i, token);
+ }
+ }
+}
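The training data is synthetic: position i of an example is the vocabulary bucket hit by a sine wave, so the model only has to learn to continue the wave. A standalone mirror of that mapping with randomness = 0 (target_token is hypothetical, not part of the patch):

    #include <cmath>
    static int target_token(int example_id, int i, int n_tokens, int n_vocab) {
        const float x = example_id + i*3.14159f*2.0f*0.5f/n_tokens; // ~half a sine period over the window
        const float z = (sinf(x) + 1.0f)*0.5f;                      // scale to [0,1]
        int token = 1 + (int)(z*(float)(n_vocab - 1));
        if (token < 1)           token = 1;
        if (token > n_vocab - 1) token = n_vocab - 1;
        return token;
    }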
+
+void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+ GGML_ASSERT(tokens_input->n_dims == 2);
+ GGML_ASSERT( targets->n_dims == 3);
+ int n_tokens = tokens_input->ne[0];
+ int n_batch = tokens_input->ne[1];
+ GGML_ASSERT(n_tokens == targets->ne[1]);
+ GGML_ASSERT(n_batch == targets->ne[2]);
+
+ for (int k=0; k<n_batch; ++k) {
+ struct ggml_tensor * tokens_input_k = ggml_view_1d(ctx,
+ tokens_input,
+ tokens_input->ne[0],
+ k*tokens_input->nb[1]);
+ struct ggml_tensor * targets_k = ggml_view_2d(ctx,
+ targets,
+ targets->ne[0],
+ targets->ne[1],
+ targets->nb[1],
+ k*targets->nb[2]);
+ get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k);
+ }
+}
+
+void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
+ int n_tokens = tokens_input->ne[0];
+ int n_vocab = targets->ne[0];
+ for (int i=0; i<n_tokens-n_shift; ++i) {
+ ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
+ for (int k=0; k<n_vocab; ++k) {
+ ggml_set_f32_1d(targets, i*n_vocab + k, ggml_get_f32_1d(targets, (i + n_shift)*n_vocab + k));
+ }
+ }
+}
+
+struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+ // todo: instead of a-b: a[1:]-b[:-1]
+ return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
+}
+
+struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+ const float eps = 1e-3;
+ return
+ ggml_sum(ctx,
+ ggml_neg(ctx,
+ ggml_sum_rows(ctx,
+ ggml_mul(ctx,
+ ggml_soft_max(ctx, a),
+ ggml_log(ctx,
+ ggml_add1(ctx,
+ ggml_soft_max(ctx, b),
+ ggml_new_f32(ctx, eps)))))));
+}
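Per token position this builds loss_col = -sum_k softmax(a)_k * log(softmax(b)_k + eps), and the outer ggml_sum adds the per-column losses; eps = 1e-3 keeps the log finite when softmax(b)_k underflows to zero.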
+
+int main(int argc, char ** argv) {
+ if (argc < 1) {
+ fprintf(stderr, "usage: %s\n", argv[0]);
+
+ return 1;
+ }
+
+ struct ggml_init_params lcparams;
+ lcparams.mem_size = 1024ll*1024ll*1024ll;
+ lcparams.mem_buffer = NULL;
+ lcparams.no_alloc = false;
+
+ struct llama_model model;
+ model.hparams.n_vocab = 8;
+ model.hparams.n_ctx = 8;
+ model.hparams.n_embd = 32;
+ model.hparams.n_mult = 2;
+ model.hparams.n_head = 8;
+ model.hparams.n_layer = 1;
+ model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head);
+
+ // model.hparams.n_embd = 32;
+ // model.hparams.n_mult = 2;
+ // model.hparams.n_head = 4;
+ // model.hparams.n_layer = 8;
+ // model.hparams.n_rot = 8;
+
+ model.ctx = ggml_init(lcparams);
+ printf("init model\n");
+ init_model(&model);
+ set_param_model(&model);
+
+ randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
+
+/*
+ struct llama_model_lora model_lora;
+ // model.hparams.n_vocab = 6;
+ // model.hparams.n_ctx = 64;
+ // model.hparams.n_embd = 128;
+ // model.hparams.n_mult = 2;
+ // model.hparams.n_head = 8;
+ // model.hparams.n_layer = 6;
+ // model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;
+
+ model_lora.hparams.n_vocab = 16;
+ model_lora.hparams.n_ctx = 32;
+ model_lora.hparams.n_embd = 256;
+ model_lora.hparams.n_mult = 2;
+ model_lora.hparams.n_head = 16;
+ model_lora.hparams.n_layer = 1;
+ model_lora.hparams.n_lora = 64;
+ model_lora.hparams.n_rot = MIN(16, model_lora.hparams.n_embd / model_lora.hparams.n_head);
+ // model.hparams.n_rot = (model.hparams.n_embd / model.hparams.n_head) / 2;
+
+ // model.hparams.n_embd = 32;
+ // model.hparams.n_mult = 2;
+ // model.hparams.n_head = 4;
+ // model.hparams.n_layer = 8;
+ // model.hparams.n_rot = 8;
+
+ model_lora.ctx = ggml_init(lcparams);
+ printf("init model_lora\n");
+ init_model_lora(&model_lora);
+ set_param_model_lora(&model_lora);
+
+ randomize_model_lora(&model_lora, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
+*/
+ int n_batch = 8;
+ // key + value cache for the self attention
+ struct llama_kv_cache kv_self;
+ printf("init_kv_cache\n");
+ kv_self.ctx = model.ctx;
+ init_kv_cache(&kv_self, &model, n_batch);
+ //init_kv_cache_lora(&kv_self, &model_lora);
+
+ size_t compute_size = 1024ll*1024ll*1024ll;
+ uint8_t * compute_addr = new uint8_t[compute_size];
+
+ int n_examples = 256;
+ int n_tokens = model.hparams.n_ctx;
+ int n_vocab = model.hparams.n_vocab;
+
+ for (int ex=0; ex<n_examples; ++ex) {
+ struct ggml_init_params params = {
+ /*.mem_size =*/ compute_size,
+ /*.mem_buffer =*/ compute_addr,
+ /*.no_alloc =*/ false,
+ };
+
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
+ struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
+ struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
+ struct ggml_tensor * targets = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
+
+ int n_past = 0;
+
+ ggml_cgraph gf = {};
+ gf.n_threads = 1;
+
+ get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
+
+ struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
+ // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
+ struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
+
+ ggml_build_forward_expand(&gf, e);
+ ggml_graph_compute(ctx0, &gf);
+
+ float error_before_opt = ggml_get_f32_1d(e, 0);
+
+ struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
+ struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
+ opt_params_adam.print_forward_graph = false;
+ opt_params_adam.print_backward_graph = false;
+ opt_params_lbfgs.print_forward_graph = false;
+ opt_params_lbfgs.print_backward_graph = false;
+ opt_params_adam.adam.n_iter = 16;
+ opt_params_lbfgs.lbfgs.n_iter = 16;
+ // ggml_opt(ctx0, opt_params_adam, e);
+ ggml_opt(ctx0, opt_params_lbfgs, e);
+ //
+ ggml_build_forward_expand(&gf, e);
+ ggml_graph_compute(ctx0, &gf);
+
+ float error_after_opt = ggml_get_f32_1d(e, 0);
+
+ if (ex % 8 == 0) {
+ printf("Example %d\n", (ex+1));
+ printf("error_before_opt: %.2f\n", error_before_opt);
+ printf("error_after_opt: %.2f\n", error_after_opt);
+ }
+
+ if (ex % 64 == 0) {
+ sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
+ // printf("probabilities after optimization:\n");
+ // print_matrix(after_opt_probs);
+ printf("best samples after optimization:\n");
+ print_tokens(after_opt_best_samples, n_vocab);
+ }
+
+ ggml_free(ctx0);
+ }
+
+ {
+ int n_gen = 128;
+ int sample_ctx = n_tokens-n_tokens/8;
+
+ printf("Generating %d tokens.\n", n_gen);
+
+ struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * targets = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
+
+ get_example_targets(137, tokens_input, targets);
+ for (int i=sample_ctx; i<n_tokens; ++i) {
+ ggml_set_i32_1d(tokens_input, i, n_vocab/2);
+ }
+
+ for (int i=0; i<sample_ctx-1; ++i) {
+ print_token(ggml_get_i32_1d(tokens_input, i), n_vocab);
+ }
+ printf("---\n");
+ for (int i=0; i<n_gen; ++i) {
+ struct ggml_init_params params = {
+ /*.mem_size =*/ compute_size,
+ /*.mem_buffer =*/ compute_addr,
+ /*.no_alloc =*/ false,
+ };
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ ggml_cgraph gf = {};
+ gf.n_threads = 1;
+
+ int n_past = 0;
+ struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
+
+ ggml_build_forward_expand(&gf, logits);
+ ggml_graph_compute(ctx0, &gf);
+
+ struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
+ struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
+
+ sample_softmax(logits, probs, best_samples);
+
+ // int sample_at = n_tokens-1;
+ int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
+
+ // print_row(probs, sample_at);
+ print_token(token, n_vocab);
+
+ lshift_examples(tokens_input, targets, 1);
+ ggml_set_i32_1d(tokens_input, 0, 0);
+ ggml_set_i32_1d(tokens_input, sample_ctx-1, token);
+
+ ggml_free(ctx0);
+ }
+ }
+
+ print_matrix(model.tok_embeddings);
+
+ printf("done\n");
+ // ggml_free(kv_self.ctx);
+ // ggml_free(model_lora.ctx);
+ ggml_free(model.ctx);
+ return 0;
+}
diff --git a/ggml.c b/ggml.c
index b42ca03..675eb0d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1931,6 +1931,7 @@ inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) {
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
@@ -3064,6 +3065,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
@@ -3105,12 +3107,12 @@ inline static float ggml_silu_f32(float x) {
return x/(1.0f + expf(-x));
}
-inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
- const uint16_t * i16 = (const uint16_t *) x;
- for (int i = 0; i < n; ++i) {
- y[i] = table_silu_f16[i16[i]];
- }
-}
+//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+// const uint16_t * i16 = (const uint16_t *) x;
+// for (int i = 0; i < n; ++i) {
+// y[i] = table_silu_f16[i16[i]];
+// }
+//}
#ifdef GGML_SILU_FP16
inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
@@ -3129,6 +3131,29 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
}
#endif
+inline static float ggml_silu_backward_f32(float x, float dy) {
+ const float s = 1.0f/(1.0f + expf(-x));
+ return dy*s*(1.0f + x*(1.0f - s));
+}
+
+#ifdef GGML_SILU_FP16
+inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
+ for (int i = 0; i < n; ++i) {
+ // the forward silu did not use x[i] directly but its f16-rounded equivalent,
+ // so take the derivative at the f16 of x[i]:
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+ float usedx = GGML_FP16_TO_FP32(fp16);
+ dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
+ }
+}
+#else
+inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
+ for (int i = 0; i < n; ++i) {
+ dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
+ }
+}
+#endif
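
For reference, the factor in ggml_silu_backward_f32 is just the derivative of silu(x) = x·σ(x); a short derivation added here for clarity (not part of the patch):

```latex
\sigma(x) = \frac{1}{1+e^{-x}}, \qquad \operatorname{silu}(x) = x\,\sigma(x),
\qquad
\frac{d}{dx}\operatorname{silu}(x)
  = \sigma(x) + x\,\sigma(x)\bigl(1-\sigma(x)\bigr)
  = \sigma(x)\bigl(1 + x\,(1-\sigma(x))\bigr).
```

With s = σ(x) this gives dx = dy·s·(1 + x·(1 − s)), which is exactly the expression in the code.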
+
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
ggml_float sum = 0.0;
@@ -3260,12 +3285,16 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"DUP",
"ADD",
+ "ADD1",
+ "ACC",
"SUB",
"MUL",
"DIV",
"SQR",
"SQRT",
+ "LOG",
"SUM",
+ "SUM_ROWS",
"MEAN",
"REPEAT",
"ABS",
@@ -3275,12 +3304,15 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"RELU",
"GELU",
"SILU",
+ "SILU_BACK",
"NORM",
"RMS_NORM",
+ "RMS_NORM_BACK",
"MUL_MAT",
"SCALE",
+ "SET",
"CPY",
"CONT",
"RESHAPE",
@@ -3288,9 +3320,13 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"PERMUTE",
"TRANSPOSE",
"GET_ROWS",
+ "GET_ROWS_BACK",
+ "DIAG",
"DIAG_MASK_INF",
+ "DIAG_MASK_ZERO",
"SOFT_MAX",
"ROPE",
+ "ROPE_BACK",
"ALIBI",
"CONV_1D_1S",
"CONV_1D_2S",
@@ -3302,19 +3338,23 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"MAP_BINARY",
};
-static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
+static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"x",
"x+y",
+ "x+y",
+ "view(x,nb,offset)+=y->x",
"x-y",
"x*y",
"x/y",
"x^2",
"√x",
+ "log(x)",
"Σx",
+ "Σx_k",
"Σx/n",
"repeat(x)",
"abs(x)",
@@ -3324,12 +3364,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"relu(x)",
"gelu(x)",
"silu(x)",
+ "silu_back(x)",
"norm(x)",
"rms_norm(x)",
+ "rms_norm_back(x)",
"X*Y",
"x*v",
+ "y-\\>view(x)",
"x-\\>y",
"cont(x)",
"reshape(x)",
@@ -3337,9 +3380,13 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"permute(x)",
"transpose(x)",
"get_rows(x)",
+ "get_rows_back(x)",
+ "diag(x)",
"diag_mask_inf(x)",
+ "diag_mask_zero(x)",
"soft_max(x)",
"rope(x)",
+ "rope_back(x)",
"alibi(x)",
"conv_1d_1s(x)",
"conv_1d_2s(x)",
@@ -3351,7 +3398,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"f(x,y)",
};
-static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
+static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3589,9 +3636,9 @@ static inline int ggml_up32(int n) {
return (n + 31) & ~31;
}
-static inline int ggml_up64(int n) {
- return (n + 63) & ~63;
-}
+//static inline int ggml_up64(int n) {
+// return (n + 63) & ~63;
+//}
static inline int ggml_up(int n, int m) {
// assert m is a power of 2
@@ -4301,6 +4348,107 @@ struct ggml_tensor * ggml_add_inplace(
return ggml_add_impl(ctx, a, b, true);
}
+// ggml_add1
+
+struct ggml_tensor * ggml_add1_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ bool inplace) {
+ GGML_ASSERT(ggml_is_scalar(b));
+ GGML_ASSERT(ggml_is_padded_1d(a));
+
+ bool is_node = false;
+
+ if (!inplace && (a->grad || b->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_ADD1;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = b;
+
+ return result;
+}
+
+struct ggml_tensor * ggml_add1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b) {
+ return ggml_add1_impl(ctx, a, b, false);
+}
+
+struct ggml_tensor * ggml_add1_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b) {
+ return ggml_add1_impl(ctx, a, b, true);
+}
+
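
A minimal usage sketch for the new ggml_add1 op (my own example, not part of the commit; the context size and shapes are arbitrary). It builds a tiny graph that adds the scalar b to every element of x:

```c
// Hypothetical standalone check of ggml_add1 (illustrative sketch only).
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // x has 4 elements, b is the scalar 3.0f
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_f32(ctx, 3.0f);
    ggml_set_f32(x, 1.0f);

    struct ggml_tensor * y = ggml_add1(ctx, x, b);

    struct ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute(ctx, &gf);

    // expect 4.0f in every element
    for (int i = 0; i < 4; ++i) {
        printf("%f\n", ggml_get_f32_1d(y, i));
    }

    ggml_free(ctx);
    return 0;
}
```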
+// ggml_acc
+
+struct ggml_tensor * ggml_acc_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset,
+ bool inplace) {
+ GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a));
+ GGML_ASSERT(ggml_is_contiguous(a));
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
+ GGML_ASSERT(b->type == GGML_TYPE_F32);
+
+ bool is_node = false;
+
+ if (!inplace && (a->grad || b->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
+ ((int32_t *) c->data)[0] = nb1;
+ ((int32_t *) c->data)[1] = nb2;
+ ((int32_t *) c->data)[2] = nb3;
+ ((int32_t *) c->data)[3] = offset;
+ ((int32_t *) c->data)[4] = inplace ? 1 : 0;
+
+ result->op = GGML_OP_ACC;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = b;
+ result->opt[0] = c;
+
+ return result;
+}
+
+struct ggml_tensor * ggml_acc(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset) {
+ return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
+}
+
+struct ggml_tensor * ggml_acc_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset) {
+ return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
+}
+
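
The intent of GGML_OP_ACC, as I read it, is "copy a, then add b into the sub-view of the copy described by (nb1, nb2, nb3, offset)", mirroring what ggml_compute_forward_acc_f32 further down does. A plain-C reference of the 2-D case, written as a sketch for intuition rather than taken from ggml:

```c
// Plain-C reference for the F32 acc semantics (sketch, not the ggml kernel).
#include <stdio.h>
#include <string.h>

static void acc_ref_2d(float * dst, const float * a, size_t a_bytes,
                       const float * b, int b_ne0, int b_ne1,
                       size_t nb1,      /* row stride of the view, in bytes  */
                       size_t offset) { /* start of the view, in bytes       */
    memcpy(dst, a, a_bytes);                     // dst starts as a copy of a
    for (int i1 = 0; i1 < b_ne1; ++i1) {
        float * row = (float *)((char *) dst + offset + i1*nb1);
        for (int i0 = 0; i0 < b_ne0; ++i0) {
            row[i0] += b[i1*b_ne0 + i0];         // accumulate b into the view
        }
    }
}

int main(void) {
    float a[4*4] = {0};                          // 4x4 matrix of zeros
    float b[2*2] = {1, 2, 3, 4};                 // 2x2 block to accumulate
    float dst[4*4];
    // place the 2x2 block one row down, one column in:
    acc_ref_2d(dst, a, sizeof(a), b, 2, 2,
               /*nb1   =*/ 4*sizeof(float),
               /*offset=*/ (4 + 1)*sizeof(float));
    for (int i = 0; i < 16; ++i) {
        printf("%g%c", dst[i], (i % 4 == 3) ? '\n' : ' ');
    }
    return 0;
}
```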
// ggml_sub
struct ggml_tensor * ggml_sub_impl(
@@ -4494,6 +4642,41 @@ struct ggml_tensor * ggml_sqrt_inplace(
return ggml_sqrt_impl(ctx, a, true);
}
+
+// ggml_log
+
+struct ggml_tensor * ggml_log_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ bool inplace) {
+ bool is_node = false;
+
+ if (!inplace && (a->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_LOG;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ return result;
+}
+
+struct ggml_tensor * ggml_log(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_log_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_log_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_log_impl(ctx, a, true);
+}
+
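
ggml_log gets a gradient here presumably for the log terms in the example's loss; assuming the usual calculus, the backward it has to support is just an element-wise division of the incoming gradient by a:

```latex
y = \log(a) \;\Rightarrow\;
\frac{\partial L}{\partial a_i} = \frac{1}{a_i}\,\frac{\partial L}{\partial y_i},
```

which ggml can already express with ggml_div.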
// ggml_sum
struct ggml_tensor * ggml_sum(
@@ -4515,6 +4698,33 @@ struct ggml_tensor * ggml_sum(
return result;
}
+
+// ggml_sum_rows
+
+struct ggml_tensor * ggml_sum_rows(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ bool is_node = false;
+
+ if (a->grad) {
+ is_node = true;
+ }
+
+ int64_t ne[4] = {1,1,1,1};
+ for (int i=1; i<a->n_dims; ++i) {
+ ne[i] = a->ne[i];
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne);
+
+ result->op = GGML_OP_SUM_ROWS;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ return result;
+}
+
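
ggml_sum_rows keeps the higher dimensions and collapses only the first one, so a tensor of shape (ne0, ne1, ne2, ne3) produces a result of shape (1, ne1, ne2, ne3) with

```latex
\mathrm{dst}[0, i_1, i_2, i_3] \;=\; \sum_{i_0=0}^{ne_0-1} a[i_0, i_1, i_2, i_3].
```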
// ggml_mean
struct ggml_tensor * ggml_mean(
@@ -4805,6 +5015,29 @@ struct ggml_tensor * ggml_silu_inplace(
return ggml_silu_impl(ctx, a, true);
}
+// ggml_silu_back
+
+struct ggml_tensor * ggml_silu_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b) {
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_SILU_BACK;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = b;
+
+ return result;
+}
+
// ggml_norm
struct ggml_tensor * ggml_norm_impl(
@@ -4847,7 +5080,6 @@ struct ggml_tensor * ggml_rms_norm_impl(
bool is_node = false;
if (!inplace && (a->grad)) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
@@ -4873,6 +5105,28 @@ struct ggml_tensor * ggml_rms_norm_inplace(
return ggml_rms_norm_impl(ctx, a, true);
}
+struct ggml_tensor * ggml_rms_norm_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b) {
+ bool is_node = false;
+
+ if (a->grad) {
+ // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_RMS_NORM_BACK;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = b;
+
+ return result;
+}
+
+
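
rms_norm_back has to produce the gradient of y = x / rms(x) with rms(x) = sqrt(mean(x²) + eps). Ignoring eps, one way to write it per row of length n (a derivation added for clarity; it is what the new GGML_OP_RMS_NORM_BACK op is meant to compute):

```latex
r = \sqrt{\tfrac{1}{n}\sum_k x_k^2}, \qquad y_i = \frac{x_i}{r},
\qquad
\frac{\partial L}{\partial x_j}
  = \frac{1}{r}\,\frac{\partial L}{\partial y_j}
  - \frac{x_j}{n\,r^3}\sum_i \frac{\partial L}{\partial y_i}\, x_i .
```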
// ggml_mul_mat
struct ggml_tensor * ggml_mul_mat(
@@ -4912,13 +5166,10 @@ struct ggml_tensor * ggml_scale_impl(
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
- // TODO: when implement backward, fix this:
- //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
- struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_SCALE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4942,6 +5193,100 @@ struct ggml_tensor * ggml_scale_inplace(
return ggml_scale_impl(ctx, a, b, true);
}
+// ggml_set
+
+struct ggml_tensor * ggml_set_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset,
+ bool inplace) {
+ GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b));
+
+ bool is_node = false;
+
+ if (!inplace && (a->grad || b->grad)) {
+ is_node = true;
+ }
+
+ // make a view of the destination
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5);
+ (( int32_t * ) c->data)[0] = nb1;
+ (( int32_t * ) c->data)[1] = nb2;
+ (( int32_t * ) c->data)[2] = nb3;
+ (( int32_t * ) c->data)[3] = offset;
+ (( int32_t * ) c->data)[4] = inplace ? 1 : 0;
+
+ result->op = GGML_OP_SET;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = b;
+ result->opt[0] = c;
+
+ return result;
+}
+
+struct ggml_tensor * ggml_set(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset) {
+ return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
+}
+
+struct ggml_tensor * ggml_set_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset) {
+ return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
+}
+
+struct ggml_tensor * ggml_set_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset) {
+ return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false);
+}
+
+struct ggml_tensor * ggml_set_1d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset) {
+ return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true);
+}
+
+struct ggml_tensor * ggml_set_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset) {
+ return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
+}
+
+struct ggml_tensor * ggml_set_2d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset) {
+ return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
+}
+
+
// ggml_cpy
struct ggml_tensor * ggml_cpy_impl(
@@ -4954,7 +5299,6 @@ struct ggml_tensor * ggml_cpy_impl(
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
@@ -4992,7 +5336,6 @@ struct ggml_tensor * ggml_cont_impl(
bool is_node = false;
if (!inplace && a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
@@ -5030,11 +5373,15 @@ struct ggml_tensor * ggml_reshape(
bool is_node = false;
- if (a->grad || b->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ if (a->grad) {
is_node = true;
}
+ if (b->grad) {
+ // gradient propagation is not supported
+ //GGML_ASSERT(false);
+ }
+
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
result->op = GGML_OP_RESHAPE;
@@ -5045,6 +5392,30 @@ struct ggml_tensor * ggml_reshape(
return result;
}
+struct ggml_tensor * ggml_reshape_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0) {
+ GGML_ASSERT(ggml_is_contiguous(a));
+ GGML_ASSERT(ggml_nelements(a) == ne0);
+
+ bool is_node = false;
+
+ if (a->grad) {
+ is_node = true;
+ }
+
+ const int64_t ne[1] = { ne0 };
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+
+ result->op = GGML_OP_RESHAPE;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ return result;
+}
+
struct ggml_tensor * ggml_reshape_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -5056,7 +5427,6 @@ struct ggml_tensor * ggml_reshape_2d(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
@@ -5083,7 +5453,6 @@ struct ggml_tensor * ggml_reshape_3d(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
@@ -5098,6 +5467,34 @@ struct ggml_tensor * ggml_reshape_3d(
return result;
}
+
+struct ggml_tensor * ggml_reshape_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3) {
+ GGML_ASSERT(ggml_is_contiguous(a));
+ GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
+
+ bool is_node = false;
+
+ if (a->grad) {
+ is_node = true;
+ }
+
+ const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+
+ result->op = GGML_OP_RESHAPE;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ return result;
+}
+
// ggml_view_1d
struct ggml_tensor * ggml_view_1d(
@@ -5105,16 +5502,23 @@ struct ggml_tensor * ggml_view_1d(
struct ggml_tensor * a,
int64_t ne0,
size_t offset) {
+
+ bool is_node = false;
+
if (a->grad) {
- GGML_ASSERT(false); // gradient propagation is not supported
+ is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
result->op = GGML_OP_VIEW;
- result->grad = NULL;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
- result->src1 = NULL; // TODO: maybe store the offset here?
+ result->src1 = NULL;
+
+ if (is_node) {
+ memcpy(result->padding, &offset, sizeof(offset));
+ }
return result;
}
@@ -5128,8 +5532,11 @@ struct ggml_tensor * ggml_view_2d(
int64_t ne1,
size_t nb1,
size_t offset) {
+
+ bool is_node = false;
+
if (a->grad) {
- GGML_ASSERT(false); // gradient propagation is not supported
+ is_node = true;
}
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
@@ -5141,9 +5548,13 @@ struct ggml_tensor * ggml_view_2d(
result->nb[3] = result->nb[2];
result->op = GGML_OP_VIEW;
- result->grad = NULL;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
- result->src1 = NULL; // TODO: maybe store the offset here?
+ result->src1 = NULL;
+
+ if (is_node) {
+ memcpy(result->padding, &offset, sizeof(offset));
+ }
return result;
}
@@ -5159,8 +5570,11 @@ struct ggml_tensor * ggml_view_3d(
size_t nb1,
size_t nb2,
size_t offset) {
+
+ bool is_node = false;
+
if (a->grad) {
- GGML_ASSERT(false); // gradient propagation is not supported
+ is_node = true;
}
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
@@ -5172,9 +5586,53 @@ struct ggml_tensor * ggml_view_3d(
result->nb[3] = result->nb[2]*ne2;
result->op = GGML_OP_VIEW;
- result->grad = NULL;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ if (is_node) {
+ memcpy(result->padding, &offset, sizeof(offset));
+ }
+
+ return result;
+}
+
+// ggml_view_4d
+
+struct ggml_tensor * ggml_view_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset) {
+
+ bool is_node = false;
+
+ if (a->grad) {
+ is_node = true;
+ }
+
+ const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
+
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+
+ result->nb[1] = nb1;
+ result->nb[2] = nb2;
+ result->nb[3] = nb3;
+
+ result->op = GGML_OP_VIEW;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
- result->src1 = NULL; // TODO: maybe store the offset here?
+ result->src1 = NULL;
+
+ if (is_node) {
+ memcpy(result->padding, &offset, sizeof(offset));
+ }
return result;
}
@@ -5203,7 +5661,6 @@ struct ggml_tensor * ggml_permute(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
@@ -5235,7 +5692,14 @@ struct ggml_tensor * ggml_permute(
result->op = GGML_OP_PERMUTE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
- result->src1 = NULL; // TODO: maybe store the permutation here?
+ result->src1 = NULL;
+
+ if (is_node) {
+ result->padding[0] = axis0;
+ result->padding[1] = axis1;
+ result->padding[2] = axis2;
+ result->padding[3] = axis3;
+ }
return result;
}
@@ -5248,7 +5712,6 @@ struct ggml_tensor * ggml_transpose(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
@@ -5279,7 +5742,6 @@ struct ggml_tensor * ggml_get_rows(
bool is_node = false;
if (a->grad || b->grad) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
@@ -5295,26 +5757,120 @@ struct ggml_tensor * ggml_get_rows(
return result;
}
+// ggml_get_rows_back
+
+struct ggml_tensor * ggml_get_rows_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c) {
+ GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
+
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ is_node = true;
+ }
+
+ // TODO: implement non F32 return
+ //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
+ struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
+
+ result->op = GGML_OP_GET_ROWS_BACK;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = b;
+ result->opt[0] = c;
+
+ return result;
+}
+
+// ggml_diag
+
+struct ggml_tensor * ggml_diag(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ GGML_ASSERT(a->ne[1] == 1);
+ bool is_node = false;
+
+ if (a->grad) {
+ is_node = true;
+ }
+
+ const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne);
+
+ result->op = GGML_OP_DIAG;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ return result;
+}
+
+
// ggml_diag_mask_inf
+struct ggml_tensor * ggml_diag_mask_inf_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ bool inplace) {
+ bool is_node = false;
+
+ if (a->grad) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ ((int32_t *) b->data)[0] = n_past;
+ ((int32_t *) b->data)[1] = inplace ? 1 : 0;
+
+ result->op = GGML_OP_DIAG_MASK_INF;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = b;
+
+ return result;
+}
+
struct ggml_tensor * ggml_diag_mask_inf(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past) {
+ return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
+}
+
+
+struct ggml_tensor * ggml_diag_mask_inf_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past) {
+ return ggml_diag_mask_inf_impl(ctx, a, n_past, true);
+}
+
+// ggml_diag_mask_zero
+
+struct ggml_tensor * ggml_diag_mask_zero_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ bool inplace) {
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
- // TODO: when implement backward, fix this:
- //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
- struct ggml_tensor * result = ggml_view_tensor(ctx, a);
- struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
- ggml_set_name(b, "n_past");
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ ggml_set_name(b, "n_past, inplace");
+ ((int32_t *) b->data)[0] = n_past;
+ ((int32_t *) b->data)[1] = inplace ? 1 : 0;
- result->op = GGML_OP_DIAG_MASK_INF;
+ result->op = GGML_OP_DIAG_MASK_ZERO;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = b;
@@ -5322,21 +5878,33 @@ struct ggml_tensor * ggml_diag_mask_inf(
return result;
}
+struct ggml_tensor * ggml_diag_mask_zero(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past) {
+ return ggml_diag_mask_zero_impl(ctx, a, n_past, false);
+}
+
+struct ggml_tensor * ggml_diag_mask_zero_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past) {
+ return ggml_diag_mask_zero_impl(ctx, a, n_past, true);
+}
+
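
Both mask variants implement the same causal pattern and differ only in the fill value: -INF is the usual attention-score mask, while the zero variant presumably exists for paths where gradients must keep flowing. A toy standalone illustration of the pattern as I understand it (the exact indexing is my assumption, not lifted from the kernel):

```c
// Prints the causal pattern produced by diag_mask_inf / diag_mask_zero
// on a 4x6 matrix with n_past = 2 ('.' = kept, 'x' = masked).
// Illustrative sketch only, not the ggml kernel.
#include <stdio.h>

int main(void) {
    const int n_past = 2;
    const int nrows  = 4;   // one row per query position
    const int ncols  = 6;   // one column per key position (n_past + nrows)

    for (int r = 0; r < nrows; ++r) {
        for (int c = 0; c < ncols; ++c) {
            // entries "in the future" of position n_past + r get masked
            putchar(c > n_past + r ? 'x' : '.');
        }
        putchar('\n');
    }
    return 0;
}
```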
// ggml_soft_max
-struct ggml_tensor * ggml_soft_max(
+struct ggml_tensor * ggml_soft_max_impl(
struct ggml_context * ctx,
- struct ggml_tensor * a) {
+ struct ggml_tensor * a,
+ bool inplace) {
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
- // TODO: when implement backward, fix this:
- //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
- struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_SOFT_MAX;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5346,14 +5914,75 @@ struct ggml_tensor * ggml_soft_max(
return result;
}
+struct ggml_tensor * ggml_soft_max(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_soft_max_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_soft_max_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_soft_max_impl(ctx, a, true);
+}
+
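
With the GGML_ASSERT(false) removed, soft_max now participates in the backward pass; the vector-Jacobian product it has to provide per row is the standard one, stated here for reference:

```latex
y = \operatorname{softmax}(x) \;\Rightarrow\;
\frac{\partial L}{\partial x_i}
  = y_i\left(\frac{\partial L}{\partial y_i} - \sum_j \frac{\partial L}{\partial y_j}\, y_j\right).
```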
// ggml_rope
+struct ggml_tensor * ggml_rope_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ bool inplace) {
+ GGML_ASSERT(n_past >= 0);
+ bool is_node = false;
+
+ if (!inplace && a->grad) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+ ((int32_t *) b->data)[0] = n_past;
+ ((int32_t *) b->data)[1] = n_dims;
+ ((int32_t *) b->data)[2] = mode;
+
+ result->op = GGML_OP_ROPE;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = b;
+
+ return result;
+}
+
struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
+}
+
+struct ggml_tensor * ggml_rope_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
+}
+
+// ggml_rope_back
+
+struct ggml_tensor * ggml_rope_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;
@@ -5362,9 +5991,7 @@ struct ggml_tensor * ggml_rope(
is_node = true;
}
- // TODO: when implement backward, fix this:
- //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
- struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
((int32_t *) b->data)[0] = n_past;
@@ -5372,7 +5999,7 @@ struct ggml_tensor * ggml_rope(
((int32_t *) b->data)[2] = mode;
ggml_set_name(b, "n_past, n_dims, mode");
- result->op = GGML_OP_ROPE;
+ result->op = GGML_OP_ROPE_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = b;
@@ -5626,6 +6253,38 @@ void ggml_set_param(
// ggml_compute_forward_dup
+static void ggml_compute_forward_dup_same_cont(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+ GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+ GGML_ASSERT(src0->type == dst->type);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb0 = dst->nb[0];
+
+ const int ith = params->ith; // thread index
+ const int nth = params->nth; // number of threads
+
+ // parallelize by elements
+ const int ne = ggml_nelements(dst);
+ const int dr = (ne + nth - 1) / nth;
+ const int ie0 = dr * ith;
+ const int ie1 = MIN(ie0 + dr, ne);
+
+ if (ie0 < ie1) {
+ memcpy(
+ ((char *) dst->data + ie0*nb0),
+ ((char *) src0->data + ie0*nb00),
+ (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+ }
+
+}
static void ggml_compute_forward_dup_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@@ -5660,17 +6319,7 @@ static void ggml_compute_forward_dup_f16(
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
- // parallelize by elements
- const int ne = ggml_nelements(dst);
- const int dr = (ne + nth - 1) / nth;
- const int ie0 = dr * ith;
- const int ie1 = MIN(ie0 + dr, ne);
-
- memcpy(
- ((char *) dst->data + ie0*nb0),
- ((char *) src0->data + ie0*nb00),
- (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
-
+ ggml_compute_forward_dup_same_cont(params, src0, dst);
return;
}
@@ -5959,17 +6608,7 @@ static void ggml_compute_forward_dup_f32(
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
- // parallelize by elements
- const int ne = ggml_nelements(dst);
- const int dr = (ne + nth - 1) / nth;
- const int ie0 = dr * ith;
- const int ie1 = MIN(ie0 + dr, ne);
-
- memcpy(
- ((char *) dst->data + ie0*nb0),
- ((char *) src0->data + ie0*nb00),
- (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
-
+ ggml_compute_forward_dup_same_cont(params, src0, dst);
return;
}
@@ -6224,6 +6863,10 @@ static void ggml_compute_forward_dup(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
+ ggml_compute_forward_dup_same_cont(params, src0, dst);
+ return;
+ }
switch (src0->type) {
case GGML_TYPE_F16:
{
@@ -6256,44 +6899,73 @@ static void ggml_compute_forward_add_f32(
const int ith = params->ith;
const int nth = params->nth;
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
const size_t nb10 = src1->nb[0];
const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
const size_t nb0 = dst->nb[0];
const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
if (nb10 == sizeof(float)) {
- for (int j = ith; j < n; j += nth) {
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+
#ifdef GGML_USE_ACCELERATE
vDSP_vadd(
- (float *) ((char *) src0->data + j*nb01), 1,
- (float *) ((char *) src1->data + j*nb11), 1,
- (float *) ((char *) dst->data + j*nb1), 1, nc);
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
+ ne0);
#else
- ggml_vec_add_f32(nc,
- (float *) ((char *) dst->data + j*nb1),
- (float *) ((char *) src0->data + j*nb01),
- (float *) ((char *) src1->data + j*nb11));
+ ggml_vec_add_f32(ne0,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
#endif
}
} else {
// src1 is not contiguous
- for (int j = ith; j < n; j += nth) {
- float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
- float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
- for (int i = 0; i < nc; i++) {
- float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
-
- dst_ptr[i] = src0_ptr[i] + *src1_ptr;
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
+ float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ for (int i0 = 0; i0 < ne0; i0++) {
+ float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+
+ dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
}
}
}
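
The dr / ir0 / ir1 pattern that replaces the old strided `j += nth` loop hands each thread one contiguous block of rows. A tiny standalone check of the arithmetic (illustrative values, not from the patch):

```c
// Shows how nr rows are split across nth threads with the dr/ir0/ir1
// scheme used above (ceiling divide, clamp the last block).
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int nr  = 10;  // total rows (arbitrary example)
    const int nth = 4;   // number of threads (arbitrary example)

    const int dr = (nr + nth - 1)/nth;   // rows per thread, rounded up

    for (int ith = 0; ith < nth; ++ith) {
        const int ir0 = dr*ith;              // first row for this thread
        const int ir1 = MIN(ir0 + dr, nr);   // one past the last row
        printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
    }
    return 0;
}
```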
@@ -6313,17 +6985,25 @@ static void ggml_compute_forward_add_f16_f32(
const int ith = params->ith;
const int nth = params->nth;
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
const size_t nb10 = src1->nb[0];
const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
const size_t nb0 = dst->nb[0];
const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -6332,13 +7012,26 @@ static void ggml_compute_forward_add_f16_f32(
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
if (nb10 == sizeof(float)) {
- for (int j = ith; j < n; j += nth) {
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
- ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
- for (int i = 0; i < nc; i++) {
- float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
- dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+ for (int i = 0; i < ne0; i++) {
+ dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
}
}
}
@@ -6362,32 +7055,53 @@ static void ggml_compute_forward_add_f16_f16(
const int ith = params->ith;
const int nth = params->nth;
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
const size_t nb10 = src1->nb[0];
const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
const size_t nb0 = dst->nb[0];
const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
if (nb10 == sizeof(ggml_fp16_t)) {
- for (int j = ith; j < n; j += nth) {
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
- ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
- for (int i = 0; i < nc; i++) {
- ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
- dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+ for (int i = 0; i < ne0; i++) {
+ dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i]));
}
}
}
@@ -6408,50 +7122,36 @@ static void ggml_compute_forward_add_q_f32(
return;
}
+ const int nr = ggml_nrows(src0);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
-
- //const int64_t ne10 = src1->ne[0];
- //const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
-
- //const int64_t ne0 = dst->ne[0];
- //const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
- const int64_t ne3 = dst->ne[3];
+ //const int64_t ne03 = src0->ne[3];
- const int nb00 = src0->nb[0];
- const int nb01 = src0->nb[1];
- const int nb02 = src0->nb[2];
- const int nb03 = src0->nb[3];
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
- const int nb10 = src1->nb[0];
- const int nb11 = src1->nb[1];
- const int nb12 = src1->nb[2];
- const int nb13 = src1->nb[3];
+ const size_t nb10 = src1->nb[0];
+ const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
- const int nb0 = dst->nb[0];
- const int nb1 = dst->nb[1];
- const int nb2 = dst->nb[2];
- const int nb3 = dst->nb[3];
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
const int ith = params->ith;
const int nth = params->nth;
- GGML_ASSERT(ne02 == ne12);
- GGML_ASSERT(ne03 == ne13);
- GGML_ASSERT(ne2 == ne12);
- GGML_ASSERT(ne3 == ne13);
-
const enum ggml_type type = src0->type;
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
// we don't support permuted src0 or src1
- GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
+ GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
GGML_ASSERT(nb10 == sizeof(float));
// dst cannot be transposed or permuted
@@ -6463,9 +7163,6 @@ static void ggml_compute_forward_add_q_f32(
GGML_ASSERT(dst->type == src0->type);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
- // total rows in src0
- const int nr = ne01*ne02*ne03;
-
// rows per thread
const int dr = (nr + nth - 1)/nth;
@@ -6542,6 +7239,428 @@ static void ggml_compute_forward_add(
}
}
+// ggml_compute_forward_add1
+
+static void ggml_compute_forward_add1_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_is_scalar(src1));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ GGML_ASSERT( nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+#ifdef GGML_USE_ACCELERATE
+ UNUSED(ggml_vec_add1_f32);
+
+ vDSP_vadd(
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
+ (float *) ((char *) src1->data), 0,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
+ ne0);
+#else
+ ggml_vec_add1_f32(ne0,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
+ *(float *) src1->data);
+#endif
+ }
+}
+
+static void ggml_compute_forward_add1_f16_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_is_scalar(src1));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // scalar to add
+ const float v = *(float *) src1->data;
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+ GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ for (int i = 0; i < ne0; i++) {
+ dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
+ }
+ }
+}
+
+static void ggml_compute_forward_add1_f16_f16(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_is_scalar(src1));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // scalar to add
+ const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+ GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ for (int i = 0; i < ne0; i++) {
+ dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
+ }
+ }
+}
+
+static void ggml_compute_forward_add1_q_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_is_scalar(src1));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // scalar to add
+ const float v = *(float *) src1->data;
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ const enum ggml_type type = src0->type;
+ dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
+ quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+
+ // we don't support permuted src0
+ GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+
+ // dst cannot be transposed or permuted
+ GGML_ASSERT(nb0 <= nb1);
+ GGML_ASSERT(nb1 <= nb2);
+ GGML_ASSERT(nb2 <= nb3);
+
+ GGML_ASSERT(ggml_is_quantized(src0->type));
+ GGML_ASSERT(dst->type == src0->type);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
+ void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+ assert(ne0 % 32 == 0);
+
+ // dequantize row from src0 to temp buffer
+ dequantize_row_q(src0_row, wdata, ne0);
+ // add src1
+ ggml_vec_acc1_f32(ne0, wdata, v);
+ // quantize row to dst
+ quantize_row_q(wdata, dst_row, ne0);
+ }
+}
+
+static void ggml_compute_forward_add1(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_add1_f32(params, src0, src1, dst);
+ } break;
+ case GGML_TYPE_F16:
+ {
+ if (src1->type == GGML_TYPE_F16) {
+ ggml_compute_forward_add1_f16_f16(params, src0, src1, dst);
+ }
+ else if (src1->type == GGML_TYPE_F32) {
+ ggml_compute_forward_add1_f16_f32(params, src0, src1, dst);
+ }
+ else {
+ GGML_ASSERT(false);
+ }
+ } break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q8_1:
+ {
+ ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
+
+// ggml_compute_forward_acc
+
+static void ggml_compute_forward_acc_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+ GGML_ASSERT(opt0->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_nelements(opt0) == 5);
+
+ // view src0 and dst with these strides and data offset in bytes during acc
+ // nb0 is implicitly element_size because src0 and dst are contiguous
+ size_t nb1 = ((int32_t *) opt0->data)[0];
+ size_t nb2 = ((int32_t *) opt0->data)[1];
+ size_t nb3 = ((int32_t *) opt0->data)[2];
+ size_t offset = ((int32_t *) opt0->data)[3];
+ bool inplace = (bool) ((int32_t *) opt0->data)[4];
+
+ if (!inplace && (params->type == GGML_TASK_INIT)) {
+ // memcpy needs to be synchronized across threads to avoid race conditions.
+ // => do it in INIT phase
+ memcpy(
+ ((char *) dst->data),
+ ((char *) src0->data),
+ ggml_nbytes(dst));
+ }
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src1);
+ const int nc = src1->ne[0];
+
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
+
+ const size_t nb10 = src1->nb[0];
+ const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
+
+ // src0 and dst as viewed during acc
+ const size_t nb0 = ggml_element_size(src0);
+
+ const size_t nb00 = nb0;
+ const size_t nb01 = nb1;
+ const size_t nb02 = nb2;
+ const size_t nb03 = nb3;
+
+ GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst));
+ GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0));
+
+ GGML_ASSERT(nb10 == sizeof(float));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0 and dst are viewed with shape of src1 and offset
+ // => same indices
+ const int i3 = ir/(ne12*ne11);
+ const int i2 = (ir - i3*ne12*ne11)/ne11;
+ const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
+
+#ifdef GGML_USE_ACCELERATE
+ vDSP_vadd(
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1,
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc);
+#else
+ ggml_vec_add_f32(nc,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset),
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset),
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+#endif
+ }
+}
+
+static void ggml_compute_forward_acc(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst);
+ } break;
+ case GGML_TYPE_F16:
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q8_1:
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
// ggml_compute_forward_sub
static void ggml_compute_forward_sub_f32(
@@ -6556,18 +7675,68 @@ static void ggml_compute_forward_sub_f32(
return;
}
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
- assert( dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
- assert(src1->nb[0] == sizeof(float));
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
- for (int i = 0; i < n; i++) {
- ggml_vec_sub_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])),
- (float *) ((char *) src1->data + i*(src1->nb[1])));
+ const size_t nb10 = src1->nb[0];
+ const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ GGML_ASSERT( nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ if (nb10 == sizeof(float)) {
+ for (int ir = 0; ir < nr; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+
+#ifdef GGML_USE_ACCELERATE
+ vDSP_vsub(
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
+ ne0);
+#else
+ ggml_vec_sub_f32(ne0,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+#endif
+ }
+ } else {
+ // src1 is not contiguous
+ for (int ir = 0; ir < nr; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
+ float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ for (int i0 = 0; i0 < ne0; i0++) {
+ float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+
+ dst_ptr[i0] = src0_ptr[i0] - *src1_ptr;
+ }
+ }
}
}
@@ -6602,18 +7771,70 @@ static void ggml_compute_forward_mul_f32(
return;
}
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
- assert( dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
- assert(src1->nb[0] == sizeof(float));
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
- for (int i = 0; i < n; i++) {
- ggml_vec_mul_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])),
- (float *) ((char *) src1->data + i*(src1->nb[1])));
+ const size_t nb10 = src1->nb[0];
+ const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ GGML_ASSERT( nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ if (nb10 == sizeof(float)) {
+ for (int ir = 0; ir < nr; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+
+#ifdef GGML_USE_ACCELERATE
+ UNUSED(ggml_vec_mul_f32);
+
+ vDSP_vmul(
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
+ ne0);
+#else
+ ggml_vec_mul_f32(ne0,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+#endif
+ }
+ } else {
+ // src1 is not contiguous
+ for (int ir = 0; ir < nr; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
+ float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ for (int i0 = 0; i0 < ne0; i0++) {
+ float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+
+ dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
+ }
+ }
}
}
@@ -6648,18 +7869,68 @@ static void ggml_compute_forward_div_f32(
return;
}
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);
+ const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
- assert( dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
- assert(src1->nb[0] == sizeof(float));
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
- for (int i = 0; i < n; i++) {
- ggml_vec_div_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])),
- (float *) ((char *) src1->data + i*(src1->nb[1])));
+ const size_t nb10 = src1->nb[0];
+ const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ GGML_ASSERT( nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ if (nb10 == sizeof(float)) {
+ for (int ir = 0; ir < nr; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+
+#ifdef GGML_USE_ACCELERATE
+ vDSP_vdiv(
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
+ ne0);
+#else
+ ggml_vec_div_f32(ne0,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
+ (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+#endif
+ }
+ } else {
+ // src1 is not contiguous
+ for (int ir = 0; ir < nr; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
+ float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ for (int i0 = 0; i0 < ne0; i0++) {
+ float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+
+ dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
+ }
+ }
}
}
@@ -6764,6 +8035,49 @@ static void ggml_compute_forward_sqrt(
}
}
+
+// ggml_compute_forward_log
+
+static void ggml_compute_forward_log_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(params->ith == 0);
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+ for (int i = 0; i < n; i++) {
+ ggml_vec_log_f32(nc,
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
+ }
+}
+
+static void ggml_compute_forward_log(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_log_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
// ggml_compute_forward_sum
static void ggml_compute_forward_sum_f32(
@@ -6821,6 +8135,73 @@ static void ggml_compute_forward_sum(
}
}
+// ggml_compute_forward_sum_rows
+
+static void ggml_compute_forward_sum_rows_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(params->ith == 0);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+ const int64_t ne3 = dst->ne[3];
+
+ GGML_ASSERT(ne0 == 1);
+ GGML_ASSERT(ne1 == ne01);
+ GGML_ASSERT(ne2 == ne02);
+ GGML_ASSERT(ne3 == ne03);
+
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
+
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ for (int64_t i3 = 0; i3 < ne03; i3++) {
+ for (int64_t i2 = 0; i2 < ne02; i2++) {
+ for (int64_t i1 = 0; i1 < ne01; i1++) {
+ float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+ float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
+ float row_sum = 0;
+ ggml_vec_sum_f32(ne00, &row_sum, src_row);
+ dst_row[0] = row_sum;
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_sum_rows(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_sum_rows_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
// ggml_compute_forward_mean
static void ggml_compute_forward_mean_f32(
@@ -6898,37 +8279,58 @@ static void ggml_compute_forward_repeat_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
- assert(params->ith == 0);
- assert(ggml_can_repeat(src0, dst));
+ GGML_ASSERT(params->ith == 0);
+ GGML_ASSERT(ggml_can_repeat(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
- // TODO: implement support for rank > 2 tensors
- assert(src0->ne[2] == 1);
- assert(src0->ne[3] == 1);
- assert( dst->ne[2] == 1);
- assert( dst->ne[3] == 1);
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+ const int64_t ne3 = dst->ne[3];
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
- const int nc = dst->ne[0];
- const int nr = dst->ne[1];
- const int nc0 = src0->ne[0];
- const int nr0 = src0->ne[1];
- const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat
- const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat
+ // guaranteed to be an integer due to the check in ggml_can_repeat
+ const int nr0 = (int)(ne0/ne00);
+ const int nr1 = (int)(ne1/ne01);
+ const int nr2 = (int)(ne2/ne02);
+ const int nr3 = (int)(ne3/ne03);
// TODO: support for transposed / permuted tensors
- assert( dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
// TODO: maybe this is not optimal?
- for (int i = 0; i < nrr; i++) {
- for (int j = 0; j < ncr; j++) {
- for (int k = 0; k < nr0; k++) {
- ggml_vec_cpy_f32(nc0,
- (float *) ((char *) dst->data + (i*nr0 + k)*( dst->nb[1]) + j*nc0*( dst->nb[0])),
- (float *) ((char *) src0->data + ( k)*(src0->nb[1])));
+ for (int i3 = 0; i3 < nr3; i3++) {
+ for (int k3 = 0; k3 < ne03; k3++) {
+ for (int i2 = 0; i2 < nr2; i2++) {
+ for (int k2 = 0; k2 < ne02; k2++) {
+ for (int i1 = 0; i1 < nr1; i1++) {
+ for (int k1 = 0; k1 < ne01; k1++) {
+ for (int i0 = 0; i0 < nr0; i0++) {
+ ggml_vec_cpy_f32(ne00,
+ (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0),
+ (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01));
+ }
+ }
+ }
+ }
}
}
}
@@ -7281,6 +8683,70 @@ static void ggml_compute_forward_silu(
}
+// ggml_compute_forward_silu_back
+
+static void ggml_compute_forward_silu_back_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * grad,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_is_contiguous(grad));
+ GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(dst));
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_are_same_shape(src0, grad));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ ggml_vec_silu_backward_f32(nc,
+ (float *) ((char *) dst->data + i1*( dst->nb[1])),
+ (float *) ((char *) src0->data + i1*(src0->nb[1])),
+ (float *) ((char *) grad->data + i1*(grad->nb[1])));
+
+#ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ UNUSED(x);
+ assert(!isnan(x));
+ assert(!isinf(x));
+ }
+#endif
+ }
+}
+
+static void ggml_compute_forward_silu_back(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * grad,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_silu_back_f32(params, src0, grad, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
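For reference, the per-element derivative that ggml_vec_silu_backward_f32 has to apply follows from silu(x) = x*sigmoid(x). A minimal scalar sketch (hypothetical helper, not part of this patch):

    #include <math.h>

    // dx = dy * d/dx[x*sigmoid(x)] = dy * s * (1 + x*(1 - s)),  with s = sigmoid(x)
    static inline float silu_backward_1(float x, float dy) {
        const float s = 1.0f/(1.0f + expf(-x));
        return dy*s*(1.0f + x*(1.0f - s));
    }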
// ggml_compute_forward_norm
static void ggml_compute_forward_norm_f32(
@@ -7435,6 +8901,195 @@ static void ggml_compute_forward_rms_norm(
}
+static void ggml_compute_forward_rms_norm_back_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
+
+ const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
+
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ const float eps = 1e-6f; // TODO: make this a parameter
+
+ // TODO: optimize
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+ // src1 is same shape as src0 => same indices
+ const int64_t i11 = i01;
+ const int64_t i12 = i02;
+ const int64_t i13 = i03;
+
+ const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+ const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
+
+ ggml_float sum_xx = 0.0;
+ ggml_float sum_xdz = 0.0;
+
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ sum_xx += (ggml_float)(x[i00] * x[i00]);
+ sum_xdz += (ggml_float)(x[i00] * dz[i00]);
+ }
+
+ //const float mean = (float)(sum_xx)/ne00;
+ const float mean_eps = (float)(sum_xx)/ne00 + eps;
+ const float sum_eps = (float)(sum_xx) + eps*ne00;
+ //const float mean_xdz = (float)(sum_xdz)/ne00;
+ // we could cache rms from forward pass to improve performance.
+ // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms.
+ //const float rms = sqrtf(mean_eps);
+ const float rrms = 1.0f / sqrtf(mean_eps);
+ //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3)
+
+ {
+ // z = rms_norm(x)
+ //
+ // rms_norm(src0) =
+ // scale(
+ // src0,
+ // div(
+ // 1,
+ // sqrt(
+ // add(
+ // scale(
+ // sum(
+ // sqr(
+ // src0)),
+ // (1.0/N)),
+ // eps))));
+
+ // postorder:
+ // ## op args grad
+ // 00 param src0 grad[#00]
+ // 01 const 1
+ // 02 sqr (#00) grad[#02]
+ // 03 sum (#02) grad[#03]
+ // 04 const 1/N
+ // 05 scale (#03, #04) grad[#05]
+ // 06 const eps
+ // 07 add (#05, #06) grad[#07]
+ // 08 sqrt (#07) grad[#08]
+ // 09 div (#01,#08) grad[#09]
+ // 10 scale (#00,#09) grad[#10]
+ //
+ // backward pass, given grad[#10]
+ // #10: scale
+ // grad[#00] += scale(grad[#10],#09)
+ // grad[#09] += sum(mul(grad[#10],#00))
+ // #09: div
+ // grad[#08] += neg(mul(grad[#09], div(#09,#08)))
+ // #08: sqrt
+ // grad[#07] += mul(grad[#08], div(0.5, #08))
+ // #07: add
+ // grad[#05] += grad[#07]
+ // #05: scale
+ // grad[#03] += scale(grad[#05],#04)
+ // #03: sum
+ // grad[#02] += repeat(grad[#03], #02)
+ // #02:
+ // grad[#00] += scale(mul(#00, grad[#02]), 2.0)
+ //
+ // substitute and simplify:
+ // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
+ // grad[#02] = repeat(grad[#03], #02)
+ // grad[#02] = repeat(scale(grad[#05],#04), #02)
+ // grad[#02] = repeat(scale(grad[#07],#04), #02)
+ // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02)
+ // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02)
+ // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02)
+ // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02)
+ // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02)
+ // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02)
+ // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)
+ // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
+ // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0)
+ // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0)
+ // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N)))
+ // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
+ // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
+ // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N))
+ // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps))
+ // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps)))
+ // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps))
+ // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps))
+ // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps))
+ // a = b*c + d*e
+ // a = b*c*f/f + d*e*f/f
+ // a = (b*c*f + d*e*f)*(1/f)
+ // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c))
+ // a = (b + d*e/c)*c
+ // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps)
+ // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms
+ // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms
+ // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms
+ // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms
+ // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms
+ // a = (dz + x*div(-mean_xdz,mean_eps))*rrms
+ // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms)
+ // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
+ // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
+ }
+ // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
+ // post-order:
+ // dx := x
+ // dx := scale(dx,-mean_xdz/mean_eps)
+ // dx := add(dx, dz)
+ // dx := scale(dx, rrms)
+ float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+ ggml_vec_cpy_f32 (ne00, dx, x);
+ // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
+ ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
+ ggml_vec_acc_f32 (ne00, dx, dz);
+ ggml_vec_scale_f32(ne00, dx, rrms);
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_rms_norm_back(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
+
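Collapsing the long comment derivation above into one formula (a summary, not additional code): with N = ne00 and mu = mean(x^2),

    \frac{\partial L}{\partial x} = \frac{1}{\sqrt{\mu+\varepsilon}} \Big( dz - x \cdot \frac{\langle x,\, dz\rangle}{N(\mu+\varepsilon)} \Big)

which is exactly dx = scale(dz + scale(x, -mean_xdz/mean_eps), rrms); the code evaluates the same ratio as -sum_xdz/sum_eps, since sum_eps = N*(mu+eps).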
// ggml_compute_forward_mul_mat
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
@@ -8137,8 +9792,17 @@ static void ggml_compute_forward_scale_f32(
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
+ const size_t nb01 = src0->nb[1];
+
+ const size_t nb1 = dst->nb[1];
+
+
for (int i1 = ir0; i1 < ir1; i1++) {
- ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v);
+ if (dst->data != src0->data) {
+ // src0 is same shape as dst => same indices
+ memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+ }
+ ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
}
}
@@ -8159,6 +9823,115 @@ static void ggml_compute_forward_scale(
}
}
+// ggml_compute_forward_set
+
+static void ggml_compute_forward_set_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+ GGML_ASSERT(opt0->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_nelements(opt0) == 5);
+
+ // view src0 and dst with these strides and data offset in bytes during set
+ // nb0 is implicitly element_size because src0 and dst are contiguous
+ size_t nb1 = ((int32_t *) opt0->data)[0];
+ size_t nb2 = ((int32_t *) opt0->data)[1];
+ size_t nb3 = ((int32_t *) opt0->data)[2];
+ size_t offset = ((int32_t *) opt0->data)[3];
+ bool inplace = (bool) ((int32_t *) opt0->data)[4];
+
+ if (!inplace && (params->type == GGML_TASK_INIT)) {
+ // memcpy needs to be synchronized across threads to avoid race conditions.
+ // => do it in INIT phase
+ memcpy(
+ ((char *) dst->data),
+ ((char *) src0->data),
+ ggml_nbytes(dst));
+ }
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src1);
+ const int nc = src1->ne[0];
+
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
+
+ const size_t nb10 = src1->nb[0];
+ const size_t nb11 = src1->nb[1];
+ const size_t nb12 = src1->nb[2];
+ const size_t nb13 = src1->nb[3];
+
+ // src0 and dst as viewed during set
+ const size_t nb0 = ggml_element_size(src0);
+
+ const int im0 = (ne10 == 0 ? 0 : ne10-1);
+ const int im1 = (ne11 == 0 ? 0 : ne11-1);
+ const int im2 = (ne12 == 0 ? 0 : ne12-1);
+ const int im3 = (ne13 == 0 ? 0 : ne13-1);
+
+ GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst));
+
+ GGML_ASSERT(nb10 == sizeof(float));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0 and dst are viewed with shape of src1 and offset
+ // => same indices
+ const int i3 = ir/(ne12*ne11);
+ const int i2 = (ir - i3*ne12*ne11)/ne11;
+ const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
+
+ ggml_vec_cpy_f32(nc,
+ (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset),
+ (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+ }
+}
+
+static void ggml_compute_forward_set(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_set_f32(params, src0, src1, opt0, dst);
+ } break;
+ case GGML_TYPE_F16:
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q8_1:
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
// ggml_compute_forward_cpy
static void ggml_compute_forward_cpy(
@@ -8353,22 +10126,210 @@ static void ggml_compute_forward_get_rows(
//}
}
-// ggml_compute_forward_diag_mask_inf
+// ggml_compute_forward_get_rows_back
-static void ggml_compute_forward_diag_mask_inf_f32(
+static void ggml_compute_forward_get_rows_back_f32_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(params->ith == 0);
+ GGML_ASSERT(ggml_are_same_shape(opt0, dst));
+ GGML_ASSERT(ggml_is_contiguous(opt0));
+ GGML_ASSERT(ggml_is_contiguous(dst));
+
+ ggml_compute_forward_dup_same_cont(params, opt0, dst);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int nc = src0->ne[0];
+ const int nr = ggml_nelements(src1);
+
+ GGML_ASSERT( dst->ne[0] == nc);
+ GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+ for (int i = 0; i < nr; ++i) {
+ const int r = ((int32_t *) src1->data)[i];
+
+ for (int j = 0; j < nc; ++j) {
+ ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
+ ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v);
+ }
+ }
+}
+
+static void ggml_compute_forward_get_rows_back_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(params->ith == 0);
+ GGML_ASSERT(ggml_are_same_shape(opt0, dst));
+ GGML_ASSERT(ggml_is_contiguous(opt0));
+ GGML_ASSERT(ggml_is_contiguous(dst));
+
+ ggml_compute_forward_dup_same_cont(params, opt0, dst);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int nc = src0->ne[0];
+ const int nr = ggml_nelements(src1);
+
+ GGML_ASSERT( dst->ne[0] == nc);
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+ for (int i = 0; i < nr; ++i) {
+ const int r = ((int32_t *) src1->data)[i];
+
+ ggml_vec_add_f32(nc,
+ (float *) ((char *) dst->data + r*dst->nb[1]),
+ (float *) ((char *) dst->data + r*dst->nb[1]),
+ (float *) ((char *) src0->data + i*src0->nb[1]));
+ }
+}
+
+
+static void ggml_compute_forward_get_rows_back(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst);
+ } break;
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+
+ //static bool first = true;
+ //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
+ //if (first) {
+ // first = false;
+ //} else {
+ // for (int k = 0; k < dst->ne[1]; ++k) {
+ // for (int j = 0; j < dst->ne[0]/16; ++j) {
+ // for (int i = 0; i < 16; ++i) {
+ // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
+ // }
+ // printf("\n");
+ // }
+ // printf("\n");
+ // }
+ // printf("\n");
+ // exit(0);
+ //}
+}
+
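get_rows_back is the scatter-add adjoint of get_rows: the forward op reads y[i] = x[rows[i]], so the backward op accumulates each incoming row into its source row (rows may repeat). A dense float reference, with illustrative names only:

    // dy: nr rows of nc floats, rows: nr indices, dx: pre-initialized output rows
    for (int i = 0; i < nr; ++i) {
        const int r = rows[i];
        for (int j = 0; j < nc; ++j) {
            dx[r*nc + j] += dy[i*nc + j];
        }
    }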
+// ggml_compute_forward_diag
+
+static void ggml_compute_forward_diag_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(params->ith == 0);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // TODO: handle transposed/permuted matrices
+
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ const int ne3 = dst->ne[3];
+ GGML_ASSERT(ne00 == ne0);
+ GGML_ASSERT(ne00 == ne1);
+ GGML_ASSERT(ne01 == 1);
+ GGML_ASSERT(ne02 == ne2);
+ GGML_ASSERT(ne03 == ne3);
+
+ const int nb00 = src0->nb[0];
+ //const int nb01 = src0->nb[1];
+ const int nb02 = src0->nb[2];
+ const int nb03 = src0->nb[3];
+ const int nb0 = dst->nb[0];
+ const int nb1 = dst->nb[1];
+ const int nb2 = dst->nb[2];
+ const int nb3 = dst->nb[3];
+
+ GGML_ASSERT(nb00 == sizeof(float));
+ GGML_ASSERT(nb0 == sizeof(float));
+
+ for (int i3 = 0; i3 < ne3; i3++) {
+ for (int i2 = 0; i2 < ne2; i2++) {
+ for (int i1 = 0; i1 < ne1; i1++) {
+ float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+ float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02);
+ for (int i0 = 0; i0 < i1; i0++) {
+ d[i0] = 0;
+ }
+ d[i1] = s[i1];
+ for (int i0 = i1+1; i0 < ne0; i0++) {
+ d[i0] = 0;
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_diag(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_diag_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
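ggml_diag expands the single row of each (i2,i3) slice into an ne00 x ne00 matrix with that row on the diagonal, e.g.

    \mathrm{diag}([a,\ b,\ c]) = \begin{pmatrix} a & 0 & 0 \\ 0 & b & 0 \\ 0 & 0 & c \end{pmatrix}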
+// ggml_compute_forward_diag_mask_inf
+
+static void ggml_compute_forward_diag_mask_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst,
+ const float value) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 1);
+ assert(ggml_nelements(src1) == 2);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
- const int n_past = ((int32_t *) src1->data)[0];
+ const int n_past = ((int32_t *) src1->data)[0];
+ const bool inplace = (bool)((int32_t *) src1->data)[1];
+
+ if (!inplace) {
+ ggml_compute_forward_dup_same_cont(params, src0, dst);
+ }
// TODO: handle transposed/permuted matrices
@@ -8384,7 +10345,7 @@ static void ggml_compute_forward_diag_mask_inf_f32(
for (int j = 0; j < nr; j++) {
for (int i = n_past; i < nc; i++) {
if (i > n_past + j) {
- *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY;
+ *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value;
}
}
}
@@ -8399,7 +10360,24 @@ static void ggml_compute_forward_diag_mask_inf(
switch (src0->type) {
case GGML_TYPE_F32:
{
- ggml_compute_forward_diag_mask_inf_f32(params, src0, src1, dst);
+ ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
+static void ggml_compute_forward_diag_mask_zero(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0);
} break;
default:
{
@@ -8438,44 +10416,44 @@ static void ggml_compute_forward_soft_max_f32(
const int ir1 = MIN(ir0 + dr, nr);
for (int i1 = ir0; i1 < ir1; i1++) {
- float *p = (float *)((char *) dst->data + i1*dst->nb[1]);
+ float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+ float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
//printf("p[%d] = %f\n", i, p[i]);
- assert(!isnan(p[i]));
+ assert(!isnan(sp[i]));
}
#endif
float max = -INFINITY;
- ggml_vec_max_f32(nc, &max, p);
+ ggml_vec_max_f32(nc, &max, sp);
ggml_float sum = 0.0;
uint16_t scvt;
for (int i = 0; i < nc; i++) {
- //printf("p[%3d] = %8.4f\n", i, p[i]);
- if (p[i] == -INFINITY) {
- p[i] = 0.0f;
+ if (sp[i] == -INFINITY) {
+ dp[i] = 0.0f;
} else {
- //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
- ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
+ // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
+ ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
memcpy(&scvt, &s, sizeof(scvt));
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
sum += (ggml_float)val;
- p[i] = val;
+ dp[i] = val;
}
}
assert(sum > 0.0);
sum = 1.0/sum;
- ggml_vec_scale_f32(nc, p, sum);
+ ggml_vec_scale_f32(nc, dp, sum);
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
- assert(!isnan(p[i]));
- assert(!isinf(p[i]));
+ assert(!isnan(dp[i]));
+ assert(!isinf(dp[i]));
}
#endif
}
@@ -8658,8 +10636,8 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
- assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 3);
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_nelements(src1) == 3);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@@ -8682,12 +10660,16 @@ static void ggml_compute_forward_rope_f32(
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
- assert(nb0 == sizeof(float));
+ GGML_ASSERT(nb0 == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ GGML_ASSERT(n_dims <= nc);
+ GGML_ASSERT(n_dims % 2 == 0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
@@ -8748,8 +10730,8 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
- assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 3);
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_nelements(src1) == 3);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@@ -8772,12 +10754,16 @@ static void ggml_compute_forward_rope_f16(
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
- assert(nb0 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ GGML_ASSERT(n_dims <= nc);
+ GGML_ASSERT(n_dims % 2 == 0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
@@ -8854,6 +10840,217 @@ static void ggml_compute_forward_rope(
}
}
+// ggml_compute_forward_rope_back
+
+static void ggml_compute_forward_rope_back_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ assert(src1->type == GGML_TYPE_I32);
+ assert(ggml_nelements(src1) == 3);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // y = rope(x, src1)
+ // dx = rope_back(dy, src1)
+ // src0 is dy, src1 contains options
+
+ const int n_past = ((int32_t *) src1->data)[0];
+ const int n_dims = ((int32_t *) src1->data)[1];
+ const int mode = ((int32_t *) src1->data)[2];
+
+ //const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
+ const int64_t ne3 = src0->ne[3];
+
+ const int nb0 = src0->nb[0];
+ const int nb1 = src0->nb[1];
+ const int nb2 = src0->nb[2];
+ const int nb3 = src0->nb[3];
+
+ //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+ //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+
+ assert(nb0 == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ // row index used to determine which thread to use
+ int ir = 0;
+
+ const float theta_scale = powf(10000.0, -2.0f/n_dims);
+
+ const bool is_neox = mode & 2;
+
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
+ const int p = ((mode & 1) == 0 ? n_past + i2 : i2);
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
+ if (ir++ < ir0) continue;
+ if (ir > ir1) break;
+
+ float theta = (float)p;
+
+ for (int i0 = 0; i0 < n_dims; i0 += 2) {
+ const float cos_theta = cosf(theta);
+ const float sin_theta = sinf(theta);
+
+ theta *= theta_scale;
+
+ if (!is_neox) {
+ const float * const dy = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ const float dy0 = dy[0];
+ const float dy1 = dy[1];
+
+ dx[0] = dy0*cos_theta + dy1*sin_theta;
+ dx[1] = - dy0*sin_theta + dy1*cos_theta;
+ } else {
+ const float * const dy = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
+ float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
+
+ const float dy0 = dy[0];
+ const float dy1 = dy[n_dims/2];
+
+ dx[0] = dy0*cos_theta + dy1*sin_theta;
+ dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_rope_back_f16(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ assert(src1->type == GGML_TYPE_I32);
+ assert(ggml_nelements(src1) == 3);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // y = rope(x, src1)
+ // dx = rope_back(dy, src1)
+ // src0 is dy, src1 contains options
+
+ const int n_past = ((int32_t *) src1->data)[0];
+ const int n_dims = ((int32_t *) src1->data)[1];
+ const int mode = ((int32_t *) src1->data)[2];
+
+ //const int64_t ne0 = src0->ne[0];
+ const int64_t ne1 = src0->ne[1];
+ const int64_t ne2 = src0->ne[2];
+ const int64_t ne3 = src0->ne[3];
+
+ const int nb0 = src0->nb[0];
+ const int nb1 = src0->nb[1];
+ const int nb2 = src0->nb[2];
+ const int nb3 = src0->nb[3];
+
+ //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+ //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+
+ assert(nb0 == sizeof(ggml_fp16_t));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ // row index used to determine which thread to use
+ int ir = 0;
+
+ const float theta_scale = powf(10000.0, -2.0f/n_dims);
+
+ const bool is_neox = mode & 2;
+
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
+ const int p = ((mode & 1) == 0 ? n_past + i2 : i2);
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
+ if (ir++ < ir0) continue;
+ if (ir > ir1) break;
+
+ float theta = (float)p;
+
+ for (int i0 = 0; i0 < n_dims; i0 += 2) {
+ const float cos_theta = cosf(theta);
+ const float sin_theta = sinf(theta);
+
+ theta *= theta_scale;
+
+ if (!is_neox) {
+ const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+ const float dy0 = GGML_FP16_TO_FP32(dy[0]);
+ const float dy1 = GGML_FP16_TO_FP32(dy[1]);
+
+ dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
+ dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
+ } else {
+ const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
+ ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
+
+ const float dy0 = GGML_FP16_TO_FP32(dy[0]);
+ const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
+
+ dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
+ dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
+ }
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_rope_back(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
+ } break;
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
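rope_back is simply the transposed rotation of the forward RoPE: each 2-D pair is rotated by -theta. With

    R(\theta) = \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix}, \qquad y = R(\theta)\,x \;\Rightarrow\; dx = R(\theta)^\top dy = R(-\theta)\,dy

i.e. dx0 = cos(theta)*dy0 + sin(theta)*dy1 and dx1 = -sin(theta)*dy0 + cos(theta)*dy1, which is what both the f32 and f16 paths above compute.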
// ggml_compute_forward_conv_1d_1s
static void ggml_compute_forward_conv_1d_1s_f16_f32(
@@ -10173,6 +12370,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor);
} break;
+ case GGML_OP_ADD1:
+ {
+ ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor);
+ } break;
+ case GGML_OP_ACC:
+ {
+ ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
+ } break;
case GGML_OP_SUB:
{
ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor);
@@ -10193,10 +12398,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_sqrt(params, tensor->src0, tensor);
} break;
+ case GGML_OP_LOG:
+ {
+ ggml_compute_forward_log(params, tensor->src0, tensor);
+ } break;
case GGML_OP_SUM:
{
ggml_compute_forward_sum(params, tensor->src0, tensor);
} break;
+ case GGML_OP_SUM_ROWS:
+ {
+ ggml_compute_forward_sum_rows(params, tensor->src0, tensor);
+ } break;
case GGML_OP_MEAN:
{
ggml_compute_forward_mean(params, tensor->src0, tensor);
@@ -10233,6 +12446,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_silu(params, tensor->src0, tensor);
} break;
+ case GGML_OP_SILU_BACK:
+ {
+ ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor);
+ } break;
case GGML_OP_NORM:
{
ggml_compute_forward_norm(params, tensor->src0, tensor);
@@ -10241,6 +12458,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_rms_norm(params, tensor->src0, tensor);
} break;
+ case GGML_OP_RMS_NORM_BACK:
+ {
+ ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor);
+ } break;
case GGML_OP_MUL_MAT:
{
ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor);
@@ -10249,6 +12470,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor);
} break;
+ case GGML_OP_SET:
+ {
+ ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
+ } break;
case GGML_OP_CPY:
{
ggml_compute_forward_cpy(params, tensor->src0, tensor);
@@ -10277,10 +12502,22 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor);
} break;
+ case GGML_OP_GET_ROWS_BACK:
+ {
+ ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
+ } break;
+ case GGML_OP_DIAG:
+ {
+ ggml_compute_forward_diag(params, tensor->src0, tensor);
+ } break;
case GGML_OP_DIAG_MASK_INF:
{
ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor);
} break;
+ case GGML_OP_DIAG_MASK_ZERO:
+ {
+ ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor);
+ } break;
case GGML_OP_SOFT_MAX:
{
ggml_compute_forward_soft_max(params, tensor->src0, tensor);
@@ -10289,6 +12526,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);
} break;
+ case GGML_OP_ROPE_BACK:
+ {
+ ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor);
+ } break;
case GGML_OP_ALIBI:
{
ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
@@ -10357,6 +12598,48 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace);
}
} break;
+ case GGML_OP_ADD1:
+ {
+ if (src0->grad) {
+ src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
+ }
+ if (src1->grad) {
+ src1->grad = ggml_add_impl(ctx,
+ src1->grad,
+ ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean
+ inplace);
+ }
+ } break;
+ case GGML_OP_ACC:
+ {
+ if (src0->grad) {
+ src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
+ }
+ if (src1->grad) {
+ GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5);
+ GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32);
+ const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0];
+ const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1];
+ const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2];
+ const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3];
+
+ struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
+ tensor->grad,
+ src1->grad->ne[0],
+ src1->grad->ne[1],
+ src1->grad->ne[2],
+ src1->grad->ne[3],
+ nb1, nb2, nb3, offset);
+
+ src1->grad =
+ ggml_add_impl(ctx,
+ src1->grad,
+ ggml_reshape(ctx,
+ ggml_cont(ctx, tensor_grad_view),
+ src1->grad),
+ inplace);
+ }
+ } break;
case GGML_OP_SUB:
{
if (src0->grad) {
@@ -10408,9 +12691,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src0->grad =
ggml_add_impl(ctx,
src0->grad,
- ggml_mul(ctx,
+ ggml_scale(ctx,
ggml_mul(ctx, src0, tensor->grad),
- ggml_repeat(ctx, ggml_new_f32(ctx, 2.0f), src0)),
+ ggml_new_f32(ctx, 2.0f)),
inplace);
}
} break;
@@ -10420,9 +12703,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src0->grad =
ggml_add_impl(ctx,
src0->grad,
+ ggml_mul(ctx,
+ tensor->grad, // this was not caught by test_grad because in test_grad tensor->grad is 1
+ ggml_div(ctx,
+ ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor),
+ tensor)),
+ inplace);
+ }
+ } break;
+ case GGML_OP_LOG:
+ {
+ if (src0->grad) {
+ src0->grad =
+ ggml_add_impl(ctx,
+ src0->grad,
ggml_div(ctx,
- ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor),
- tensor),
+ tensor->grad,
+ src0),
inplace);
}
} break;
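Both cases are plain chain-rule updates; written out (y = sqrt(x) is the forward result, dy = tensor->grad):

    \frac{\partial L}{\partial x}\bigg|_{\mathrm{sqrt}} = dy\cdot\frac{0.5}{y}, \qquad \frac{\partial L}{\partial x}\bigg|_{\mathrm{log}} = \frac{dy}{x}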
@@ -10430,9 +12727,21 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
if (src0->grad) {
src0->grad =
+ ggml_add1_impl(ctx,
+ src0->grad,
+ tensor->grad,
+ inplace);
+ }
+ } break;
+ case GGML_OP_SUM_ROWS:
+ {
+ if (src0->grad) {
+ src0->grad =
ggml_add_impl(ctx,
src0->grad,
- ggml_repeat(ctx, tensor->grad, src0->grad),
+ ggml_repeat(ctx,
+ tensor->grad,
+ src0->grad),
inplace);
}
} break;
@@ -10442,11 +12751,44 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_REPEAT:
{
+ // necessary for llama
if (src0->grad) {
+ GGML_ASSERT(src0->n_dims == 1 || src0->n_dims == 2);
+ const int nc = tensor->ne[0];
+ const int nr = tensor->ne[1];
+ const int nc0 = src0->ne[0];
+ const int nr0 = src0->ne[1];
+ const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat
+ const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat
+ // tensor->grad [nc,nr,1,1]
+ // reshape [nc0,nc/nc0,nr0,nr/nr0]
+ // permute [nc0,nr0,nc/nc0,nr/nr0]
+ // substitute [nc0,nr0,ncr,nrr]
+ // reshape [nc0*nr0,ncr*nrr,1,1]
+ // transpose [ncr*nrr,nc0*nr0,1,1]
+ // sum rows [1,nc0*nr0,1,1]
+ // transpose [nc0*nr0,1,1,1]
+ // reshape [nc0,nr0,1,1] reshape_1d or reshape_2d
+ // add to src0->grad
+
+ int64_t ne[4] = {nc0,ncr,nr0,nrr};
+
+ struct ggml_tensor* F00 = tensor->grad;
+ struct ggml_tensor* F01 = ggml_reshape (ctx, F00, ggml_new_tensor(ctx,tensor->grad->type,4,ne));
+ struct ggml_tensor* F02 = ggml_permute (ctx, F01, 0,2,1,3);
+ struct ggml_tensor* F03 = ggml_cont (ctx, F02);
+ struct ggml_tensor* F04 = ggml_reshape_2d(ctx, F03, nc0*nr0, ncr*nrr);
+ struct ggml_tensor* F05 = ggml_transpose (ctx, F04);
+ struct ggml_tensor* F06 = ggml_cont (ctx, F05);
+ struct ggml_tensor* F07 = ggml_sum_rows (ctx, F06);
+ struct ggml_tensor* F08 = ggml_transpose (ctx, F07);
+ struct ggml_tensor* F09 = ggml_cont (ctx, F08);
+ struct ggml_tensor* F10 = ggml_reshape (ctx, F09, src0->grad);
+
src0->grad =
ggml_add_impl(ctx,
src0->grad,
- ggml_sum(ctx, tensor->grad),
+ F10,
inplace);
}
} break;
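The reshape/permute/sum_rows chain above is equivalent to summing the incoming gradient over every repeat position. A naive reference for the 2-D case handled here (contiguous float buffers, names are illustrative):

    // dy: [nc, nr] = [nc0*ncr, nr0*nrr] row-major, dx: [nc0, nr0] pre-zeroed
    for (int ir = 0; ir < nrr; ++ir) {
        for (int jc = 0; jc < ncr; ++jc) {
            for (int k1 = 0; k1 < nr0; ++k1) {
                for (int k0 = 0; k0 < nc0; ++k0) {
                    dx[k1*nc0 + k0] += dy[(ir*nr0 + k1)*nc + jc*nc0 + k0];
                }
            }
        }
    }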
@@ -10501,6 +12843,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_SILU:
{
+ // necessary for llama
+ if (src0->grad) {
+ src0->grad = ggml_add_impl(ctx,
+ src0->grad,
+ ggml_silu_back(ctx, src0, tensor->grad),
+ inplace);
+ }
+ } break;
+ case GGML_OP_SILU_BACK:
+ {
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_NORM:
@@ -10509,67 +12861,371 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_RMS_NORM:
{
+ // necessary for llama
+ if (src0->grad) {
+ src0->grad = ggml_add_impl(ctx,
+ src0->grad,
+ ggml_rms_norm_back(ctx, src0, tensor->grad),
+ inplace);
+ }
+ } break;
+ case GGML_OP_RMS_NORM_BACK:
+ {
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_MUL_MAT:
{
+ // https://cs231n.github.io/optimization-2/#staged
+ // # forward pass
+ // s0 = np.random.randn(5, 10)
+ // s1 = np.random.randn(10, 3)
+ // t = s0.dot(s1)
+
+ // # now suppose we had the gradient on t from above in the circuit
+ // dt = np.random.randn(*t.shape) # same shape as t
+ // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
+ // ds1 = s0.T.dot(dt)
+
+ // tensor.shape [m,p]
+ // src0.shape [n,m]
+ // src1.shape [n,p]
+
+ // necessary for llama
if (src0->grad) {
// TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad);
- GGML_ASSERT(false);
+ src0->grad =
+ ggml_add_impl(ctx,
+ src0->grad,
+ // ds0 = dt.dot(s1.T)
+ // ggml_out_prod(ctx, // [n,m]
+ // src1, // [n,p]
+ // tensor->grad), // [m,p]
+ // for now just using A*B==(B.T*A.T).T
+ ggml_cont(ctx, // [n,m]
+ ggml_transpose(ctx, // [n,m]
+ ggml_mul_mat(ctx, // [m,n]
+ ggml_cont(ctx, // [p,m]
+ ggml_transpose(ctx, // [p,m]
+ tensor->grad)), // [m,p]
+ ggml_cont(ctx, // [p,n]
+ ggml_transpose(ctx, // [p,n]
+ src1))))), // [n,p]
+ inplace);
}
if (src1->grad) {
src1->grad =
ggml_add_impl(ctx,
src1->grad,
- ggml_mul_mat(ctx,
- ggml_cont(ctx, ggml_transpose(ctx, src0)),
- tensor->grad),
+ // ds1 = s0.T.dot(dt):
+ ggml_mul_mat(ctx, // [n,p]
+ ggml_cont(ctx, // [m,n]
+ ggml_transpose(ctx, src0)), // [m,n]
+ tensor->grad), // [m,p]
inplace);
}
} break;
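In matrix notation, following the numpy staging in the comment (t = s0.dot(s1)):

    \frac{\partial L}{\partial s_0} = \frac{\partial L}{\partial t}\, s_1^\top, \qquad \frac{\partial L}{\partial s_1} = s_0^\top\, \frac{\partial L}{\partial t}

Because there is no out_prod primitive yet, the first product is routed through ggml_mul_mat using the identity A*B = (B^T * A^T)^T, at the cost of the extra transposes and ggml_cont copies visible above.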
case GGML_OP_SCALE:
{
- GGML_ASSERT(false); // TODO: not implemented
+ // necessary for llama
+ if (src0->grad) {
+ src0->grad =
+ ggml_add_impl(ctx,
+ src0->grad,
+ ggml_scale_impl(ctx, tensor->grad, src1, false),
+ inplace);
+ }
+ if (src1->grad) {
+ src1->grad =
+ ggml_add_impl(ctx,
+ src1->grad,
+ ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
+ inplace);
+ }
+ } break;
+ case GGML_OP_SET:
+ {
+ GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5);
+ GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32);
+ const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0];
+ const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1];
+ const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2];
+ const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3];
+
+ struct ggml_tensor * tensor_grad_view = NULL;
+
+ if (src0->grad || src1->grad) {
+ GGML_ASSERT(src0->type == tensor->type);
+ GGML_ASSERT(tensor->grad->type == tensor->type);
+ GGML_ASSERT(tensor->grad->type == src1->grad->type);
+
+ tensor_grad_view = ggml_view_4d(ctx,
+ tensor->grad,
+ src1->grad->ne[0],
+ src1->grad->ne[1],
+ src1->grad->ne[2],
+ src1->grad->ne[3],
+ nb1, nb2, nb3, offset);
+ }
+
+ if (src0->grad) {
+ src0->grad = ggml_add_impl(ctx,
+ src0->grad,
+ ggml_acc_impl(ctx,
+ tensor->grad,
+ ggml_neg(ctx, tensor_grad_view),
+ nb1, nb2, nb3, offset, false),
+ inplace);
+ }
+
+ if (src1->grad) {
+ src1->grad =
+ ggml_add_impl(ctx,
+ src1->grad,
+ ggml_reshape(ctx,
+ ggml_cont(ctx, tensor_grad_view),
+ src1->grad),
+ inplace);
+ }
} break;
case GGML_OP_CPY:
{
- GGML_ASSERT(false); // TODO: not implemented
+ // necessary for llama
+ // cpy overwrites the value of src1 with src0 and returns view(src1)
+ // the overwriting is mathematically equivalent to:
+ // tensor = src0 * 1 + src1 * 0
+ if (src0->grad) {
+ // dsrc0 = dtensor * 1
+ src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
+ }
+ if (src1->grad) {
+ // dsrc1 = dtensor * 0 -> noop
+ }
} break;
case GGML_OP_CONT:
{
- GGML_ASSERT(false); // TODO: not implemented
+ // same as cpy
+ if (src0->grad) {
+ GGML_ASSERT(ggml_is_contiguous(src0->grad));
+ GGML_ASSERT(ggml_is_contiguous(tensor->grad));
+ src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
+ }
} break;
case GGML_OP_RESHAPE:
{
- GGML_ASSERT(false); // TODO: not implemented
+ // necessary for llama
+ if (src0->grad) {
+ src0->grad =
+ ggml_add_impl(ctx, src0->grad,
+ ggml_reshape(ctx, tensor->grad, src0->grad),
+ inplace);
+ }
} break;
case GGML_OP_VIEW:
{
- GGML_ASSERT(false); // not supported
+ // necessary for llama
+ if (src0->grad) {
+ size_t offset;
+ memcpy(&offset, tensor->padding, sizeof(offset));
+
+ size_t nb1 = tensor->nb[1];
+ size_t nb2 = tensor->nb[2];
+ size_t nb3 = tensor->nb[3];
+
+ if (src0->type != src0->grad->type) {
+ // gradient is typically F32, but src0 could be other type
+ size_t ng = ggml_element_size(src0->grad);
+ size_t n0 = ggml_element_size(src0);
+ GGML_ASSERT(offset % n0 == 0);
+ GGML_ASSERT(nb1 % n0 == 0);
+ GGML_ASSERT(nb2 % n0 == 0);
+ GGML_ASSERT(nb3 % n0 == 0);
+ offset = (offset / n0) * ng;
+ nb1 = (nb1 / n0) * ng;
+ nb2 = (nb2 / n0) * ng;
+ nb3 = (nb3 / n0) * ng;
+ }
+
+ src0->grad = ggml_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace);
+ }
} break;
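The offset/stride rescaling converts byte offsets expressed in units of src0's element size into units of the gradient's element size. A small worked example with illustrative values:

    // e.g. src0 is F16 (n0 = 2 bytes) while src0->grad is F32 (ng = 4 bytes)
    size_t n0 = 2, ng = 4;
    size_t offset = 6;             // 3 F16 elements into src0
    offset = (offset / n0) * ng;   // -> 12 bytes: the same 3 elements in the F32 grad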
case GGML_OP_PERMUTE:
{
- GGML_ASSERT(false); // TODO: not implemented
+ // necessary for llama
+ if (src0->grad) {
+ int axis0 = tensor->padding[0] & 0x3;
+ int axis1 = tensor->padding[1] & 0x3;
+ int axis2 = tensor->padding[2] & 0x3;
+ int axis3 = tensor->padding[3] & 0x3;
+ int axes_backward[4] = {0,0,0,0};
+ axes_backward[axis0] = 0;
+ axes_backward[axis1] = 1;
+ axes_backward[axis2] = 2;
+ axes_backward[axis3] = 3;
+ src0->grad =
+ ggml_add_impl(ctx, src0->grad,
+ ggml_permute(ctx,
+ tensor->grad,
+ axes_backward[0],
+ axes_backward[1],
+ axes_backward[2],
+ axes_backward[3]),
+ inplace);
+ }
} break;
case GGML_OP_TRANSPOSE:
{
- GGML_ASSERT(false); // TODO: not implemented
+ // necessary for llama
+ if (src0->grad) {
+ src0->grad =
+ ggml_add_impl(ctx, src0->grad,
+ ggml_transpose(ctx, tensor->grad),
+ inplace);
+ }
} break;
case GGML_OP_GET_ROWS:
{
+ // necessary for llama (only for tokenizer)
+ if (src0->grad) {
+ src0->grad =
+ ggml_add_impl(ctx, src0->grad,
+ ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad),
+ inplace);
+ }
+ if (src1->grad) {
+ // noop
+ }
+ } break;
+ case GGML_OP_GET_ROWS_BACK:
+ {
GGML_ASSERT(false); // TODO: not implemented
} break;
- case GGML_OP_DIAG_MASK_INF:
+ case GGML_OP_DIAG:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
+ case GGML_OP_DIAG_MASK_INF:
+ {
+ // necessary for llama
+ if (src0->grad) {
+ assert(src1->type == GGML_TYPE_I32);
+ assert(ggml_nelements(src1) == 2);
+ const int n_past = ((int32_t *) src1->data)[0];
+ src0->grad =
+ ggml_add_impl(ctx, src0->grad,
+ ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
+ inplace);
+ }
+ if (src1->grad) {
+ // noop
+ }
+ } break;
+ case GGML_OP_DIAG_MASK_ZERO:
+ {
+ // necessary for llama
+ if (src0->grad) {
+ assert(src1->type == GGML_TYPE_I32);
+ assert(ggml_nelements(src1) == 2);
+ const int n_past = ((int32_t *) src1->data)[0];
+ src0->grad =
+ ggml_add_impl(ctx, src0->grad,
+ ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
+ inplace);
+ }
+ if (src1->grad) {
+ // noop
+ }
+ } break;
case GGML_OP_SOFT_MAX:
{
- GGML_ASSERT(false); // TODO: not implemented
+ // necessary for llama
+ if (src0->grad) {
+ // y = softmax(x)
+ //
+ // Jii = yi - yi*yi
+ // Jij = -yi*yj
+ // J = diag(y) - y*y^T
+ // dx = J * dy
+ // dxk = sum_j(Jkj * dyj)
+
+ int64_t ne2[4] = {
+ tensor->ne[0],
+ 1,
+ tensor->ne[1]*tensor->ne[2],
+ tensor->ne[3]
+ };
+ struct ggml_tensor * tensor2 = ggml_cont(ctx,
+ ggml_reshape_4d(ctx,
+ ggml_cont(ctx, tensor),
+ ne2[0], ne2[1], ne2[2], ne2[3]));
+
+ struct ggml_tensor * grad2 = ggml_cont(ctx,
+ ggml_reshape_4d(ctx,
+ ggml_cont(ctx, tensor->grad),
+ ne2[0], ne2[1], ne2[2], ne2[3]));
+
+ struct ggml_tensor * tensor2_t = ggml_cont(ctx, // [1,ne0,ne1*ne2,ne3]
+ ggml_permute(ctx, // [1,ne0,ne1*ne2,ne3]
+ tensor2, // [ne0,1,ne1*ne2,ne3]
+ 1, 0, 2, 3));
+
+ src0->grad =
+ ggml_add_impl(ctx,
+ src0->grad, // [ne0,ne1,ne2,ne3]
+ ggml_reshape(ctx, // [ne0,ne1,ne2,ne3]
+ ggml_mul_mat(ctx, // [ne0,1,ne1*ne2,ne3]
+ ggml_sub(ctx, // [ne0,ne0,ne1*ne2,ne3]
+ ggml_diag(ctx, // [ne0,ne0,ne1*ne2,ne3]
+ tensor2), // [ne0,1,ne1*ne2,ne3]
+ ggml_mul_mat(ctx, // [ne0,ne0,ne1*ne2,ne3]
+ tensor2_t, // [1,ne0,ne1*ne2,ne3]
+ tensor2_t)), // [1,ne0,ne1*ne2,ne3]
+ grad2), // [ne0,1,ne1*ne2,ne3]
+ src0->grad),
+ inplace);
+ }
} break;
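Since J*dy = diag(y)*dy - y*(y^T dy), the whole Jacobian product collapses per row to one dot product and an elementwise update. A scalar reference of the same result (not how the graph above is built, just a check):

    // y: softmax output row of length n, dy: incoming gradient, dx: result
    float dot = 0.0f;
    for (int i = 0; i < n; ++i) dot += y[i]*dy[i];
    for (int i = 0; i < n; ++i) dx[i] = y[i]*(dy[i] - dot);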
case GGML_OP_ROPE:
{
- GGML_ASSERT(false); // TODO: not implemented
+ // necessary for llama
+ if (src0->grad) {
+ assert(src1->type == GGML_TYPE_I32);
+ assert(ggml_nelements(src1) == 3);
+ const int n_past = ((int32_t *) src1->data)[0];
+ const int n_dims = ((int32_t *) src1->data)[1];
+ const int mode = ((int32_t *) src1->data)[2];
+ src0->grad = ggml_add_impl(ctx,
+ src0->grad,
+ ggml_rope_back(ctx,
+ tensor->grad,
+ n_past,
+ n_dims,
+ mode),
+ inplace);
+ }
+ if (src1->grad) {
+ // noop
+ }
+ } break;
+ case GGML_OP_ROPE_BACK:
+ {
+ if (src0->grad) {
+ assert(src1->type == GGML_TYPE_I32);
+ assert(ggml_nelements(src1) == 3);
+ const int n_past = ((int32_t *) src1->data)[0];
+ const int n_dims = ((int32_t *) src1->data)[1];
+ const int mode = ((int32_t *) src1->data)[2];
+ src0->grad = ggml_add_impl(ctx,
+ src0->grad,
+ ggml_rope(ctx,
+ tensor->grad,
+ n_past,
+ n_dims,
+ mode),
+ inplace);
+ }
+ if (src1->grad) {
+ // noop
+ }
} break;
case GGML_OP_CONV_1D_1S:
{
@@ -10927,6 +13583,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
work_size = MAX(work_size, cur);
} break;
case GGML_OP_ADD:
+ case GGML_OP_ADD1:
{
node->n_tasks = n_threads;
@@ -10938,12 +13595,26 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
work_size = MAX(work_size, cur);
} break;
+ case GGML_OP_ACC:
+ {
+ node->n_tasks = n_threads;
+
+ size_t cur = 0;
+
+ if (ggml_is_quantized(node->src0->type)) {
+ cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads;
+ }
+
+ work_size = MAX(work_size, cur);
+ } break;
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_SQR:
case GGML_OP_SQRT:
+ case GGML_OP_LOG:
case GGML_OP_SUM:
+ case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_REPEAT:
case GGML_OP_ABS:
@@ -10962,8 +13633,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
{
node->n_tasks = n_threads;
} break;
+ case GGML_OP_SILU_BACK:
+ {
+ node->n_tasks = n_threads;
+ } break;
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
+ case GGML_OP_RMS_NORM_BACK:
{
node->n_tasks = n_threads;
} break;
@@ -11029,21 +13705,23 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
{
node->n_tasks = n_threads;
} break;
+ case GGML_OP_SET:
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_GET_ROWS:
+ case GGML_OP_GET_ROWS_BACK:
+ case GGML_OP_DIAG:
case GGML_OP_DIAG_MASK_INF:
+ case GGML_OP_DIAG_MASK_ZERO:
{
node->n_tasks = 1;
} break;
case GGML_OP_SOFT_MAX:
- {
- node->n_tasks = n_threads;
- } break;
case GGML_OP_ROPE:
+ case GGML_OP_ROPE_BACK:
{
node->n_tasks = n_threads;
} break;
@@ -12180,7 +14858,7 @@ enum ggml_opt_result ggml_opt(
// build forward + backward compute graphs
struct ggml_cgraph gf = ggml_build_forward (f);
- struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false);
+ struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, true);
switch (params.type) {
case GGML_OPT_ADAM:
diff --git a/ggml.h b/ggml.h
index bb9a025..2745fb3 100644
--- a/ggml.h
+++ b/ggml.h
@@ -192,7 +192,7 @@
#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
-#define GGML_MAX_PARAMS 16
+#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_DEFAULT_N_THREADS 4
@@ -262,12 +262,16 @@ extern "C" {
GGML_OP_DUP,
GGML_OP_ADD,
+ GGML_OP_ADD1,
+ GGML_OP_ACC,
GGML_OP_SUB,
GGML_OP_MUL,
GGML_OP_DIV,
GGML_OP_SQR,
GGML_OP_SQRT,
+ GGML_OP_LOG,
GGML_OP_SUM,
+ GGML_OP_SUM_ROWS,
GGML_OP_MEAN,
GGML_OP_REPEAT,
GGML_OP_ABS,
@@ -277,12 +281,15 @@ extern "C" {
GGML_OP_RELU,
GGML_OP_GELU,
GGML_OP_SILU,
+ GGML_OP_SILU_BACK,
GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM,
+ GGML_OP_RMS_NORM_BACK,
GGML_OP_MUL_MAT,
GGML_OP_SCALE,
+ GGML_OP_SET,
GGML_OP_CPY,
GGML_OP_CONT,
GGML_OP_RESHAPE,
@@ -290,9 +297,13 @@ extern "C" {
GGML_OP_PERMUTE,
GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS,
+ GGML_OP_GET_ROWS_BACK,
+ GGML_OP_DIAG,
GGML_OP_DIAG_MASK_INF,
+ GGML_OP_DIAG_MASK_ZERO,
GGML_OP_SOFT_MAX,
GGML_OP_ROPE,
+ GGML_OP_ROPE_BACK,
GGML_OP_ALIBI,
GGML_OP_CONV_1D_1S,
GGML_OP_CONV_1D_2S,
@@ -496,6 +507,29 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
+ GGML_API struct ggml_tensor * ggml_add1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_acc(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
GGML_API struct ggml_tensor * ggml_sub(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -519,12 +553,24 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
+ GGML_API struct ggml_tensor * ggml_log(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_log_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
// return scalar
- // TODO: compute sum along rows
GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx,
struct ggml_tensor * a);
+ // sums along rows; with input shape [a,b,c,d], the result has shape [1,b,c,d]
+ GGML_API struct ggml_tensor * ggml_sum_rows(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
// mean along rows
GGML_API struct ggml_tensor * ggml_mean(
struct ggml_context * ctx,
@@ -566,6 +612,13 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_silu_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
GGML_API struct ggml_tensor * ggml_norm(
@@ -576,6 +629,13 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
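+ // editor's note: rms_norm computes y = x / sqrt(mean(x^2) + eps) per row;
+ // rms_norm_back propagates dy through that expression to obtain dx
+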
// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows
@@ -588,12 +648,66 @@ extern "C" {
// operations on tensors without backpropagation
//
- // in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_scale(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
+ GGML_API struct ggml_tensor * ggml_set(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+ // b -> view(a,offset,nb1), return modified a
+ GGML_API struct ggml_tensor * ggml_set_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
+ // b -> view(a,offset,nb1), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
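+ // illustrative sketch (editor's addition, mirroring tests/test-grad0.c):
+ //   struct ggml_tensor * y = ggml_set_1d(ctx, a, b, offset);
+ // produces a copy of a with the elements of b written in at the given byte offset;
+ // the _inplace variants write into a and return a view of it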
+
// a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy(
struct ggml_context * ctx,
@@ -614,6 +728,11 @@ extern "C" {
// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0);
+
GGML_API struct ggml_tensor * ggml_reshape_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -629,6 +748,14 @@ extern "C" {
int64_t ne1,
int64_t ne2);
+ GGML_API struct ggml_tensor * ggml_reshape_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
// offset in bytes
GGML_API struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx,
@@ -654,6 +781,18 @@ extern "C" {
size_t nb2, // slice stride in bytes
size_t offset);
+ GGML_API struct ggml_tensor * ggml_view_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3,
+ size_t nb1, // row stride in bytes
+ size_t nb2, // slice stride in bytes
+ size_t nb3, // stride in bytes for dim 3
+ size_t offset);
+
GGML_API struct ggml_tensor * ggml_permute(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -672,20 +811,50 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
+ GGML_API struct ggml_tensor * ggml_get_rows_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c);
+
+ GGML_API struct ggml_tensor * ggml_diag(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
// set elements above the diagonal to -INF
- // in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past);
// in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // set elements above the diagonal to 0
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
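+ // editor's note: the zero-masking variant is what the backward pass of
+ // ggml_diag_mask_inf needs, since gradients at the masked (-INF) positions are zero
+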
GGML_API struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx,
struct ggml_tensor * a);
- // rotary position embedding
// in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // rotary position embedding
// if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style
// TODO: avoid creating a new tensor every time
@@ -696,6 +865,23 @@ extern "C" {
int n_dims,
int mode);
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
+ // rotary position embedding backward, i.e. compute dx from dy
+ // a - dy
+ GGML_API struct ggml_tensor * ggml_rope_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
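+ // editor's note: RoPE applies a pure rotation to pairs of elements, so the
+ // backward pass can rotate dy by the negated angles to recover dx
+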
// alibi position embedding
// in-place, returns view(a)
struct ggml_tensor * ggml_alibi(
@@ -740,13 +926,13 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
- const ggml_unary_op_f32_t fun);
+ ggml_unary_op_f32_t fun);
GGML_API struct ggml_tensor * ggml_map_binary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
- const ggml_binary_op_f32_t fun);
+ ggml_binary_op_f32_t fun);
//
// automatic differentiation
diff --git a/llama.cpp b/llama.cpp
index e564de7..08c7352 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1128,8 +1128,8 @@ static bool llama_eval_internal(
// self-attention
{
// compute Q and K and RoPE them
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
ggml_set_name(Qcur, "Qcur");
ggml_set_name(Kcur, "Kcur");
@@ -1170,17 +1170,19 @@ static bool llama_eval_internal(
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
ggml_set_name(KQ_scaled, "KQ_scaled");
// KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
@@ -1281,7 +1283,7 @@ static bool llama_eval_internal(
lctx.use_buf(ctx0, -1);
// logits -> probs
- //inpL = ggml_soft_max(ctx0, inpL);
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
@@ -2375,7 +2377,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
if (scaling != 1.0f) {
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
- BA = ggml_scale(lora_ctx, BA, scale_tensor);
+ BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
}
ggml_tensor * r;
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6456485..4171c12 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,3 +10,5 @@ llama_add_test(test-quantize-fns.cpp)
llama_add_test(test-quantize-perf.cpp)
llama_add_test(test-sampling.cpp)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
+# llama_add_test(test-grad0.c) # SLOW
+# llama_add_test(test-opt.c) # SLOW
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
new file mode 100644
index 0000000..ec50592
--- /dev/null
+++ b/tests/test-grad0.c
@@ -0,0 +1,1131 @@
+#include "ggml.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define MAX_NARGS 2
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define GGML_SILU_FP16
+
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+float frand(void) {
+ return (float)rand()/(float)RAND_MAX;
+}
+
+int irand(int n) {
+ if (n == 0) return 0;
+ else return rand()%n;
+}
+
+void get_random_dims(int64_t * dims, int ndims) {
+ dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+ for (int i = 0; i < ndims; i++) {
+ dims[i] = 1 + irand(4);
+ }
+}
+
+struct ggml_tensor * get_random_tensor(
+ struct ggml_context * ctx0,
+ int ndims,
+ int64_t ne[],
+ float fmin,
+ float fmax) {
+ struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+ switch (ndims) {
+ case 1:
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+ }
+ break;
+ case 2:
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+ }
+ }
+ break;
+ case 3:
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+ }
+ }
+ }
+ break;
+ case 4:
+ for (int i3 = 0; i3 < ne[3]; i3++) {
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+ }
+ }
+ }
+ }
+ break;
+ default:
+ assert(false);
+ };
+
+ return result;
+}
+
+struct ggml_tensor * get_random_tensor_int(
+ struct ggml_context * ctx0,
+ int ndims,
+ int64_t ne[],
+ int32_t imin,
+ int32_t imax) {
+ struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);
+
+ switch (ndims) {
+ case 1:
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
+ }
+ break;
+ case 2:
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
+ }
+ }
+ break;
+ case 3:
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
+ }
+ }
+ }
+ break;
+ case 4:
+ for (int i3 = 0; i3 < ne[3]; i3++) {
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
+ }
+ }
+ }
+ }
+ break;
+ default:
+ assert(false);
+ };
+
+ return result;
+}
+
+float get_element(const struct ggml_tensor * t, int idx) {
+ if (t->type == GGML_TYPE_F32) {
+ return ((float *)t->data)[idx];
+ } else if (t->type == GGML_TYPE_I32) {
+ return ((int32_t *)t->data)[idx];
+ } else {
+ assert(false);
+ return INFINITY;
+ }
+}
+
+void set_element(struct ggml_tensor * t, int idx, float value) {
+ ((float *)t->data)[idx] = value;
+}
+
+void print_elements(const char* label, const struct ggml_tensor * t) {
+ if (!t) {
+ printf("%s: %s = null\n", __func__, label);
+ return;
+ }
+ const int nelements = ggml_nelements(t);
+ printf("%s: %s = [", __func__, label);
+ for (int k = 0; k < nelements; ++k) {
+ if (k > 0) { printf(", "); }
+ printf("%.5f", get_element(t, k));
+ }
+ printf("] shape: [");
+ for (int k = 0; k < t->n_dims; ++k) {
+ if (k > 0) { printf(", "); }
+ printf("%d", (int)t->ne[k]);
+ }
+ printf("]\n");
+
+}
+
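+// compares the analytic gradients from the backward graph against central finite
+// differences, (f(x+eps) - f(x-eps)) / (2*eps), for every element of every parameter;
+// a deviation beyond max_error_abs / max_error_rel fails the check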
+bool check_gradient(
+ const char * op_name,
+ struct ggml_context * ctx0,
+ struct ggml_tensor * x[],
+ struct ggml_tensor * f,
+ int ndims,
+ int nargs,
+ float eps,
+ float max_error_abs,
+ float max_error_rel) {
+
+ struct ggml_cgraph gf = ggml_build_forward (f);
+ struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+
+ ggml_graph_compute(ctx0, &gf);
+ ggml_graph_reset (&gf);
+ ggml_set_f32 (f->grad, 1.0f);
+ ggml_graph_compute(ctx0, &gb);
+
+ // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
+ // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot");
+
+ for (int i = 0; i < nargs; ++i) {
+ const int nelements = ggml_nelements(x[i]);
+ for (int k = 0; k < nelements; ++k) {
+ // compute gradient using finite differences
+ const float x0 = get_element(x[i], k);
+ const float xm = x0 - eps;
+ const float xp = x0 + eps;
+ set_element(x[i], k, xp);
+ ggml_graph_compute(ctx0, &gf);
+
+ const float f0 = ggml_get_f32_1d(f, 0);
+
+ set_element(x[i], k, xm);
+ ggml_graph_compute(ctx0, &gf);
+
+ const float f1 = ggml_get_f32_1d(f, 0);
+
+ const float g0 = (f0 - f1)/(2.0f*eps);
+
+ set_element(x[i], k, x0);
+
+ // compute gradient using backward graph
+ ggml_graph_reset (&gf);
+ ggml_set_f32 (f->grad, 1.0f);
+ ggml_graph_compute(ctx0, &gb);
+
+ const float g1 = get_element(x[i]->grad, k);
+
+ const float error_abs = fabsf(g0 - g1);
+ const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
+
+ if (error_abs > max_error_abs || error_rel > max_error_rel) {
+ printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
+ op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
+ //assert(false);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+// TODO: clean-up this ..
+bool check_mat_mul(
+ const struct ggml_tensor * y,
+ const struct ggml_tensor * x0,
+ const struct ggml_tensor * x1) {
+ float * dst = (float *) y->data;
+ float * src0 = (float *) x0->data;
+ float * src1 = (float *) x1->data;
+
+ const int nc = x0->ne[1];
+ const int nr = x1->ne[1];
+ const int nk = x0->ne[0];
+
+ GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);
+
+ GGML_PRINT_DEBUG("x0:\n");
+ for (int j = 0; j < x0->ne[1]; ++j) {
+ for (int i = 0; i < x0->ne[0]; ++i) {
+ GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
+ }
+ GGML_PRINT_DEBUG("\n");
+ }
+ GGML_PRINT_DEBUG("\n");
+
+ GGML_PRINT_DEBUG("x1:\n");
+ for (int j = 0; j < x1->ne[1]; ++j) {
+ for (int i = 0; i < x1->ne[0]; ++i) {
+ GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
+ }
+ GGML_PRINT_DEBUG("\n");
+ }
+ GGML_PRINT_DEBUG("\n");
+
+ GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
+ for (int j = 0; j < y->ne[1]; ++j) {
+ for (int i = 0; i < y->ne[0]; ++i) {
+ GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
+ }
+ GGML_PRINT_DEBUG("\n");
+ }
+
+ for (int i = 0; i < nr; ++i) {
+ for (int j = 0; j < nc; ++j) {
+ float sum = 0.0f;
+
+ for (int k = 0; k < nk; ++k) {
+ sum += src0[j*nk + k]*src1[i*nk + k];
+ }
+
+ if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
+ fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
+ assert(false);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
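+// 4! = 24 orderings of the four tensor axes, enumerated at the start of main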
+#define NUM_PERMUTATIONS (4*3*2*1)
+
+int main(int argc, const char ** argv) {
+ struct ggml_init_params params = {
+ .mem_size = 128*1024*1024,
+ .mem_buffer = NULL,
+ .no_alloc = false,
+ };
+
+ int64_t ne[4];
+
+ int all_permutations[4 * NUM_PERMUTATIONS];
+ {
+ int count = 0;
+ for (int ax0=0; ax0<4; ++ax0) {
+ for (int ax1=0; ax1<4; ++ax1) {
+ if (ax1 == ax0) continue;
+ for (int ax2=0; ax2<4; ++ax2) {
+ if (ax2 == ax0) continue;
+ if (ax2 == ax1) continue;
+ for (int ax3=0; ax3<4; ++ax3) {
+ if (ax3 == ax0) continue;
+ if (ax3 == ax1) continue;
+ if (ax3 == ax2) continue;
+ assert(count < NUM_PERMUTATIONS);
+ all_permutations[count*4+0] = ax0;
+ all_permutations[count*4+1] = ax1;
+ all_permutations[count*4+2] = ax2;
+ all_permutations[count*4+3] = ax3;
+ ++count;
+ }
+ }
+ }
+ }
+ }
+
+
+ // original loop: 1000
+ int niter = 4;
+ const char *env = getenv("GGML_NLOOP");
+ if (env != NULL) {
+ niter = atoi(env);
+ }
+ if (argc > 1) {
+ niter = atoi(argv[1]);
+ }
+ for (int iter = 0; iter < niter; ++iter) {
+ printf("test-grad0: iter:%d/%d\n", iter, niter);
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ get_random_dims(ne, 4);
+
+ struct ggml_tensor * x[MAX_NARGS];
+
+ // add
+ {
+ const int nargs = 2;
+
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
+
+ check_gradient("add", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
+ }
+ }
+
+ // sub
+ {
+ const int nargs = 2;
+
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
+
+ check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+ }
+ }
+
+ // mul
+ {
+ const int nargs = 2;
+
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
+
+ check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // div
+ {
+ const int nargs = 2;
+
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, 0.5f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
+
+ check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
+ }
+ }
+
+ // sqr
+ {
+ const int nargs = 1;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
+
+ check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // sqrt
+ {
+ const int nargs = 1;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
+
+ check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
+ }
+ }
+
+ // log
+ {
+ const int nargs = 1;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
+
+ check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
+ }
+ }
+
+ // sum
+ {
+ const int nargs = 1;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
+
+ check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+ }
+ }
+
+
+ // sum_rows
+ {
+ const int nargs = 1;
+
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
+
+ check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+ }
+ }
+
+ // repeat
+ {
+ int64_t ne2[4];
+ get_random_dims(ne2, 4);
+
+ ne2[0] = ne[0] * ne2[0];
+ ne2[1] = ne[1] * ne2[1];
+ ne2[2] = 1;
+ ne2[3] = 1;
+
+ const int nargs = 1;
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
+
+ check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+ }
+
+ }
+
+ // abs (finite differences do not work)
+ //{
+ // const int nargs = 1;
+
+ // for (int ndims = 1; ndims <= 2; ++ndims) {
+ // for (int i = 0; i < nargs; ++i) {
+ // x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ // ggml_set_param(ctx0, x[i]);
+ // }
+
+ // struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
+
+ // check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
+ // }
+ //}
+
+ // mul_mat
+ {
+ const int nargs = 2;
+
+ for (int ndims = 2; ndims <= 2; ++ndims) {
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ {
+ int64_t ne2[4];
+ get_random_dims(ne2, 4);
+ ne2[0] = ne[0];
+ x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+ }
+
+ ggml_set_param(ctx0, x[0]);
+ ggml_set_param(ctx0, x[1]);
+
+ struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
+ struct ggml_tensor * f = ggml_sum(ctx0, m);
+
+ GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
+
+ check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ check_mat_mul(m, x[1], x[0]);
+ }
+ }
+
+ // silu
+ {
+ const int nargs = 1;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
+
+#ifdef GGML_SILU_FP16
+ // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
+ check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
+#else
+ check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+#endif
+ }
+ }
+
+ // rms_norm
+ {
+ const int nargs = 1;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0]));
+
+ check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
+ }
+ }
+
+ // scale
+ {
+ const int nargs = 2;
+
+ int64_t ne2[4];
+ ne2[0] = 1;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+
+ ggml_set_param(ctx0, x[0]);
+ ggml_set_param(ctx0, x[1]);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1]));
+
+ check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // cpy
+ {
+ const int nargs = 2;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ for (int i = 0; i < nargs; ++i) {
+ x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[i]);
+ }
+ // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
+
+ check_gradient("cpy", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // reshape (1d->nd)
+ {
+ const int nargs = 1;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ int64_t ne2[4];
+ ne2[0] = 1;
+ ne2[1] = 1;
+ ne2[2] = 1;
+ ne2[3] = 1;
+ for (int i = 0; i < ndims; ++i) {
+ ne2[0] *= ne[i];
+ }
+ x[0] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+ x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
+ check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // reshape (nd->1d)
+ {
+ const int nargs = 1;
+
+ for (int ndims = 1; ndims <= 2; ++ndims) {
+ int64_t ne2[4];
+ ne2[0] = 1;
+ ne2[1] = 1;
+ ne2[2] = 1;
+ ne2[3] = 1;
+ for (int i = 0; i < ndims; ++i) {
+ ne2[0] *= ne[i];
+ }
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
+ check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // acc 1d
+ {
+ int64_t ne2[4] = { 1, 1, 1, 1 };
+
+ const int nargs = 2;
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ get_random_dims(ne2, 1);
+ while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
+ get_random_dims(ne2, 1);
+ }
+
+ x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[1]);
+
+ const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
+ const int offset = irand(max_offset) * ggml_element_size(x[0]);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
+
+ check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // acc 2d
+ {
+ int64_t ne2[4] = { 1, 1, 1, 1 };
+ int64_t max_offsets[4] = { 0, 0, 0, 0 };
+ int64_t offsets[4] = { 0, 0, 0, 0 };
+
+ const int nargs = 2;
+ for (int ndims = 2; ndims <= 4; ++ndims) {
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ get_random_dims(ne2, 2);
+ while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
+ get_random_dims(ne2, 2);
+ }
+
+ x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[1]);
+
+ max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
+ max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
+ offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
+ offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
+ const int offset = offsets[0] + offsets[1];
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
+
+ check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // acc 3d
+ {
+ int64_t ne2[4] = { 1, 1, 1, 1 };
+ int64_t max_offsets[4] = { 0, 0, 0, 0 };
+ int64_t offsets[4] = { 0, 0, 0, 0 };
+
+ const int nargs = 2;
+ for (int ndims = 3; ndims <= 4; ++ndims) {
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ get_random_dims(ne2, 3);
+ while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
+ get_random_dims(ne2, 3);
+ }
+
+ x[1] = get_random_tensor(ctx0, 3, ne2, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[1]);
+
+ max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
+ max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
+ max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
+ offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
+ offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
+ offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
+ const int offset = offsets[0] + offsets[1] + offsets[2];
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
+
+ check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // acc 4d
+ {
+ int64_t ne2[4] = { 1, 1, 1, 1 };
+ int64_t max_offsets[4] = { 0, 0, 0, 0 };
+ int64_t offsets[4] = { 0, 0, 0, 0 };
+
+ const int nargs = 2;
+ for (int ndims = 4; ndims <= 4; ++ndims) {
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ get_random_dims(ne2, 4);
+ while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
+ get_random_dims(ne2, 4);
+ }
+
+ x[1] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[1]);
+
+ max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
+ max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
+ max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
+ max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
+ offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
+ offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
+ offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
+ offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
+ const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
+
+ check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // set_1d
+ {
+ int64_t ne2[4];
+
+ const int nargs = 2;
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ get_random_dims(ne2, 1);
+ while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
+ get_random_dims(ne2, 1);
+ }
+
+ x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[1]);
+
+ const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
+ const int offset = irand(max_offset) * ggml_element_size(x[0]);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
+
+ check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // set_2d
+ {
+ int64_t ne2[4];
+ int64_t max_offsets[4] = { 0, 0, 0, 0 };
+ int64_t offsets[4] = { 0, 0, 0, 0 };
+
+ const int nargs = 1;
+ for (int ndims = 2; ndims <= 4; ++ndims) {
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ get_random_dims(ne2, 2);
+ while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
+ get_random_dims(ne2, 2);
+ }
+
+ x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[1]);
+
+ max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
+ max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
+ offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
+ offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
+ const int offset = offsets[0] + offsets[1];
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
+
+ check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // view_1d
+ {
+ const int nargs = 1;
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+
+ ggml_set_param(ctx0, x[0]);
+
+ const int k0 = irand(ggml_nelements(x[0]));
+ const int k1 = irand(ggml_nelements(x[0]));
+ const int i0 = MIN(k0, k1);
+ const int i1 = MAX(k0, k1);
+
+ const int offset = i0 * sizeof(float);
+ const int nelem = i1 - i0;
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
+
+ check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // view_2d
+ {
+ int64_t ne2[4];
+ int64_t nb2[4];
+
+ const int nargs = 1;
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+
+ get_random_dims(ne2, 2);
+ while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
+ get_random_dims(ne2, 2);
+ }
+ const int count = ne2[0]*ne2[1];
+
+ nb2[0] = sizeof(float);
+ nb2[1] = nb2[0]*ne2[0];
+
+ ggml_set_param(ctx0, x[0]);
+
+ const int max_offset = ggml_nelements(x[0]) - count;
+ const int offset = irand(max_offset+1) * sizeof(float);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
+
+ check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // view_3d
+ {
+ int64_t ne2[4] = {1,1,1,1};
+ int64_t nb2[4] = {0,0,0,0};
+
+ const int nargs = 1;
+ for (int ndims = 1; ndims <= 4; ++ndims) {
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+
+ get_random_dims(ne2, 3);
+ while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
+ get_random_dims(ne2, 3);
+ }
+ const int count = ne2[0]*ne2[1]*ne2[2];
+
+ nb2[0] = sizeof(float);
+ nb2[1] = nb2[0]*ne2[0];
+ nb2[2] = nb2[1]*ne2[1];
+
+ ggml_set_param(ctx0, x[0]);
+
+ const int max_offset = ggml_nelements(x[0]) - count;
+ const int offset = irand(max_offset+1) * sizeof(float);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
+
+ check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // permute
+ {
+ int64_t ne2[4];
+
+ const int nargs = 1;
+ for (int ndims = 1; ndims <= 4; ++ndims)
+ {
+ // ggml_permute will set axes of dimensions below n_dims to 1.
+ // to make ggml_permute work correctly on all axes,
+ // the input tensor needs the maximal n_dims of 4.
+ for (int i=0; i<ndims; ++i) {
+ ne2[i] = ne[i];
+ }
+ for (int i=ndims; i<4; ++i) {
+ ne2[i] = 1;
+ }
+ x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+
+ ggml_set_param(ctx0, x[0]);
+
+ const int p = irand(NUM_PERMUTATIONS);
+ const int ax0 = all_permutations[p*4+0];
+ const int ax1 = all_permutations[p*4+1];
+ const int ax2 = all_permutations[p*4+2];
+ const int ax3 = all_permutations[p*4+3];
+
+ // sum requires contiguous tensor rows
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
+
+ check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // transpose
+ {
+ int64_t ne2[4];
+
+ const int nargs = 1;
+ for (int ndims = 1; ndims <= 4; ++ndims)
+ {
+ // ggml_transpose will set axes of dimensions below n_dims to 1.
+ // to make ggml_transpose work correctly on all axes,
+ // the input tensor needs the maximal n_dims of 4.
+ for (int i=0; i<ndims; ++i) {
+ ne2[i] = ne[i];
+ }
+ for (int i=ndims; i<4; ++i) {
+ ne2[i] = 1;
+ }
+ x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+
+ ggml_set_param(ctx0, x[0]);
+
+ // sum requires contiguous tensor rows
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
+
+ check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // get_rows
+ {
+ int64_t ne2[4] = {ne[0], ne[1], 1, 1};
+ int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
+ const int nargs = 1;
+ const int ndims = 2;
+ x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+ x[1] = get_random_tensor_int(ctx0, 1, ne3, 0, ne2[1]);
+
+ ggml_set_param(ctx0, x[0]);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
+
+ check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+
+ // diag_mask_inf
+ {
+ const int nargs = 1;
+ const int ndims = 2;
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ int n_past = irand(ne[0]);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
+
+ check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+
+ // diag_mask_zero
+ {
+ const int nargs = 1;
+ const int ndims = 2;
+
+ x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ int n_past = irand(ne[0]);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
+
+ check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+
+ // softmax
+ {
+ const int nargs = 1;
+
+ int64_t ne2[4];
+ get_random_dims(ne2, 4);
+
+ for (int ndims = 1; ndims <= 3; ++ndims) {
+ x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+ ggml_set_param(ctx0, x[0]);
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
+
+ check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+ }
+ }
+
+ // rope
+ {
+ const int nargs = 1;
+
+ int64_t ne2[4];
+ get_random_dims(ne2, 4);
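+ // round ne2[0] up to an even value: RoPE rotates pairs of elements, so n_rot must be even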
+ ne2[0] += ne2[0] % 2;
+ int n_rot = ne2[0];
+
+ for (int ndims = 3; ndims <= 4; ++ndims) {
+ for (int mode = 0; mode < 4; ++mode) {
+ for (int n_past = 1; n_past < ne2[2]; ++n_past) {
+ x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+
+ ggml_set_param(ctx0, x[0]);
+
+ const bool skip_past = (mode & 1);
+ if (skip_past) {
+ // we have no past, so this would have to work on uninitialized memory.
+ // we only test the gradients here;
+ // skip_past should have no influence on gradient computation.
+ // so when other modes work, we assume that this does as well.
+ continue;
+ }
+
+ struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode));
+
+ GGML_PRINT_DEBUG("rope: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
+ check_gradient("rope", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
+ }
+ }
+ }
+ }
+
+ ggml_free(ctx0);
+ }
+
+ return 0;
+}
diff --git a/tests/test-opt.c b/tests/test-opt.c
new file mode 100644
index 0000000..d001615
--- /dev/null
+++ b/tests/test-opt.c
@@ -0,0 +1,205 @@
+#include "ggml.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define MAX_NARGS 2
+
+
+//
+// logging
+//
+#define GGML_DEBUG 0
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+
+float frand(void) {
+ return (float)rand()/(float)RAND_MAX;
+}
+
+int irand(int n) {
+ return rand()%n;
+}
+
+void get_random_dims(int64_t * dims, int ndims) {
+ dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+ for (int i = 0; i < ndims; i++) {
+ dims[i] = 1 + irand(4);
+ }
+}
+
+void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
+ dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+ for (int i = 0; i < ndims; i++) {
+ dims[i] = min + irand(max-min);
+ }
+}
+
+
+struct ggml_tensor * get_random_tensor(
+ struct ggml_context * ctx0,
+ int ndims,
+ int64_t ne[],
+ float fmin,
+ float fmax) {
+ struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+ switch (ndims) {
+ case 1:
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+ }
+ break;
+ case 2:
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+ }
+ }
+ break;
+ case 3:
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+ }
+ }
+ }
+ break;
+ case 4:
+ for (int i3 = 0; i3 < ne[3]; i3++) {
+ for (int i2 = 0; i2 < ne[2]; i2++) {
+ for (int i1 = 0; i1 < ne[1]; i1++) {
+ for (int i0 = 0; i0 < ne[0]; i0++) {
+ ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+ }
+ }
+ }
+ }
+ break;
+ default:
+ assert(false);
+ };
+
+ return result;
+}
+
+float get_element(const struct ggml_tensor * t, int idx) {
+ return ((float *)t->data)[idx];
+}
+
+void set_element(struct ggml_tensor * t, int idx, float value) {
+ ((float *)t->data)[idx] = value;
+}
+
+int main(int argc, const char ** argv) {
+ struct ggml_init_params params = {
+ .mem_size = 1024*1024*1024,
+ .mem_buffer = NULL,
+ .no_alloc = false,
+ };
+ struct ggml_context * ctx = ggml_init(params);
+
+ int64_t ne1[4] = {4, 1024, 1, 1};
+ int64_t ne2[4] = {4, 2048, 1, 1};
+ int64_t ne3[4] = {1024, 2048, 1, 1};
+
+ struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
+ struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
+ ggml_set_param(ctx, a);
+ ggml_set_param(ctx, b);
+
+ struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);
+
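+ // least-squares objective (editor's comment): e = sum((c - mul_mat(a, b))^2),
+ // minimized below with the default ADAM optimizer settings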
+ struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
+ struct ggml_tensor * d = ggml_sub(ctx, c, ab);
+ struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d));
+
+
+ struct ggml_cgraph ge = ggml_build_forward(e);
+ ggml_graph_reset (&ge);
+ ggml_graph_compute(ctx, &ge);
+ const float fe = ggml_get_f32_1d(e, 0);
+ printf("%s: e = %.4f\n", __func__, fe);
+
+ struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
+
+ ggml_opt(ctx, opt_params, e);
+
+ ggml_graph_reset (&ge);
+ ggml_graph_compute(ctx, &ge);
+ const float fe_opt = ggml_get_f32_1d(e, 0);
+ printf("%s: original e = %.4f\n", __func__, fe);
+ printf("%s: optimized e = %.4f\n", __func__, fe_opt);
+
+ const bool success = (fe_opt <= fe);
+ assert(success);
+
+ ggml_free(ctx);
+ return success ? 0 : -1;
+}
+// int64_t ne1[4] = {4, 128, 1, 1};
+// int64_t ne2[4] = {4, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original e = 25890.9375
+// main: optimized e = 10094.7031
+
+// int64_t ne1[4] = {8, 128, 1, 1};
+// int64_t ne2[4] = {8, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original e = 39429.5078
+// main: optimized e = 9275.8936
+
+// int64_t ne1[4] = {16, 128, 1, 1};
+// int64_t ne2[4] = {16, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original e = 68371.1328
+// main: optimized e = 7854.4502
+
+
+// int64_t ne1[4] = {32, 128, 1, 1};
+// int64_t ne2[4] = {32, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original e = 126061.1953
+// main: optimized e = 5451.0166
+
+// int64_t ne1[4] = {4, 1024, 1, 1};
+// int64_t ne2[4] = {4, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original e = 1620817.8750
+// main: optimized e = 698387.6875
+
+// another run on M1
+// int64_t ne1[4] = {4, 1024, 1, 1};
+// int64_t ne2[4] = {4, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original e = 1629595.6250
+// main: optimized e = 698169.1250
+
+// int64_t ne1[4] = {32, 1024, 1, 1};
+// int64_t ne2[4] = {32, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original e = 8146770.5000
+// main: optimized e = 651119.1250