From d01bccde9f759b24449fdaa16306b406a50eb367 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 18 Jul 2023 14:24:43 +0300
Subject: ci : integrate with ggml-org/ci (#2250)

* ci : run ctest

ggml-ci

* ci : add open llama 3B-v2 tests

ggml-ci

* ci : disable wget progress output

ggml-ci

* ci : add open llama 3B-v2 tg tests for q4 and q5 quantizations

ggml-ci

* tests : try to fix tail free sampling test

ggml-ci

* ci : add K-quants

ggml-ci

* ci : add short perplexity tests

ggml-ci

* ci : add README.md

* ppl : add --chunks argument to limit max number of chunks

ggml-ci

* ci : update README
---
 tests/test-sampling.cpp | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tests')

diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 64f9455..4437c39 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -200,4 +200,6 @@ int main(void) {
     test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
 
     printf("OK\n");
+
+    return 0;
 }
-- 
cgit v1.2.3


From b1f429095328a34556c0e9a7a2fefced3db3368c Mon Sep 17 00:00:00 2001
From: wzy <32936898+Freed-Wu@users.noreply.github.com>
Date: Wed, 19 Jul 2023 15:01:11 +0800
Subject: cmake : install targets (#2256)

fix #2252
---
 CMakeLists.txt                                  | 25 +++++++++++++++++++++++++
 convert-lora-to-ggml.py                         |  1 +
 convert.py                                      |  1 +
 examples/baby-llama/CMakeLists.txt              |  1 +
 examples/benchmark/CMakeLists.txt               |  1 +
 examples/embd-input/CMakeLists.txt              |  2 ++
 examples/embedding/CMakeLists.txt               |  1 +
 examples/main/CMakeLists.txt                    |  1 +
 examples/metal/CMakeLists.txt                   |  1 +
 examples/perplexity/CMakeLists.txt              |  1 +
 examples/quantize-stats/CMakeLists.txt          |  1 +
 examples/quantize/CMakeLists.txt                |  1 +
 examples/save-load-state/CMakeLists.txt         |  1 +
 examples/server/CMakeLists.txt                  |  1 +
 examples/simple/CMakeLists.txt                  |  1 +
 examples/train-text-from-scratch/CMakeLists.txt |  1 +
 tests/CMakeLists.txt                            |  1 +
 17 files changed, 42 insertions(+)
 mode change 100644 => 100755 convert-lora-to-ggml.py
 mode change 100644 => 100755 convert.py

(limited to 'tests')

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9381da..abc9681 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -512,6 +512,7 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
     add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
     target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+    install(TARGETS ggml_shared LIBRARY)
 endif()
 
 add_library(llama
@@ -533,8 +534,32 @@ if (BUILD_SHARED_LIBS)
     if (LLAMA_METAL)
         set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
     endif()
+    install(TARGETS llama LIBRARY)
 endif()
 
+include(GNUInstallDirs)
+install(
+    FILES convert.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(
+    FILES convert-lora-to-ggml.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
 
 #
 # programs, examples and tests
diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
old mode 100644
new mode 100755
index f43c836..b4999ff
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import json
 import os
 import re
diff --git a/convert.py b/convert.py
old mode 100644
new mode 100755
index 7a2705e..e3f1096
--- a/convert.py
+++ b/convert.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import argparse
 import concurrent.futures
 import copy
diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt
index d2ce363..7b70227 100644
--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
index 0376961..3f34153 100644
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt
index 2b62395..5bbb1ea 100644
--- a/examples/embd-input/CMakeLists.txt
+++ b/examples/embd-input/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET embdinput)
 add_library(${TARGET} embd-input-lib.cpp embd-input.h)
+install(TARGETS ${TARGET} LIBRARY)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
@@ -8,6 +9,7 @@ endif()
 
 set(TARGET embd-input-test)
 add_executable(${TARGET} embd-input-test.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt
index db73b6b..0c752c7 100644
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET embedding)
 add_executable(${TARGET} embedding.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index c364242..cc18889 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET main)
 add_executable(${TARGET} main.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/metal/CMakeLists.txt b/examples/metal/CMakeLists.txt
index a8c4284..f16d491 100644
--- a/examples/metal/CMakeLists.txt
+++ b/examples/metal/CMakeLists.txt
@@ -1,3 +1,4 @@
 set(TEST_TARGET metal)
 add_executable(${TEST_TARGET} metal.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt
index 61b17b8..af00b4e 100644
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET perplexity)
 add_executable(${TARGET} perplexity.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
index 7bebc11..c5c3940 100644
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index 475fc8b..47d0be7 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
index 08dbe5c..eadd13c 100644
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 07ba76a..812a24b 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -2,6 +2,7 @@ set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp json.hpp httplib.h)
+install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
index 1568f73..0ac9cb0 100644
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET simple)
 add_executable(${TARGET} simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt
index 1a44c49..4459516 100644
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ b/examples/train-text-from-scratch/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET train-text-from-scratch)
 add_executable(${TARGET} train-text-from-scratch.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 1acf050..11ec6c7 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,6 +1,7 @@
 function(llama_add_test source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
+    install(TARGETS ${TEST_TARGET} RUNTIME)
     target_link_libraries(${TEST_TARGET} PRIVATE llama)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
-- 
cgit v1.2.3


From 5b2b2dc6ae8086bff7c9b3c17fb435cf319b7185 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 24 Jul 2023 14:46:21 +0300
Subject: ggml : sync (unary ops refactor, static-correctness) (#2370)

* ggml : sync (unary ops, tests)

ggml-ci

* tests : remove unnecessary funcs
---
 ggml-cuda.cu       |  29 +-
 ggml-metal.m       |  96 ++++---
 ggml.c             | 775 ++++++++++++++++++++++++-----------------------------
 ggml.h             |  60 ++++-
 tests/test-grad0.c | 479 +++++++++++++++++++++++++++------
 tests/test-opt.c   |   6 +-
 6 files changed, 870 insertions(+), 575 deletions(-)

(limited to 'tests')

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 6823adc..b8c9835 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -3962,18 +3962,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
-        case GGML_OP_GELU:
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cuda_gelu;
-            break;
-        case GGML_OP_SILU:
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cuda_silu;
-            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_silu;
+                    break;
+                default:
+                    return false;
+            } break;
         case GGML_OP_NORM:
             if (!any_on_device) {
                 return false;
diff --git a/ggml-metal.m b/ggml-metal.m
index bf3f68f..1fd6e85 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -519,48 +519,56 @@ void ggml_metal_graph_compute(
 
                             [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                         } break;
-                    case GGML_OP_SILU:
-                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
-                            [encoder setComputePipelineState:ctx->pipeline_silu];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                            const int64_t n = ggml_nelements(dst);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_RELU:
-                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
-                            [encoder setComputePipelineState:ctx->pipeline_relu];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                            const int64_t n = ggml_nelements(dst);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    case GGML_OP_UNARY:
+                        switch (ggml_get_unary_op(gf->nodes[i])) {
+                            case GGML_UNARY_OP_SILU:
+                                {
+                                    if (encoder == nil) {
+                                        encoder = [command_buffer computeCommandEncoder];
+                                    }
+
+                                    [encoder setComputePipelineState:ctx->pipeline_silu];
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                                    const int64_t n = ggml_nelements(dst);
+
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                } break;
+                            case GGML_UNARY_OP_RELU:
+                                {
+                                    if (encoder == nil) {
+                                        encoder = [command_buffer computeCommandEncoder];
+                                    }
+
+                                    [encoder setComputePipelineState:ctx->pipeline_relu];
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                                    const int64_t n = ggml_nelements(dst);
+
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                } break;
+                            case GGML_UNARY_OP_GELU:
+                                {
+                                    if (encoder == nil) {
+                                        encoder = [command_buffer computeCommandEncoder];
+                                    }
+
+                                    [encoder setComputePipelineState:ctx->pipeline_gelu];
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                                    const int64_t n = ggml_nelements(dst);
+
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                } break;
+                            default:
+                                {
+                                    fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                                    GGML_ASSERT(false);
+                                }
                         } break;
-                    case GGML_OP_GELU:
-                    {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
-                            [encoder setComputePipelineState:ctx->pipeline_gelu];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                            const int64_t n = ggml_nelements(dst);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
                     case GGML_OP_SOFT_MAX:
                         {
                             if (encoder == nil) {
@@ -979,8 +987,10 @@ void ggml_metal_graph_compute(
                             [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                         } break;
                     default:
-                        fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                        GGML_ASSERT(false);
+                        {
+                            fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                            GGML_ASSERT(false);
+                        }
                 }
             }
 
diff --git a/ggml.c b/ggml.c
index 9ee4a8d..960b805 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3440,7 +3440,9 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
-#if defined(GGML_SIMD)
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmul(y, 1, &v, y, 1, n);
+#elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F32_STEP - 1));
 
     GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
@@ -3603,7 +3605,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #endif
 }
 
-inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
+inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
     ggml_float sum = 0.0;
     for (int i = 0; i < n; ++i) {
         sum += (ggml_float)x[i];
@@ -3611,6 +3613,14 @@ inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x
     *s = sum;
 }
 
+inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_FP16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
 inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     float max = -INFINITY;
@@ -3750,16 +3760,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ARGMAX",
     "REPEAT",
     "REPEAT_BACK",
-    "ABS",
-    "SGN",
-    "NEG",
-    "STEP",
-    "TANH",
-    "ELU",
-    "RELU",
-    "GELU",
-    "GELU_QUICK",
-    "SILU",
     "SILU_BACK",
     "NORM",
     "RMS_NORM",
@@ -3798,6 +3798,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "WIN_PART",
     "WIN_UNPART",
 
+    "UNARY",
+
     "MAP_UNARY",
     "MAP_BINARY",
 
@@ -3809,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3830,16 +3832,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "argmax(x)",
     "repeat(x)",
     "repeat_back(x)",
-    "abs(x)",
-    "sgn(x)",
-    "-x",
-    "step(x)",
-    "tanh(x)",
-    "elu(x)",
-    "relu(x)",
-    "gelu(x)",
-    "gelu_quick(x)",
-    "silu(x)",
     "silu_back(x)",
     "norm(x)",
     "rms_norm(x)",
@@ -3878,6 +3870,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "win_part(x)",
     "win_unpart(x)",
 
+    "unary(x)",
+
     "f(x)",
     "f(x,y)",
 
@@ -3889,7 +3883,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4145,6 +4139,10 @@ const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }
 
+const char * ggml_op_symbol(enum ggml_op op) {
+    return GGML_OP_SYMBOL[op];
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
 }
@@ -4443,6 +4441,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+bool ggml_get_no_alloc(struct ggml_context * ctx) {
+    return ctx->no_alloc;
+}
+
 void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
     ctx->no_alloc = no_alloc;
 }
@@ -4480,7 +4482,7 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
 // this is an error prone process, but it is necessary to support inplace
 // operators when using scratch buffers
 // TODO: implement a better way
-void ggml_scratch_save(struct ggml_context * ctx) {
+static void ggml_scratch_save(struct ggml_context * ctx) {
     // this is needed to allow opt tensors to store their data
     // TODO: again, need to find a better way
     ctx->no_alloc_save = ctx->no_alloc;
@@ -4490,7 +4492,7 @@ void ggml_scratch_save(struct ggml_context * ctx) {
     ctx->scratch.data = NULL;
 }
 
-void ggml_scratch_load(struct ggml_context * ctx) {
+static void ggml_scratch_load(struct ggml_context * ctx) {
     ctx->no_alloc = ctx->no_alloc_save;
 
     ctx->scratch = ctx->scratch_save;
@@ -4498,7 +4500,7 @@ void ggml_scratch_load(struct ggml_context * ctx) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-struct ggml_tensor * ggml_new_tensor_impl(
+static struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
@@ -4621,6 +4623,21 @@ struct ggml_tensor * ggml_new_tensor_impl(
     return result;
 }
 
+static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    assert(params_size <= GGML_MAX_OP_PARAMS);
+    memcpy(tensor->op_params, params, params_size);
+}
+
+static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    return ((const int32_t *)(tensor->op_params))[i];
+}
+
+static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    ((int32_t *)(tensor->op_params))[i] = value;
+}
+
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
@@ -4952,6 +4969,16 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }
 
+enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->op == GGML_OP_UNARY);
+    return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
+}
+
+static void ggml_set_unary_op(struct ggml_tensor * tensor, enum ggml_unary_op op) {
+    GGML_ASSERT(tensor->op = GGML_OP_UNARY);
+    ggml_set_op_params_i32(tensor, 0, (int32_t) op);
+}
+
 const char * ggml_get_name(const struct ggml_tensor * tensor) {
     return tensor->name;
 }
@@ -4970,11 +4997,6 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
     return tensor;
 }
 
-static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
-    assert(params_size <= GGML_MAX_OP_PARAMS);
-    memcpy(tensor->op_params, params, params_size);
-}
-
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
@@ -5010,7 +5032,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
 
 // ggml_dup
 
-struct ggml_tensor * ggml_dup_impl(
+static struct ggml_tensor * ggml_dup_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         bool inplace) {
@@ -5043,7 +5065,7 @@ struct ggml_tensor * ggml_dup_inplace(
 
 // ggml_add
 
-struct ggml_tensor * ggml_add_impl(
+static struct ggml_tensor * ggml_add_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
@@ -5086,7 +5108,7 @@ struct ggml_tensor * ggml_add_inplace(
 
 // ggml_add1
 
-struct ggml_tensor * ggml_add1_impl(
+static struct ggml_tensor * ggml_add1_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
@@ -5126,7 +5148,7 @@ struct ggml_tensor * ggml_add1_inplace(
 
 // ggml_acc
 
-struct ggml_tensor * ggml_acc_impl(
+static struct ggml_tensor * ggml_acc_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
@@ -5183,7 +5205,7 @@ struct ggml_tensor * ggml_acc_inplace(
 
 // ggml_sub
 
-struct ggml_tensor * ggml_sub_impl(
+static struct ggml_tensor * ggml_sub_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
@@ -5222,7 +5244,7 @@ struct ggml_tensor * ggml_sub_inplace(
 
 // ggml_mul
 
-struct ggml_tensor * ggml_mul_impl(
+static struct ggml_tensor * ggml_mul_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
@@ -5269,7 +5291,7 @@ struct ggml_tensor * ggml_mul_inplace(
 
 // ggml_div
 
-struct ggml_tensor * ggml_div_impl(
+static struct ggml_tensor * ggml_div_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
@@ -5312,7 +5334,7 @@ struct ggml_tensor * ggml_div_inplace(
 
 // ggml_sqr
 
-struct ggml_tensor * ggml_sqr_impl(
+static struct ggml_tensor * ggml_sqr_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         bool inplace) {
@@ -5345,7 +5367,7 @@ struct ggml_tensor * ggml_sqr_inplace(
 
 // ggml_sqrt
 
-struct ggml_tensor * ggml_sqrt_impl(
+static struct ggml_tensor * ggml_sqrt_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         bool inplace) {
@@ -5379,7 +5401,7 @@ struct ggml_tensor * ggml_sqrt_inplace(
 
 // ggml_log
 
-struct ggml_tensor * ggml_log_impl(
+static struct ggml_tensor * ggml_log_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         bool inplace) {
@@ -5559,333 +5581,142 @@ struct ggml_tensor * ggml_repeat_back(
 
 // ggml_abs
 
-struct ggml_tensor * ggml_abs_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_ABS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_abs(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_abs_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
 }
 
 struct ggml_tensor * ggml_abs_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_abs_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
 }
 
-
 // ggml_sgn
 
-struct ggml_tensor * ggml_sgn_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SGN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_sgn(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_sgn_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
 }
 
 struct ggml_tensor * ggml_sgn_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_sgn_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
 }
 
 // ggml_neg
 
-struct ggml_tensor * ggml_neg_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_NEG;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_neg(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_neg_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
 }
 
 struct ggml_tensor * ggml_neg_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_neg_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
 }
 
 // ggml_step
 
-struct ggml_tensor * ggml_step_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_STEP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_step(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_step_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
 }
 
 struct ggml_tensor * ggml_step_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_step_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
 }
 
 // ggml_tanh
 
-struct ggml_tensor * ggml_tanh_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_TANH;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_tanh(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_tanh_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
 }
 
 struct ggml_tensor * ggml_tanh_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_tanh_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
 }
 
 // ggml_elu
 
-struct ggml_tensor * ggml_elu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_ELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_elu(
     struct ggml_context * ctx,
     struct ggml_tensor  * a) {
-    return ggml_elu_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
 }
 
 struct ggml_tensor * ggml_elu_inplace(
     struct ggml_context * ctx,
     struct ggml_tensor  * a) {
-    return ggml_elu_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
 }
 
 // ggml_relu
 
-struct ggml_tensor * ggml_relu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_RELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_relu(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_relu_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
 }
 
 struct ggml_tensor * ggml_relu_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_relu_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
 }
 
 // ggml_gelu
 
-struct ggml_tensor * ggml_gelu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_GELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_gelu(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_gelu_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
 }
 
 struct ggml_tensor * ggml_gelu_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_gelu_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
 }
 
 // ggml_gelu_quick
 
-struct ggml_tensor * ggml_gelu_quick_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_GELU_QUICK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_gelu_quick(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_gelu_quick_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
 }
 
 struct ggml_tensor * ggml_gelu_quick_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_gelu_quick_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
 }
 
 // ggml_silu
 
-struct ggml_tensor * ggml_silu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SILU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 struct ggml_tensor * ggml_silu(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_silu_impl(ctx, a, false);
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
 }
 
 struct ggml_tensor * ggml_silu_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_silu_impl(ctx, a, true);
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
 }
 
 // ggml_silu_back
@@ -5913,7 +5744,7 @@ struct ggml_tensor * ggml_silu_back(
 
 // ggml_norm
 
-struct ggml_tensor * ggml_norm_impl(
+static struct ggml_tensor * ggml_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         bool inplace) {
@@ -5947,7 +5778,7 @@ struct ggml_tensor * ggml_norm_inplace(
     return ggml_norm_impl(ctx, a, true);
 }
 
-struct ggml_tensor * ggml_rms_norm_impl(
+static struct ggml_tensor * ggml_rms_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         bool inplace) {
@@ -6056,7 +5887,7 @@ struct ggml_tensor * ggml_out_prod(
 
 // ggml_scale
 
-struct ggml_tensor * ggml_scale_impl(
+static struct ggml_tensor * ggml_scale_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
@@ -6096,7 +5927,7 @@ struct ggml_tensor * ggml_scale_inplace(
 
 // ggml_set
 
-struct ggml_tensor * ggml_set_impl(
+static struct ggml_tensor * ggml_set_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
@@ -6186,7 +6017,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
 
 // ggml_cpy
 
-struct ggml_tensor * ggml_cpy_impl(
+static struct ggml_tensor * ggml_cpy_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
@@ -6231,7 +6062,7 @@ struct ggml_tensor * ggml_cpy_inplace(
 
 // ggml_cont
 
-struct ggml_tensor * ggml_cont_impl(
+static struct ggml_tensor * ggml_cont_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         bool inplace) {
@@ -6701,7 +6532,7 @@ struct ggml_tensor * ggml_diag(
 
 // ggml_diag_mask_inf
 
-struct ggml_tensor * ggml_diag_mask_inf_impl(
+static struct ggml_tensor * ggml_diag_mask_inf_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int                   n_past,
@@ -6741,7 +6572,7 @@ struct ggml_tensor * ggml_diag_mask_inf_inplace(
 
 // ggml_diag_mask_zero
 
-struct ggml_tensor * ggml_diag_mask_zero_impl(
+static struct ggml_tensor * ggml_diag_mask_zero_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int                   n_past,
@@ -6780,7 +6611,7 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
 
 // ggml_soft_max
 
-struct ggml_tensor * ggml_soft_max_impl(
+static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         bool                  inplace) {
@@ -6814,7 +6645,7 @@ struct ggml_tensor * ggml_soft_max_inplace(
 
 // ggml_soft_max_back
 
-struct ggml_tensor * ggml_soft_max_back_impl(
+static struct ggml_tensor * ggml_soft_max_back_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
@@ -6851,7 +6682,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
 
 // ggml_rope
 
-struct ggml_tensor * ggml_rope_impl(
+static struct ggml_tensor * ggml_rope_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int                   n_past,
@@ -7363,9 +7194,47 @@ struct ggml_tensor * ggml_win_unpart(
     return result;
 }
 
+// gmml_unary
+
+static struct ggml_tensor * ggml_unary_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_unary_op op,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_unary_op(result, op);
+
+    result->op   = GGML_OP_UNARY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_unary(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_unary_op op) {
+    return ggml_unary_impl(ctx, a, op, false);
+}
+
+struct ggml_tensor * ggml_unary_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_unary_op op) {
+    return ggml_unary_impl(ctx, a, op, true);
+}
+
 // ggml_map_unary
 
-struct ggml_tensor * ggml_map_unary_impl_f32(
+static struct ggml_tensor * ggml_map_unary_impl_f32(
         struct ggml_context        * ctx,
         struct ggml_tensor         * a,
         const  ggml_unary_op_f32_t fun,
@@ -7403,7 +7272,7 @@ struct ggml_tensor * ggml_map_unary_inplace_f32(
 
 // ggml_map_binary
 
-struct ggml_tensor * ggml_map_binary_impl_f32(
+static struct ggml_tensor * ggml_map_binary_impl_f32(
         struct ggml_context         * ctx,
         struct ggml_tensor          * a,
         struct ggml_tensor          * b,
@@ -7447,7 +7316,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
 
 // ggml_map_custom1
 
-struct ggml_tensor * ggml_map_custom1_impl_f32(
+static struct ggml_tensor * ggml_map_custom1_impl_f32(
         struct ggml_context          * ctx,
         struct ggml_tensor           * a,
         const  ggml_custom1_op_f32_t   fun,
@@ -7485,7 +7354,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
 
 // ggml_map_custom2
 
-struct ggml_tensor * ggml_map_custom2_impl_f32(
+static struct ggml_tensor * ggml_map_custom2_impl_f32(
         struct ggml_context          * ctx,
         struct ggml_tensor           * a,
         struct ggml_tensor           * b,
@@ -7527,7 +7396,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
 
 // ggml_map_custom3
 
-struct ggml_tensor * ggml_map_custom3_impl_f32(
+static struct ggml_tensor * ggml_map_custom3_impl_f32(
         struct ggml_context          * ctx,
         struct ggml_tensor           * a,
         struct ggml_tensor           * b,
@@ -9292,7 +9161,7 @@ static void ggml_compute_forward_sum_f32(
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_ggf(ne00,
+                ggml_vec_sum_f32_ggf(ne00,
                         &row_sum,
                         (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
                 sum += row_sum;
@@ -9302,6 +9171,38 @@ static void ggml_compute_forward_sum_f32(
     ((float *) dst->data)[0] = sum;
 }
 
+static void ggml_compute_forward_sum_f16(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+          struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_is_scalar(dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb);
+
+    float sum = 0;
+    float row_sum = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_f16_ggf(ne00,
+                    &row_sum,
+                    (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+                sum += row_sum;
+            }
+        }
+    }
+    ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
+}
+
 static void ggml_compute_forward_sum(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -9311,6 +9212,10 @@ static void ggml_compute_forward_sum(
             {
                 ggml_compute_forward_sum_f32(params, src0, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sum_f16(params, src0, dst);
+            } break;
         default:
             {
                 GGML_ASSERT(false);
@@ -10077,7 +9982,6 @@ static void ggml_compute_forward_silu(
     }
 }
 
-
 // ggml_compute_forward_silu_back
 
 static void ggml_compute_forward_silu_back_f32(
@@ -14122,6 +14026,62 @@ static void ggml_compute_forward_win_unpart(
     }
 }
 
+//gmml_compute_forward_unary
+
+static void ggml_compute_forward_unary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    const enum ggml_unary_op op = ggml_get_unary_op(dst);
+
+    switch (op) {
+        case GGML_UNARY_OP_ABS:
+            {
+                ggml_compute_forward_abs(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_SGN:
+            {
+                ggml_compute_forward_sgn(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_NEG:
+            {
+                ggml_compute_forward_neg(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_STEP:
+            {
+                ggml_compute_forward_step(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_TANH:
+            {
+                ggml_compute_forward_tanh(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_ELU:
+            {
+                ggml_compute_forward_elu(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_RELU:
+            {
+                ggml_compute_forward_relu(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_GELU:
+            {
+                ggml_compute_forward_gelu(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_GELU_QUICK:
+            {
+                ggml_compute_forward_gelu_quick(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_SILU:
+            {
+                ggml_compute_forward_silu(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_map_unary
 
 static void ggml_compute_forward_map_unary_f32(
@@ -14682,46 +14642,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
             } break;
-        case GGML_OP_ABS:
-            {
-                ggml_compute_forward_abs(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_SGN:
-            {
-                ggml_compute_forward_sgn(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_NEG:
-            {
-                ggml_compute_forward_neg(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_STEP:
-            {
-                ggml_compute_forward_step(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_TANH:
-            {
-                ggml_compute_forward_tanh(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_ELU:
-            {
-                ggml_compute_forward_elu(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_RELU:
-            {
-                ggml_compute_forward_relu(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_GELU:
-            {
-                ggml_compute_forward_gelu(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_GELU_QUICK:
-            {
-                ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_SILU:
-            {
-                ggml_compute_forward_silu(params, tensor->src[0], tensor);
-            } break;
         case GGML_OP_SILU_BACK:
             {
                 ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14864,6 +14784,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_UNARY:
+            {
+                ggml_compute_forward_unary(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_MAP_UNARY:
             {
                 ggml_unary_op_f32_t fun;
@@ -15112,73 +15036,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             inplace);
                 }
             } break;
-        case GGML_OP_ABS:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_impl(ctx,
-                                src0->grad,
-                                ggml_mul(ctx,
-                                    ggml_sgn(ctx, src0),
-                                    tensor->grad),
-                                inplace);
-                }
-            } break;
-        case GGML_OP_SGN:
-            {
-                if (src0->grad) {
-                    // noop
-                }
-            } break;
-        case GGML_OP_NEG:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
-                }
-            } break;
-        case GGML_OP_STEP:
-            {
-                if (src0->grad) {
-                    // noop
-                }
-            } break;
-        case GGML_OP_TANH:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_ELU:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_RELU:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_sub_impl(ctx,
-                            src0->grad,
-                            ggml_mul(ctx,
-                                ggml_step(ctx, src0),
-                                tensor->grad),
-                            inplace);
-                }
-            } break;
-        case GGML_OP_GELU:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_GELU_QUICK:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_SILU:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad = ggml_add_impl(ctx,
-                            src0->grad,
-                            ggml_silu_back(ctx, src0, tensor->grad),
-                            inplace);
-                }
-            } break;
         case GGML_OP_SILU_BACK:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15440,9 +15297,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                         inplace);
                 }
-                if (src1->grad) {
-                    // noop
-                }
             } break;
         case GGML_OP_DIAG_MASK_ZERO:
             {
@@ -15454,9 +15308,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                         inplace);
                 }
-                if (src1->grad) {
-                    // noop
-                }
             } break;
         case GGML_OP_SOFT_MAX:
             {
@@ -15491,9 +15342,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 n_ctx),
                             inplace);
                 }
-                if (src1->grad) {
-                    // noop
-                }
             } break;
         case GGML_OP_ROPE_BACK:
             {
@@ -15512,9 +15360,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 n_ctx),
                             inplace);
                 }
-                if (src1->grad) {
-                    // noop
-                }
             } break;
         case GGML_OP_ALIBI:
             {
@@ -15707,6 +15552,80 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_WIN_PART:
         case GGML_OP_WIN_UNPART:
+        case GGML_OP_UNARY:
+            {
+                switch (ggml_get_unary_op(tensor)) {
+                    case GGML_UNARY_OP_ABS:
+                        {
+                            if (src0->grad) {
+                                src0->grad =
+                                    ggml_add_impl(ctx,
+                                            src0->grad,
+                                            ggml_mul(ctx,
+                                                ggml_sgn(ctx, src0),
+                                                tensor->grad),
+                                            inplace);
+                            }
+                        } break;
+                    case GGML_UNARY_OP_SGN:
+                        {
+                            if (src0->grad) {
+                                // noop
+                            }
+                        } break;
+                    case GGML_UNARY_OP_NEG:
+                        {
+                            if (src0->grad) {
+                                src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
+                            }
+                        } break;
+                    case GGML_UNARY_OP_STEP:
+                        {
+                            if (src0->grad) {
+                                // noop
+                            }
+                        } break;
+                    case GGML_UNARY_OP_TANH:
+                        {
+                            GGML_ASSERT(false); // TODO: not implemented
+                        } break;
+                    case GGML_UNARY_OP_ELU:
+                        {
+                            GGML_ASSERT(false); // TODO: not implemented
+                        } break;
+                    case GGML_UNARY_OP_RELU:
+                        {
+                            if (src0->grad) {
+                                src0->grad = ggml_add_impl(ctx,
+                                        src0->grad,
+                                        ggml_mul(ctx,
+                                            ggml_step(ctx, src0),
+                                            tensor->grad),
+                                        inplace);
+                            }
+                        } break;
+                    case GGML_UNARY_OP_GELU:
+                        {
+                            GGML_ASSERT(false); // TODO: not implemented
+                        } break;
+                    case GGML_UNARY_OP_GELU_QUICK:
+                        {
+                            GGML_ASSERT(false); // TODO: not implemented
+                        } break;
+                    case GGML_UNARY_OP_SILU:
+                        {
+                            // necessary for llama
+                            if (src0->grad) {
+                                src0->grad = ggml_add_impl(ctx,
+                                        src0->grad,
+                                        ggml_silu_back(ctx, src0, tensor->grad),
+                                        inplace);
+                            }
+                        } break;
+                    default:
+                        GGML_ASSERT(false);
+                }
+            } break;
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
         case GGML_OP_MAP_CUSTOM1:
@@ -15937,7 +15856,7 @@ typedef pthread_t ggml_thread_t;
 
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__linux__) && !defined(__BIONIC__)
-void set_numa_thread_affinity(int thread_n, int n_threads) {
+static void set_numa_thread_affinity(int thread_n, int n_threads) {
     if (!ggml_is_numa()) {
         return;
     }
@@ -15962,7 +15881,7 @@ void set_numa_thread_affinity(int thread_n, int n_threads) {
     CPU_FREE(cpus);
 }
 
-void clear_numa_thread_affinity(void) {
+static void clear_numa_thread_affinity(void) {
     if (!ggml_is_numa()) {
         return;
     }
@@ -15986,8 +15905,8 @@ void clear_numa_thread_affinity(void) {
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
-void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads);  }
-void clear_numa_thread_affinity(void) {}
+static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads);  }
+static void clear_numa_thread_affinity(void) {}
 #endif
 
 struct ggml_compute_state_shared {
@@ -16199,21 +16118,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_ARGMAX:
             case GGML_OP_REPEAT:
             case GGML_OP_REPEAT_BACK:
-            case GGML_OP_ABS:
-            case GGML_OP_SGN:
-            case GGML_OP_NEG:
-            case GGML_OP_STEP:
-            case GGML_OP_TANH:
-            case GGML_OP_ELU:
-            case GGML_OP_RELU:
-                {
+            {
                     n_tasks = 1;
                 } break;
-            case GGML_OP_MUL:
-            case GGML_OP_GELU:
-            case GGML_OP_GELU_QUICK:
-            case GGML_OP_SILU:
+
+            case GGML_OP_UNARY:
+                {
+                    switch (ggml_get_unary_op(node)) {
+                        case GGML_UNARY_OP_ABS:
+                        case GGML_UNARY_OP_SGN:
+                        case GGML_UNARY_OP_NEG:
+                        case GGML_UNARY_OP_STEP:
+                        case GGML_UNARY_OP_TANH:
+                        case GGML_UNARY_OP_ELU:
+                        case GGML_UNARY_OP_RELU:
+                            {
+                                n_tasks = 1;
+                            } break;
+
+                        case GGML_UNARY_OP_GELU:
+                        case GGML_UNARY_OP_GELU_QUICK:
+                        case GGML_UNARY_OP_SILU:
+                            {
+                                n_tasks = n_threads;
+                            } break;
+                    }
+                } break;
             case GGML_OP_SILU_BACK:
+            case GGML_OP_MUL:
             case GGML_OP_NORM:
             case GGML_OP_RMS_NORM:
             case GGML_OP_RMS_NORM_BACK:
@@ -16728,7 +16660,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
                     fwrite(&nb, sizeof(uint64_t), 1, fout);
                 }
 
-                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+                fwrite(tensor->name,      sizeof(char), GGML_MAX_NAME,      fout);
+                fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
 
                 // dump the data
                 // TODO: pad this to 32 byte boundary
@@ -16761,7 +16694,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
                     fwrite(&nb, sizeof(uint64_t), 1, fout);
                 }
 
-                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+                fwrite(tensor->name,      sizeof(char), GGML_MAX_NAME,      fout);
+                fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
 
                 // output the op arguments
                 {
@@ -16942,7 +16876,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
 
                 tensor->op = (enum ggml_op) op;
 
-                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+                memcpy(tensor->name,      ptr, GGML_MAX_NAME);      ptr += GGML_MAX_NAME;
+                memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
 
                 tensor->data = (void *) ptr;
 
@@ -16987,7 +16922,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
                     nb[j] = nb_cur;
                 }
 
-                const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
+                const char * ptr_name      = ptr; ptr += GGML_MAX_NAME;
+                const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
 
                 const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
 
@@ -17024,8 +16960,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
                         {
                             tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
 
-                            uint64_t offs;
-                            memcpy(&offs, tensor->op_params, sizeof(offs));
+                            size_t offs;
+                            memcpy(&offs, ptr_op_params, sizeof(offs));
 
                             tensor->data = ((char *) tensor->data) + offs;
                         } break;
@@ -17045,7 +16981,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
                         } break;
                 }
 
-                memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
+                memcpy(tensor->name,      ptr_name,      GGML_MAX_NAME);
+                memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
 
                 for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                     tensor->nb[j] = nb[j];
@@ -17079,7 +17016,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles  / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles  / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,
@@ -17093,7 +17030,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-                GGML_OP_NAME[node->op]);
+                ggml_op_name(node->op));
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -17101,7 +17038,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }
 
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
     }
 
     GGML_PRINT("========================================\n");
@@ -17195,13 +17132,13 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         }
 
         if (node->n_dims == 2) {
-            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
         } else {
-            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
         }
 
         if (node->grad) {
-            fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
+            fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
         } else {
             fprintf(fp, "\"; ]\n");
         }
diff --git a/ggml.h b/ggml.h
index 871c85a..de44fba 100644
--- a/ggml.h
+++ b/ggml.h
@@ -330,16 +330,6 @@ extern "C" {
         GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
-        GGML_OP_ABS,
-        GGML_OP_SGN,
-        GGML_OP_NEG,
-        GGML_OP_STEP,
-        GGML_OP_TANH,
-        GGML_OP_ELU,
-        GGML_OP_RELU,
-        GGML_OP_GELU,
-        GGML_OP_GELU_QUICK,
-        GGML_OP_SILU,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
@@ -378,6 +368,8 @@ extern "C" {
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
 
+        GGML_OP_UNARY,
+
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
 
@@ -391,6 +383,18 @@ extern "C" {
         GGML_OP_COUNT,
     };
 
+    enum ggml_unary_op {
+        GGML_UNARY_OP_ABS,
+        GGML_UNARY_OP_SGN,
+        GGML_UNARY_OP_NEG,
+        GGML_UNARY_OP_STEP,
+        GGML_UNARY_OP_TANH,
+        GGML_UNARY_OP_ELU,
+        GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_GELU,
+        GGML_UNARY_OP_GELU_QUICK,
+        GGML_UNARY_OP_SILU,
+    };
 
     // ggml object
     struct ggml_object {
@@ -535,6 +539,7 @@ extern "C" {
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
 
     GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -558,6 +563,7 @@ extern "C" {
     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
 
     GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);
     GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
     GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
@@ -617,9 +623,11 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API const char *         ggml_get_name(const struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
-    GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);
 
     //
     // operations on tensors with backpropagation
@@ -629,6 +637,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_dup_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_add(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -952,11 +965,22 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // a -> b, in-place, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // make contiguous, in-place
+    GGML_API struct ggml_tensor * ggml_cont_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // return view(a), b specifies the new shape
     // TODO: when we start computing gradient, make a copy instead of view
     GGML_API struct ggml_tensor * ggml_reshape(
@@ -1268,6 +1292,16 @@ extern "C" {
     typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
     typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
 
+    GGML_API struct ggml_tensor * ggml_unary(
+            struct ggml_context * ctx,
+             struct ggml_tensor * a,
+             enum ggml_unary_op op);
+
+    GGML_API struct ggml_tensor * ggml_unary_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_unary_op op);
+
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context        * ctx,
             struct ggml_tensor         * a,
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index 01467bc..ef20bce 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -64,7 +64,7 @@ void get_random_dims(int64_t * dims, int ndims) {
     }
 }
 
-struct ggml_tensor * get_random_tensor(
+struct ggml_tensor * get_random_tensor_f32(
         struct ggml_context * ctx0,
         int ndims,
         int64_t ne[],
@@ -112,7 +112,55 @@ struct ggml_tensor * get_random_tensor(
     return result;
 }
 
-struct ggml_tensor * get_random_tensor_int(
+struct ggml_tensor * get_random_tensor_f16(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+struct ggml_tensor * get_random_tensor_i32(
         struct ggml_context * ctx0,
         int ndims,
         int64_t ne[],
@@ -160,23 +208,6 @@ struct ggml_tensor * get_random_tensor_int(
     return result;
 }
 
-float get_element(const struct ggml_tensor * t, int idx) {
-    if (t->type == GGML_TYPE_F32) {
-        return ((float *)t->data)[idx];
-    }
-
-    if (t->type == GGML_TYPE_I32) {
-        return ((int32_t *)t->data)[idx];
-    }
-
-    assert(false);
-    return INFINITY;
-}
-
-void set_element(struct ggml_tensor * t, int idx, float value) {
-    ((float *)t->data)[idx] = value;
-}
-
 void print_elements(const char* label, const struct ggml_tensor * t) {
     if (!t) {
         printf("%s: %s = null\n", __func__, label);
@@ -186,7 +217,7 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
     printf("%s: %s = [", __func__, label);
     for (int k = 0; k < nelements; ++k) {
         if (k > 0) { printf(", "); }
-        printf("%.5f", get_element(t, k));
+        printf("%.5f", ggml_get_f32_1d(t, k));
     }
     printf("] shape: [");
     for (int k = 0; k < t->n_dims; ++k) {
@@ -237,23 +268,23 @@ bool check_gradient(
         const int nelements = ggml_nelements(x[i]);
         for (int k = 0; k < nelements; ++k) {
             // compute gradient using finite differences
-            const float x0 = get_element(x[i], k);
+            const float x0 = ggml_get_f32_1d(x[i], k);
             const float xm = x0 - eps;
             const float xp = x0 + eps;
-            set_element(x[i], k, xp);
+            ggml_set_f32_1d(x[i], k, xp);
 
             ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
             const float f0 = ggml_get_f32_1d(f, 0);
 
-            set_element(x[i], k, xm);
+            ggml_set_f32_1d(x[i], k, xm);
 
             ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
             const float f1 = ggml_get_f32_1d(f, 0);
             const float g0 = (f0 - f1)/(2.0f*eps);
 
-            set_element(x[i], k, x0);
+            ggml_set_f32_1d(x[i], k, x0);
 
             // compute gradient using backward graph
             ggml_graph_reset  (&gf);
@@ -261,7 +292,7 @@ bool check_gradient(
 
             ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
-            const float g1 = get_element(x[i]->grad, k);
+            const float g1 = ggml_get_f32_1d(x[i]->grad, k);
 
             const float error_abs = fabsf(g0 - g1);
             const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
@@ -392,19 +423,35 @@ int main(int argc, const char ** argv) {
 
         struct ggml_tensor * x[MAX_NARGS];
 
-        // add
+        // add f32
         {
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
                 struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
 
-                check_gradient("add", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
+                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
+            }
+        }
+
+        // add f16
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
+
+                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
             }
         }
 
@@ -414,7 +461,7 @@ int main(int argc, const char ** argv) {
 
             for (int ndims = 1; ndims <= 4; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -430,7 +477,7 @@ int main(int argc, const char ** argv) {
 
             for (int ndims = 1; ndims <= 4; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -446,7 +493,7 @@ int main(int argc, const char ** argv) {
 
             for (int ndims = 1; ndims <= 4; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, 0.5f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -462,7 +509,7 @@ int main(int argc, const char ** argv) {
 
             for (int ndims = 1; ndims <= 2; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -478,7 +525,7 @@ int main(int argc, const char ** argv) {
 
             for (int ndims = 1; ndims <= 2; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -494,7 +541,7 @@ int main(int argc, const char ** argv) {
 
             for (int ndims = 1; ndims <= 2; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -510,7 +557,7 @@ int main(int argc, const char ** argv) {
 
             for (int ndims = 1; ndims <= 2; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -527,7 +574,7 @@ int main(int argc, const char ** argv) {
 
             for (int ndims = 1; ndims <= 4; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -537,6 +584,40 @@ int main(int argc, const char ** argv) {
             }
         }
 
+        // mean, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
+
+                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // argmax
+        if (0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
+
+                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
         // repeat
         {
             int64_t ne2[4];
@@ -549,15 +630,36 @@ int main(int argc, const char ** argv) {
 
             const int nargs = 1;
             for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
                 struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
 
                 check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
             }
+        }
+
+        // repeat back
+        {
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+
+            ne2[0] = ne[0] * ne2[0];
+            ne2[1] = ne[1] * ne2[1];
+            ne2[2] = 1;
+            ne2[3] = 1;
+
+            const int nargs = 1;
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
 
+                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+            }
         }
 
         // abs (finite differences do not work)
@@ -566,7 +668,7 @@ int main(int argc, const char ** argv) {
 
         //    for (int ndims = 1; ndims <= 2; ++ndims) {
         //        for (int i = 0; i < nargs; ++i) {
-        //            x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
         //            ggml_set_param(ctx0, x[i]);
         //        }
 
@@ -576,17 +678,82 @@ int main(int argc, const char ** argv) {
         //    }
         //}
 
+        // sgn
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
+
+                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // neg
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
+
+                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // step
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
+
+                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // tanh, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
+
+                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
         // mul_mat
         {
             const int nargs = 2;
 
             for (int ndims = 2; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                 {
                     int64_t ne2[4];
                     get_random_dims(ne2, 4);
                     ne2[0] = ne[0];
-                    x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                    x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                 }
 
                 ggml_set_param(ctx0, x[0]);
@@ -602,13 +769,63 @@ int main(int argc, const char ** argv) {
             }
         }
 
+        // elu, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
+
+                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // relu
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
+
+                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // gelu, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
+
+                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
         // silu
         {
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -629,7 +846,7 @@ int main(int argc, const char ** argv) {
 
             for (int ndims = 1; ndims <= 2; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
 
@@ -647,8 +864,8 @@ int main(int argc, const char ** argv) {
             ne2[0] = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 
                 ggml_set_param(ctx0, x[0]);
                 ggml_set_param(ctx0, x[1]);
@@ -659,20 +876,37 @@ int main(int argc, const char ** argv) {
             }
         }
 
-        // cpy
+        // cpy f32
         {
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {
                 for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                     ggml_set_param(ctx0, x[i]);
                 }
                 // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
 
                 struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
 
-                check_gradient("cpy", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // cpy f16
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
+
+                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
             }
         }
 
@@ -689,8 +923,8 @@ int main(int argc, const char ** argv) {
                 for (int i = 0; i < ndims; ++i) {
                     ne2[0] *= ne[i];
                 }
-                x[0] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
 
@@ -712,8 +946,8 @@ int main(int argc, const char ** argv) {
                 for (int i = 0; i < ndims; ++i) {
                     ne2[0] *= ne[i];
                 }
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
 
@@ -729,7 +963,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 2;
             for (int ndims = 1; ndims <= 4; ++ndims) {
 
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
                 get_random_dims(ne2, 1);
@@ -737,7 +971,7 @@ int main(int argc, const char ** argv) {
                     get_random_dims(ne2, 1);
                 }
 
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[1]);
 
                 const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@@ -758,7 +992,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 2;
             for (int ndims = 2; ndims <= 4; ++ndims) {
 
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
                 get_random_dims(ne2, 2);
@@ -766,7 +1000,7 @@ int main(int argc, const char ** argv) {
                     get_random_dims(ne2, 2);
                 }
 
-                x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[1]);
 
                 max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@@ -790,7 +1024,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 2;
             for (int ndims = 3; ndims <= 4; ++ndims) {
 
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
                 get_random_dims(ne2, 3);
@@ -798,7 +1032,7 @@ int main(int argc, const char ** argv) {
                     get_random_dims(ne2, 3);
                 }
 
-                x[1] = get_random_tensor(ctx0, 3, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[1]);
 
                 max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@@ -824,7 +1058,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 2;
             for (int ndims = 4; ndims <= 4; ++ndims) {
 
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
                 get_random_dims(ne2, 4);
@@ -832,7 +1066,7 @@ int main(int argc, const char ** argv) {
                     get_random_dims(ne2, 4);
                 }
 
-                x[1] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[1]);
 
                 max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@@ -858,7 +1092,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 2;
             for (int ndims = 1; ndims <= 4; ++ndims) {
 
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
                 get_random_dims(ne2, 1);
@@ -866,7 +1100,7 @@ int main(int argc, const char ** argv) {
                     get_random_dims(ne2, 1);
                 }
 
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[1]);
 
                 const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@@ -887,7 +1121,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 1;
             for (int ndims = 2; ndims <= 4; ++ndims) {
 
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
                 get_random_dims(ne2, 2);
@@ -895,7 +1129,7 @@ int main(int argc, const char ** argv) {
                     get_random_dims(ne2, 2);
                 }
 
-                x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[1]);
 
                 max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@@ -915,7 +1149,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 1;
             for (int ndims = 1; ndims <= 4; ++ndims) {
 
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 
                 ggml_set_param(ctx0, x[0]);
 
@@ -941,7 +1175,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 1;
             for (int ndims = 1; ndims <= 4; ++ndims) {
 
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 
                 get_random_dims(ne2, 2);
                 while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
@@ -971,7 +1205,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 1;
             for (int ndims = 1; ndims <= 4; ++ndims) {
 
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 
                 get_random_dims(ne2, 3);
                 while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
@@ -1010,7 +1244,7 @@ int main(int argc, const char ** argv) {
                 for (int i=ndims; i<4; ++i) {
                     ne2[i] = 1;
                 }
-                x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
 
                 ggml_set_param(ctx0, x[0]);
 
@@ -1043,7 +1277,7 @@ int main(int argc, const char ** argv) {
                 for (int i=ndims; i<4; ++i) {
                     ne2[i] = 1;
                 }
-                x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
 
                 ggml_set_param(ctx0, x[0]);
 
@@ -1060,8 +1294,8 @@ int main(int argc, const char ** argv) {
             int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
             const int nargs = 1;
             const int ndims = 2;
-            x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
-            x[1] = get_random_tensor_int(ctx0, 1, ne3, 0, ne2[1]);
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
 
             ggml_set_param(ctx0, x[0]);
 
@@ -1075,7 +1309,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 1;
             const int ndims = 2;
 
-            x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
             ggml_set_param(ctx0, x[0]);
 
             int n_past = irand(ne[0]);
@@ -1090,7 +1324,7 @@ int main(int argc, const char ** argv) {
             const int nargs = 1;
             const int ndims = 2;
 
-            x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
             ggml_set_param(ctx0, x[0]);
 
             int n_past = irand(ne[0]);
@@ -1108,7 +1342,7 @@ int main(int argc, const char ** argv) {
             get_random_dims(ne2, 4);
 
             for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
                 struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
@@ -1125,8 +1359,8 @@ int main(int argc, const char ** argv) {
             get_random_dims(ne2, 4);
 
             for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
                 ggml_set_param(ctx0, x[0]);
 
                 struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
@@ -1136,7 +1370,7 @@ int main(int argc, const char ** argv) {
             }
         }
 
-        // rope
+        // rope f32
         {
             const int nargs = 1;
 
@@ -1148,7 +1382,7 @@ int main(int argc, const char ** argv) {
             for (int ndims = 3; ndims <= 4; ++ndims) {
                 for (int mode = 0; mode < 4; ++mode) {
                     for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
 
                         ggml_set_param(ctx0, x[0]);
 
@@ -1163,14 +1397,89 @@ int main(int argc, const char ** argv) {
 
                         struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
 
-                        GGML_PRINT_DEBUG("rope: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
+                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
+                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
+                    }
+                }
+            }
+        }
+
+        // rope f16
+        {
+            const int nargs = 1;
+
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+            ne2[0] += ne2[0] % 2;
+            int n_rot = ne2[0];
+
+            for (int ndims = 3; ndims <= 4; ++ndims) {
+                for (int mode = 0; mode < 4; ++mode) {
+                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
+                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
+
+                        ggml_set_param(ctx0, x[0]);
+
+                        const bool skip_past = (mode & 1);
+                        if (skip_past) {
+                            // we have no past, so this would have to work on uninitialized memory.
+                            // we only test the gradients here;
+                            // skip_past should have no influence on gradient computation.
+                            // so when other modes work, we assume that this does as well.
+                            continue;
+                        }
+
+                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
+
+                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
+                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
+                    }
+                }
+            }
+        }
+
+        // flash_attn f32
+        {
+            const int nargs = 3;
+
+            int64_t ne2[4];
+
+            get_random_dims(ne2, 4);
+            int64_t D = ne2[0];
+            int64_t N = ne2[1];
+            int64_t M = ne2[2] + N;
+            int64_t B = ne2[3];
+
+            for (int masked = 0; masked <= 1; ++masked) {
+                for (int ndims = 2; ndims <= 4; ++ndims) {
+                    int64_t neq[4] = { D, N, B, ne[3] };
+                    int64_t nek[4] = { D, M, B, ne[3] };
+                    int64_t nev[4] = { M, D, B, ne[3] };
+                    if (ndims == 2) {
+                        neq[2] = 1; neq[3] = 1;
+                        nek[2] = 1; nek[3] = 1;
+                        nev[2] = 1; nev[3] = 1;
+                    } else if (ndims == 3) {
+                        neq[3] = 1;
+                        nek[3] = 1;
+                        nev[3] = 1;
                     }
+                    x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    ggml_set_param(ctx0, x[0]);
+                    ggml_set_param(ctx0, x[1]);
+                    ggml_set_param(ctx0, x[2]);
+
+                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+                    check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
                 }
             }
         }
 
-        // flash_attn
+        // flash_attn f16, not yet fully implemented
+        if(0)
         {
             const int nargs = 3;
 
@@ -1196,16 +1505,16 @@ int main(int argc, const char ** argv) {
                         nek[3] = 1;
                         nev[3] = 1;
                     }
-                    x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
                     ggml_set_param(ctx0, x[0]);
                     ggml_set_param(ctx0, x[1]);
                     ggml_set_param(ctx0, x[2]);
 
                     struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
 
-                    check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
                 }
             }
         }
diff --git a/tests/test-opt.c b/tests/test-opt.c
index 5531814..4eef62b 100644
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@@ -125,9 +125,9 @@ int main(void) {
     };
     struct ggml_context * ctx = ggml_init(params);
 
-    int64_t ne1[4] = {4, 1024, 1, 1};
-    int64_t ne2[4] = {4, 2048, 1, 1};;
-    int64_t ne3[4] = {1024, 2048, 1, 1};
+    int64_t ne1[4] = {4, 128, 1, 1};
+    int64_t ne2[4] = {4, 256, 1, 1};;
+    int64_t ne3[4] = {128, 256, 1, 1};
 
     struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
     struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
-- 
cgit v1.2.3


From 41c674161fb2459bdf7806d1eebead15bc5d046e Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Mon, 24 Jul 2023 17:57:12 +0200
Subject: make rms_norm_eps a parameter (#2374)

* make rms_norm_eps a parameter

* add rms_norm_eps to command line

* fix baby llama, test-grad0

* use scientific notation for eps param in the help

ggml-ci
---
 examples/baby-llama/baby-llama.cpp                 | 20 ++++++++------
 examples/common.cpp                                |  8 ++++++
 examples/common.h                                  | 23 ++++++++--------
 .../train-text-from-scratch.cpp                    | 32 ++++++++++++----------
 ggml-cuda.cu                                       | 13 +++++----
 ggml-metal.m                                       |  3 +-
 ggml.c                                             | 16 +++++++----
 ggml.h                                             |  7 +++--
 llama.cpp                                          | 20 ++++++++++----
 llama.h                                            |  1 +
 tests/test-grad0.c                                 |  2 +-
 11 files changed, 89 insertions(+), 56 deletions(-)

(limited to 'tests')

diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index 4965881..f9dc0aa 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -8,6 +8,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static const float rms_norm_eps = 1e-6f;
+
 float frand() {
     return (float)rand()/(float)RAND_MAX;
 }
@@ -562,7 +564,7 @@ struct ggml_tensor * forward(
         // norm
         {
             // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
             // cur = attention_norm*cur
             cur = ggml_mul(ctx0,
@@ -685,7 +687,7 @@ struct ggml_tensor * forward(
             // norm
             {
                 // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
 
                 // cur = ffn_norm*cur
                 // cur shape [n_embd,N,1,1]
@@ -729,7 +731,7 @@ struct ggml_tensor * forward(
     {
 
         // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
         // inpL = norm*inpL
         // inpL shape [n_embd,N,1,1]
@@ -817,7 +819,7 @@ struct ggml_tensor * forward_batch(
         // norm
         {
             // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = attention_norm*cur
@@ -981,7 +983,7 @@ struct ggml_tensor * forward_batch(
             // norm
             {
                 // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 assert_shape_2d(cur, n_embd, N*n_batch);
 
                 // cur = ffn_norm*cur
@@ -1034,7 +1036,7 @@ struct ggml_tensor * forward_batch(
     {
 
         // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         assert_shape_2d(inpL, n_embd, N*n_batch);
 
         // inpL = norm*inpL
@@ -1104,7 +1106,7 @@ struct ggml_tensor * forward_lora(
         // norm
         {
             // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
             // cur = attention_norm*cur
             cur = ggml_mul(ctx0,
@@ -1251,7 +1253,7 @@ struct ggml_tensor * forward_lora(
             // norm
             {
                 // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
 
                 // cur = ffn_norm*cur
                 // cur shape [n_embd,N,1,1]
@@ -1295,7 +1297,7 @@ struct ggml_tensor * forward_lora(
     {
 
         // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
         // inpL = norm*inpL
         // inpL shape [n_embd,N,1,1]
diff --git a/examples/common.cpp b/examples/common.cpp
index 779605f..0e88a12 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -177,6 +177,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_gqa = std::stoi(argv[i]);
+        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rms_norm_eps = std::stof(argv[i]);
         } else if (arg == "--rope-freq-base") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -519,6 +525,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
     fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -615,6 +622,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.n_ctx           = params.n_ctx;
     lparams.n_batch         = params.n_batch;
     lparams.n_gqa           = params.n_gqa;
+    lparams.rms_norm_eps    = params.rms_norm_eps;
     lparams.n_gpu_layers    = params.n_gpu_layers;
     lparams.main_gpu        = params.main_gpu;
     lparams.tensor_split    = params.tensor_split;
diff --git a/examples/common.h b/examples/common.h
index 7086606..894a085 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -22,18 +22,19 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    uint32_t seed                           = -1;  // RNG seed
+    uint32_t seed                           = -1;   // RNG seed
     int32_t n_threads                       = get_num_physical_cores();
-    int32_t n_predict                       = -1;  // new tokens to predict
-    int32_t n_ctx                           = 512; // context size
-    int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_gqa                           = 1;   // grouped-query attention factor (TODO: move to hparams)
-    int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
-    int32_t n_chunks                        = -1;  // max number of chunks to process (-1 = unlimited)
-    int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
-    int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
-    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_probs                         = 0;   // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t n_predict                       = -1;   // new tokens to predict
+    int32_t n_ctx                           = 512;  // context size
+    int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_gqa                           = 1;    // grouped-query attention factor (TODO: move to hparams)
+    int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
+    int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
+    int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
+    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
+    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
+    float   rms_norm_eps                    = 1e-6; // rms norm epsilon
     float   rope_freq_base                  = 10000.0f; // RoPE base frequency
     float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor
 
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 449b4e9..4bbf6b7 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -16,6 +16,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static const float rms_norm_eps = 1e-6f;
+
 struct random_normal_distribution {
     std::mt19937 gen;
     std::normal_distribution<float> rd;
@@ -439,7 +441,7 @@ struct ggml_tensor * forward(
         // norm
         {
             // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
             // cur = attention_norm*cur
             cur = ggml_mul(ctx0,
@@ -562,7 +564,7 @@ struct ggml_tensor * forward(
             // norm
             {
                 // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
 
                 // cur = ffn_norm*cur
                 // cur shape [n_embd,N,1,1]
@@ -606,7 +608,7 @@ struct ggml_tensor * forward(
     {
 
         // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
         // inpL = norm*inpL
         // inpL shape [n_embd,N,1,1]
@@ -694,7 +696,7 @@ struct ggml_tensor * forward_batch(
         // norm
         {
             // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = attention_norm*cur
@@ -857,7 +859,7 @@ struct ggml_tensor * forward_batch(
             // norm
             {
                 // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 assert_shape_2d(cur, n_embd, N*n_batch);
 
                 // cur = ffn_norm*cur
@@ -910,7 +912,7 @@ struct ggml_tensor * forward_batch(
     {
 
         // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         assert_shape_2d(inpL, n_embd, N*n_batch);
 
         // inpL = norm*inpL
@@ -979,7 +981,7 @@ struct ggml_tensor * forward_batch_wo_cache(
         // norm
         {
             // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = attention_norm*cur
@@ -1085,7 +1087,7 @@ struct ggml_tensor * forward_batch_wo_cache(
             // norm
             {
                 // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 assert_shape_2d(cur, n_embd, N*n_batch);
 
                 // cur = ffn_norm*cur
@@ -1138,7 +1140,7 @@ struct ggml_tensor * forward_batch_wo_cache(
     {
 
         // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         assert_shape_2d(inpL, n_embd, N*n_batch);
 
         // inpL = norm*inpL
@@ -1203,7 +1205,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = attention_norm*cur
@@ -1267,7 +1269,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 assert_shape_2d(cur, n_embd, N*n_batch);
 
                 // cur = ffn_norm*cur
@@ -1311,7 +1313,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
     // norm
     {
 
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         assert_shape_2d(inpL, n_embd, N*n_batch);
 
         // inpL = norm*inpL
@@ -1603,7 +1605,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
         struct my_llama_layer & layer = model->layers[il];
         // tensors with values necessary for backward pass are in persistent buf(-1)
         // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed.
-        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur));                                    assert_shape_2d(t02, n_embd, N*n_batch);
+        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur, rms_norm_eps));                      assert_shape_2d(t02, n_embd, N*n_batch);
         use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat       (ctx0, layer.attention_norm, t02));              assert_shape_2d(t03, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul          (ctx0, t02, t03));                               assert_shape_2d(t04, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat      (ctx0, layer.wq, t04));                          assert_shape_2d(t05, n_embd, N*n_batch);
@@ -1623,7 +1625,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
         use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d   (ctx0, t18, n_embd, N*n_batch));                 assert_shape_2d(t19, n_embd, N*n_batch);
         use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat      (ctx0, layer.wo, t19));                          assert_shape_2d(t20, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add          (ctx0, t20, cur));                               assert_shape_2d(t21, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21));                                    assert_shape_2d(t22, n_embd, N*n_batch);
+        use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21, rms_norm_eps));                      assert_shape_2d(t22, n_embd, N*n_batch);
         use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat       (ctx0, layer.ffn_norm, t22));                    assert_shape_2d(t23, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul          (ctx0, t23, t22));                               assert_shape_2d(t24, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat      (ctx0, layer.w3, t24));                          assert_shape_2d(t25, n_ff, N*n_batch);
@@ -1666,7 +1668,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
     }
     clr_buf(0);
     use_buf(0);
-    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm  (ctx0, cur));                       assert_shape_2d(t31, n_embd, N*n_batch);
+    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm  (ctx0, cur, rms_norm_eps));         assert_shape_2d(t31, n_embd, N*n_batch);
     struct ggml_tensor * t32   = expand(gf, ggml_repeat    (ctx0, model->norm, t31));          assert_shape_2d(t32, n_embd, N*n_batch);
     struct ggml_tensor * t33   = expand(gf, ggml_mul       (ctx0, t32, t31));                  assert_shape_2d(t33, n_embd, N*n_batch);
     use_buf(-1);
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index b8c9835..87a1660 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -332,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6f;
-
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += WARP_SIZE) {
@@ -2122,10 +2120,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -2876,8 +2874,11 @@ inline void ggml_cuda_op_rms_norm(
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
     // compute
-    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
 
     (void) src1;
     (void) dst;
diff --git a/ggml-metal.m b/ggml-metal.m
index 1fd6e85..c1db3d1 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -812,7 +812,8 @@ void ggml_metal_graph_compute(
                                 encoder = [command_buffer computeCommandEncoder];
                             }
 
-                            const float eps = 1e-6f;
+                            float eps;
+                            memcpy(&eps, dst->op_params, sizeof(float));
 
                             const int nth = 512;
 
diff --git a/ggml.c b/ggml.c
index 960b805..11226c8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5781,6 +5781,7 @@ struct ggml_tensor * ggml_norm_inplace(
 static struct ggml_tensor * ggml_rms_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        float eps,
         bool inplace) {
     bool is_node = false;
 
@@ -5790,7 +5791,7 @@ static struct ggml_tensor * ggml_rms_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    // TODO: maybe store epsilon here?
+    ggml_set_op_params(result, &eps, sizeof(eps));
 
     result->op   = GGML_OP_RMS_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5801,14 +5802,16 @@ static struct ggml_tensor * ggml_rms_norm_impl(
 
 struct ggml_tensor * ggml_rms_norm(
         struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_rms_norm_impl(ctx, a, false);
+        struct ggml_tensor  * a,
+        float  eps) {
+    return ggml_rms_norm_impl(ctx, a, eps, false);
 }
 
 struct ggml_tensor * ggml_rms_norm_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_rms_norm_impl(ctx, a, true);
+        struct ggml_tensor  * a,
+        float eps) {
+    return ggml_rms_norm_impl(ctx, a, eps, true);
 }
 
 struct ggml_tensor * ggml_rms_norm_back(
@@ -10131,7 +10134,8 @@ static void ggml_compute_forward_rms_norm_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS;
 
-    const float eps = 1e-6f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
 
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
diff --git a/ggml.h b/ggml.h
index de44fba..1870b62 100644
--- a/ggml.h
+++ b/ggml.h
@@ -866,14 +866,17 @@ extern "C" {
 
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     // a - x
     // b - dy
+    // TODO: update with configurable eps
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
diff --git a/llama.cpp b/llama.cpp
index 0288f7e..b42b410 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -186,6 +186,7 @@ struct llama_hparams {
     // LLaMAv2
     // TODO: load from model data hparams
     float f_ffn_mult = 1.0f;
+    float f_rms_norm_eps = 1e-6f;
 
     float rope_freq_base  = 10000.0f;
     float rope_freq_scale = 1.0f;
@@ -869,6 +870,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
         /*.n_gqa                       =*/ 1,
+        /*.rms_norm_eps                =*/ 1e-6f,
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
@@ -1000,6 +1002,7 @@ static void llama_model_load_internal(
         int n_ctx,
         int n_batch,
         int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -1024,6 +1027,9 @@ static void llama_model_load_internal(
 
     auto & hparams = model.hparams;
 
+    // TODO: read from file
+    hparams.f_rms_norm_eps = rms_norm_eps;
+
     {
         switch (hparams.n_layer) {
             case 26: model.type = e_model::MODEL_3B; break;
@@ -1072,6 +1078,7 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
         fprintf(stderr, "%s: n_gqa      = %u\n",   __func__, hparams.n_gqa());
+        fprintf(stderr, "%s: rnorm_eps  = %.1e\n", __func__, hparams.f_rms_norm_eps);
         fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
         fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
         fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
@@ -1330,6 +1337,7 @@ static bool llama_model_load(
         int n_ctx,
         int n_batch,
         int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -1343,7 +1351,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1396,10 +1404,12 @@ static bool llama_eval_internal(
     const int64_t n_vocab     = hparams.n_vocab;
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
+
     LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base  = hparams.rope_freq_base;
     const float freq_scale = hparams.rope_freq_scale;
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
@@ -1479,7 +1489,7 @@ static bool llama_eval_internal(
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
 
@@ -1627,7 +1637,7 @@ static bool llama_eval_internal(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");
 
@@ -1680,7 +1690,7 @@ static bool llama_eval_internal(
 
     // norm
     {
-        cur = ggml_rms_norm(ctx0, inpL);
+        cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_2");
 
@@ -3084,7 +3094,7 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
diff --git a/llama.h b/llama.h
index 81a30e1..843b0bf 100644
--- a/llama.h
+++ b/llama.h
@@ -87,6 +87,7 @@ extern "C" {
         int32_t  n_ctx;        // text context
         int32_t  n_batch;      // prompt processing batch size
         int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
         int32_t  n_gpu_layers; // number of layers to store in VRAM
         int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index ef20bce..6d31221 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -850,7 +850,7 @@ int main(int argc, const char ** argv) {
                     ggml_set_param(ctx0, x[i]);
                 }
 
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0]));
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
 
                 check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
             }
-- 
cgit v1.2.3


From 81844fbcfd93a162b7aeaea9e4f2ab1358f7f97e Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Wed, 2 Aug 2023 04:06:19 -0400
Subject: tests : Fix compilation warnings (Linux/GCC) (#2451)

* fix hellaswag print format, cast away warning in test-double-float

* c++11 cannot use designated initializers

* add static to test-grad0.c internal functions

* use memcpy in test-double-float.c

* port c tests to c++

* use initializer list for ggml_init_params
---
 Makefile                    |    6 +-
 examples/common.cpp         |    2 +-
 scripts/sync-ggml.sh        |    4 +-
 tests/CMakeLists.txt        |    6 +-
 tests/test-double-float.c   |   53 --
 tests/test-double-float.cpp |   55 ++
 tests/test-grad0.c          | 1525 -------------------------------------------
 tests/test-grad0.cpp        | 1525 +++++++++++++++++++++++++++++++++++++++++++
 tests/test-opt.c            |  211 ------
 tests/test-opt.cpp          |  212 ++++++
 10 files changed, 1801 insertions(+), 1798 deletions(-)
 delete mode 100644 tests/test-double-float.c
 create mode 100644 tests/test-double-float.cpp
 delete mode 100644 tests/test-grad0.c
 create mode 100644 tests/test-grad0.cpp
 delete mode 100644 tests/test-opt.c
 create mode 100644 tests/test-opt.cpp

(limited to 'tests')

diff --git a/Makefile b/Makefile
index 100614b..a692a39 100644
--- a/Makefile
+++ b/Makefile
@@ -411,13 +411,13 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
 tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
diff --git a/examples/common.cpp b/examples/common.cpp
index e643984..3e7c3b6 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -572,7 +572,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
     fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
     fprintf(stdout, "  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
-    fprintf(stdout, "  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %d)\n", params.hellaswag_tasks);
+    fprintf(stdout, "  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
     fprintf(stdout, "  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     fprintf(stdout, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index 02ea6ec..3d13e85 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -10,5 +10,5 @@ cp -rpv ../ggml/src/ggml-metal.m     ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
 cp -rpv ../ggml/include/ggml/ggml.h  ./ggml.h
 
-cp -rpv ../ggml/tests/test-opt.c    ./tests/test-opt.c
-cp -rpv ../ggml/tests/test-grad0.c  ./tests/test-grad0.c
+cp -rpv ../ggml/tests/test-opt.cpp    ./tests/test-opt.cpp
+cp -rpv ../ggml/tests/test-grad0.cpp  ./tests/test-grad0.cpp
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 11ec6c7..1a40edb 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,10 +6,10 @@ function(llama_add_test source)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
 
-# llama_add_test(test-double-float.c) # SLOW
+# llama_add_test(test-double-float.cpp) # SLOW
 llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-llama_add_test(test-grad0.c) # SLOW
-# llama_add_test(test-opt.c) # SLOW
+llama_add_test(test-grad0.cpp) # SLOW
+# llama_add_test(test-opt.cpp) # SLOW
diff --git a/tests/test-double-float.c b/tests/test-double-float.c
deleted file mode 100644
index 89dafc9..0000000
--- a/tests/test-double-float.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// These tests may take a long time!
-// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
-// This is done by checking all finite (non-NaN, non-infinite) floats.
-
-#undef NDEBUG
-#include <assert.h>
-#include <immintrin.h>
-#include <math.h>
-#include <stdint.h>
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-
-// ggml.c::quantize_row_q4_0_reference
-inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
-
-// ggml.c::ggml_silu_f32
-inline static float silu_orig(float x) {
-    return x/(1.0 + exp(-x));
-}
-
-#pragma GCC diagnostic pop
-
-// ggml.c::quantize_row_q4_0_reference
-inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
-
-// ggml.c::ggml_silu_f32
-inline static float silu_float(float x) {
-    return x/(1.0f + expf(-x));
-}
-
-int main(void) {
-    uint32_t x = UINT32_MAX;
-    do {
-        float f = *(float *)&x;
-        assert(!isfinite(f) || (round_orig(f) == round_float(f)));
-    } while (x--);
-
-#ifdef __F16C__
-    // GELU and SILU implementations are used with a FP16 lookup table.
-    // The original and float-only results are not equal for all inputs after converting to FP16.
-    // GELU is an approximation anyway (tanh), not tested here.
-    // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
-    for (x = 0; x <= UINT16_MAX; x++) {
-        float f = _cvtsh_ss(x);
-        const float so = silu_orig(f);
-        const float sf = silu_float(f);
-        assert(   (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
-               || (nextafterf(so, sf) == sf)
-               || (nextafterf(sf, so) == so));
-    }
-#endif
-}
diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp
new file mode 100644
index 0000000..b506f27
--- /dev/null
+++ b/tests/test-double-float.cpp
@@ -0,0 +1,55 @@
+// These tests may take a long time!
+// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
+// This is done by checking all finite (non-NaN, non-infinite) floats.
+
+#undef NDEBUG
+#include <cassert>
+#include <immintrin.h>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
+// ggml.c::quantize_row_q4_0_reference
+inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
+
+// ggml.c::ggml_silu_f32
+inline static float silu_orig(float x) {
+    return x/(1.0 + exp(-x));
+}
+
+#pragma GCC diagnostic pop
+
+// ggml.c::quantize_row_q4_0_reference
+inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
+
+// ggml.c::ggml_silu_f32
+inline static float silu_float(float x) {
+    return x/(1.0f + expf(-x));
+}
+
+int main(void) {
+    uint32_t x = UINT32_MAX;
+    do {
+        float f;
+        memcpy(&f, &x, sizeof(x));
+        assert(!std::isfinite(f) || (round_orig(f) == round_float(f)));
+    } while (x--);
+
+#ifdef __F16C__
+    // GELU and SILU implementations are used with a FP16 lookup table.
+    // The original and float-only results are not equal for all inputs after converting to FP16.
+    // GELU is an approximation anyway (tanh), not tested here.
+    // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
+    for (x = 0; x <= UINT16_MAX; x++) {
+        float f = _cvtsh_ss(x);
+        const float so = silu_orig(f);
+        const float sf = silu_float(f);
+        assert(   (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
+               || (nextafterf(so, sf) == sf)
+               || (nextafterf(sf, so) == so));
+    }
+#endif
+}
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
deleted file mode 100644
index 6d31221..0000000
--- a/tests/test-grad0.c
+++ /dev/null
@@ -1,1525 +0,0 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
-#include "ggml.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-#define MAX_NARGS 3
-
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#define GGML_SILU_FP16
-
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-float frand(void) {
-    return (float)rand()/(float)RAND_MAX;
-}
-
-int irand(int n) {
-    if (n == 0) return 0;
-    return rand()%n;
-}
-
-void get_random_dims(int64_t * dims, int ndims) {
-    dims[0] = dims[1] = dims[2] = dims[3] = 1;
-
-    for (int i = 0; i < ndims; i++) {
-        dims[i] = 1 + irand(4);
-    }
-}
-
-struct ggml_tensor * get_random_tensor_f32(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    };
-
-    return result;
-}
-
-struct ggml_tensor * get_random_tensor_f16(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    };
-
-    return result;
-}
-
-struct ggml_tensor * get_random_tensor_i32(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        int32_t imin,
-        int32_t imax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    };
-
-    return result;
-}
-
-void print_elements(const char* label, const struct ggml_tensor * t) {
-    if (!t) {
-        printf("%s: %s = null\n", __func__, label);
-        return;
-    }
-    const int nelements = ggml_nelements(t);
-    printf("%s: %s = [", __func__, label);
-    for (int k = 0; k < nelements; ++k) {
-        if (k > 0) { printf(", "); }
-        printf("%.5f", ggml_get_f32_1d(t, k));
-    }
-    printf("] shape: [");
-    for (int k = 0; k < t->n_dims; ++k) {
-        if (k > 0) { printf(", "); }
-        printf("%d", (int)t->ne[k]);
-    }
-    printf("]\n");
-
-}
-
-bool check_gradient(
-        const char * op_name,
-        struct ggml_context * ctx0,
-        struct ggml_tensor * x[],
-        struct ggml_tensor * f,
-        int ndims,
-        int nargs,
-        float eps,
-        float max_error_abs,
-        float max_error_rel) {
-
-    static int n_threads = -1;
-    if (n_threads < 0) {
-        n_threads = GGML_DEFAULT_N_THREADS;
-
-        const char *env = getenv("GGML_N_THREADS");
-        if (env) {
-            n_threads = atoi(env);
-        }
-
-        printf("GGML_N_THREADS = %d\n", n_threads);
-    }
-
-    struct ggml_cgraph gf = ggml_build_forward (f);
-    struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
-
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    ggml_graph_reset  (&gf);
-    ggml_set_f32      (f->grad, 1.0f);
-
-    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
-
-    // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
-    // ggml_graph_dump_dot(&gb, &gf,  "test-grad0-backward.dot");
-
-    for (int i = 0; i < nargs; ++i) {
-        const int nelements = ggml_nelements(x[i]);
-        for (int k = 0; k < nelements; ++k) {
-            // compute gradient using finite differences
-            const float x0 = ggml_get_f32_1d(x[i], k);
-            const float xm = x0 - eps;
-            const float xp = x0 + eps;
-            ggml_set_f32_1d(x[i], k, xp);
-
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-            const float f0 = ggml_get_f32_1d(f, 0);
-
-            ggml_set_f32_1d(x[i], k, xm);
-
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-            const float f1 = ggml_get_f32_1d(f, 0);
-            const float g0 = (f0 - f1)/(2.0f*eps);
-
-            ggml_set_f32_1d(x[i], k, x0);
-
-            // compute gradient using backward graph
-            ggml_graph_reset  (&gf);
-            ggml_set_f32      (f->grad, 1.0f);
-
-            ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
-
-            const float g1 = ggml_get_f32_1d(x[i]->grad, k);
-
-            const float error_abs = fabsf(g0 - g1);
-            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
-
-            if (error_abs > max_error_abs || error_rel > max_error_rel) {
-                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
-                            op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
-                //assert(false);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-// TODO: clean-up this ..
-bool check_mat_mul(
-        const struct ggml_tensor * y,
-        const struct ggml_tensor * x0,
-        const struct ggml_tensor * x1) {
-    float * dst  = (float *) y->data;
-    float * src0 = (float *) x0->data;
-    float * src1 = (float *) x1->data;
-
-    const int nc = x0->ne[1];
-    const int nr = x1->ne[1];
-    const int nk = x0->ne[0];
-
-    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);
-
-    GGML_PRINT_DEBUG("x0:\n");
-    for (int j = 0; j < x0->ne[1]; ++j) {
-        for (int i = 0; i < x0->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-    GGML_PRINT_DEBUG("\n");
-
-    GGML_PRINT_DEBUG("x1:\n");
-    for (int j = 0; j < x1->ne[1]; ++j) {
-        for (int i = 0; i < x1->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-    GGML_PRINT_DEBUG("\n");
-
-    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
-    for (int j = 0; j < y->ne[1]; ++j) {
-        for (int i = 0; i < y->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-
-    for (int i = 0; i < nr; ++i) {
-        for (int j = 0; j < nc; ++j) {
-            float sum = 0.0f;
-
-            for (int k = 0; k < nk; ++k) {
-                sum += src0[j*nk + k]*src1[i*nk + k];
-            }
-
-            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
-                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
-                assert(false);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-#define NUM_PERMUTATIONS (4*3*2*1)
-
-int main(int argc, const char ** argv) {
-    struct ggml_init_params params = {
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
-    };
-
-    int64_t ne[4];
-
-    int all_permutations[4 * NUM_PERMUTATIONS];
-    {
-        int count = 0;
-        for (int ax0=0; ax0<4; ++ax0) {
-            for (int ax1=0; ax1<4; ++ax1) {
-                if (ax1 == ax0) continue;
-                for (int ax2=0; ax2<4; ++ax2) {
-                    if (ax2 == ax0) continue;
-                    if (ax2 == ax1) continue;
-                    for (int ax3=0; ax3<4; ++ax3) {
-                        if (ax3 == ax0) continue;
-                        if (ax3 == ax1) continue;
-                        if (ax3 == ax2) continue;
-                        assert(count < NUM_PERMUTATIONS);
-                        all_permutations[count*4+0] = ax0;
-                        all_permutations[count*4+1] = ax1;
-                        all_permutations[count*4+2] = ax2;
-                        all_permutations[count*4+3] = ax3;
-                        ++count;
-                    }
-                }
-            }
-        }
-    }
-
-
-    // original loop: 1000
-    int niter = 4;
-    const char *env = getenv("GGML_NLOOP");
-    if (env != NULL) {
-        niter = atoi(env);
-    }
-    if (argc > 1) {
-        niter = atoi(argv[1]);
-    }
-    for (int iter = 0; iter < niter; ++iter) {
-        printf("test-grad0: iter:%d/%d\n", iter, niter);
-        struct ggml_context * ctx0 = ggml_init(params);
-
-        get_random_dims(ne, 4);
-
-        struct ggml_tensor * x[MAX_NARGS];
-
-        // add f32
-        {
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-
-                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
-            }
-        }
-
-        // add f16
-        {
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-
-                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
-            }
-        }
-
-        // sub
-        {
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
-
-                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // mul
-        {
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
-
-                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // div
-        {
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
-
-                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
-            }
-        }
-
-        // sqr
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
-
-                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // sqrt
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
-
-                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
-            }
-        }
-
-        // log
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
-
-                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
-            }
-        }
-
-        // sum
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
-
-                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-
-        // sum_rows
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
-
-                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
-            }
-        }
-
-        // mean, not yet fully implemented
-        if(0)
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
-
-                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // argmax
-        if (0)
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
-
-                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // repeat
-        {
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            ne2[0] = ne[0] * ne2[0];
-            ne2[1] = ne[1] * ne2[1];
-            ne2[2] = 1;
-            ne2[3] = 1;
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
-
-                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
-            }
-        }
-
-        // repeat back
-        {
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            ne2[0] = ne[0] * ne2[0];
-            ne2[1] = ne[1] * ne2[1];
-            ne2[2] = 1;
-            ne2[3] = 1;
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
-
-                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
-            }
-        }
-
-        // abs (finite differences do not work)
-        //{
-        //    const int nargs = 1;
-
-        //    for (int ndims = 1; ndims <= 2; ++ndims) {
-        //        for (int i = 0; i < nargs; ++i) {
-        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-        //            ggml_set_param(ctx0, x[i]);
-        //        }
-
-        //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
-
-        //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
-        //    }
-        //}
-
-        // sgn
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
-
-                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // neg
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
-
-                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // step
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
-
-                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // tanh, not yet fully implemented
-        if(0)
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
-
-                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // mul_mat
-        {
-            const int nargs = 2;
-
-            for (int ndims = 2; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                {
-                    int64_t ne2[4];
-                    get_random_dims(ne2, 4);
-                    ne2[0] = ne[0];
-                    x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                }
-
-                ggml_set_param(ctx0, x[0]);
-                ggml_set_param(ctx0, x[1]);
-
-                struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
-                struct ggml_tensor * f = ggml_sum(ctx0, m);
-
-                GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
-
-                check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-                check_mat_mul(m, x[1], x[0]);
-            }
-        }
-
-        // elu, not yet fully implemented
-        if(0)
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
-
-                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // relu
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
-
-                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // gelu, not yet fully implemented
-        if(0)
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
-
-                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // silu
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
-
-#ifdef GGML_SILU_FP16
-                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
-#else
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-#endif
-            }
-        }
-
-        // rms_norm
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
-
-                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
-            }
-        }
-
-        // scale
-        {
-            const int nargs = 2;
-
-            int64_t ne2[4];
-            ne2[0] = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-                ggml_set_param(ctx0, x[1]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1]));
-
-                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // cpy f32
-        {
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-
-                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // cpy f16
-        {
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-
-                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
-            }
-        }
-
-        // reshape (1d->nd)
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                int64_t ne2[4];
-                ne2[0] = 1;
-                ne2[1] = 1;
-                ne2[2] = 1;
-                ne2[3] = 1;
-                for (int i = 0; i < ndims; ++i) {
-                    ne2[0] *= ne[i];
-                }
-                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // reshape (nd->1d)
-        {
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                int64_t ne2[4];
-                ne2[0] = 1;
-                ne2[1] = 1;
-                ne2[2] = 1;
-                ne2[3] = 1;
-                for (int i = 0; i < ndims; ++i) {
-                    ne2[0] *= ne[i];
-                }
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 1d
-        {
-            int64_t ne2[4] = { 1, 1, 1, 1 };
-
-            const int nargs = 2;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 1);
-                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 1);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
-                const int offset = irand(max_offset) * ggml_element_size(x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 2d
-        {
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 2);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 2);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                const int offset = offsets[0] + offsets[1];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 3d
-        {
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 3);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 3);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
-                const int offset = offsets[0] + offsets[1] + offsets[2];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 4d
-        {
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 4; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 4);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 4);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
-                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
-                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
-                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // set_1d
-        {
-            int64_t ne2[4];
-
-            const int nargs = 2;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 1);
-                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 1);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
-                const int offset = irand(max_offset) * ggml_element_size(x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
-
-                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // set_2d
-        {
-            int64_t ne2[4];
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 1;
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 2);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 2);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                const int offset = offsets[0] + offsets[1];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
-
-                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // view_1d
-        {
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int k0 = irand(ggml_nelements(x[0]));
-                const int k1 = irand(ggml_nelements(x[0]));
-                const int i0 = MIN(k0, k1);
-                const int i1 = MAX(k0, k1);
-
-                const int offset = i0 * sizeof(float);
-                const int nelem  = i1 - i0;
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
-
-                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // view_2d
-        {
-            int64_t ne2[4];
-            int64_t nb2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                get_random_dims(ne2, 2);
-                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
-                    get_random_dims(ne2, 2);
-                }
-                const int count = ne2[0]*ne2[1];
-
-                nb2[0] = sizeof(float);
-                nb2[1] = nb2[0]*ne2[0];
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int max_offset = ggml_nelements(x[0]) - count;
-                const int offset = irand(max_offset+1) * sizeof(float);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
-
-                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // view_3d
-        {
-            int64_t ne2[4] = {1,1,1,1};
-            int64_t nb2[4] = {0,0,0,0};
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                get_random_dims(ne2, 3);
-                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
-                    get_random_dims(ne2, 3);
-                }
-                const int count = ne2[0]*ne2[1]*ne2[2];
-
-                nb2[0] = sizeof(float);
-                nb2[1] = nb2[0]*ne2[0];
-                nb2[2] = nb2[1]*ne2[1];
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int max_offset = ggml_nelements(x[0]) - count;
-                const int offset = irand(max_offset+1) * sizeof(float);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
-
-                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // permute
-        {
-            int64_t ne2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims)
-            {
-                // ggml_permute will set axes of dimensions below n_dims to 1.
-                // to make ggml_permute work correctly on all axes,
-                // the input tensor needs maximal n_dim of 4.
-                for (int i=0; i<ndims; ++i) {
-                    ne2[i] = ne[i];
-                }
-                for (int i=ndims; i<4; ++i) {
-                    ne2[i] = 1;
-                }
-                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int p = irand(NUM_PERMUTATIONS);
-                const int ax0 = all_permutations[p*4+0];
-                const int ax1 = all_permutations[p*4+1];
-                const int ax2 = all_permutations[p*4+2];
-                const int ax3 = all_permutations[p*4+3];
-
-                // sum requires contiguous tensor rows
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
-
-                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // transpose
-        {
-            int64_t ne2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims)
-            {
-                // ggml_transpose will set axes of dimensions below n_dims to 1.
-                // to make ggml_transpose work correctly on all axes,
-                // the input tensor needs maximal n_dim of 4.
-                for (int i=0; i<ndims; ++i) {
-                    ne2[i] = ne[i];
-                }
-                for (int i=ndims; i<4; ++i) {
-                    ne2[i] = 1;
-                }
-                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                // sum requires contiguous tensor rows
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
-
-                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // get_rows
-        {
-            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
-            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
-            const int nargs = 1;
-            const int ndims = 2;
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
-
-            ggml_set_param(ctx0, x[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
-
-            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-        }
-
-        // diag_mask_inf
-        {
-            const int nargs = 1;
-            const int ndims = 2;
-
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-            ggml_set_param(ctx0, x[0]);
-
-            int n_past = irand(ne[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
-
-            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-        }
-
-        // diag_mask_zero
-        {
-            const int nargs = 1;
-            const int ndims = 2;
-
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-            ggml_set_param(ctx0, x[0]);
-
-            int n_past = irand(ne[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
-
-            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-        }
-
-        // softmax
-        {
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
-
-                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // cross_entropy_loss
-        {
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
-
-                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
-                // finite differences regularly fails!
-            }
-        }
-
-        // rope f32
-        {
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-            ne2[0] += ne2[0] % 2;
-            int n_rot = ne2[0];
-
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-                for (int mode = 0; mode < 4; ++mode) {
-                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-
-                        ggml_set_param(ctx0, x[0]);
-
-                        const bool skip_past = (mode & 1);
-                        if (skip_past) {
-                            // we have no past, so this would have to work on uninitialized memory.
-                            // we only test the gradients here;
-                            // skip_past should have no influence on gradient computation.
-                            // so when other modes work, we assume that this does as well.
-                            continue;
-                        }
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
-
-                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
-                    }
-                }
-            }
-        }
-
-        // rope f16
-        {
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-            ne2[0] += ne2[0] % 2;
-            int n_rot = ne2[0];
-
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-                for (int mode = 0; mode < 4; ++mode) {
-                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
-
-                        ggml_set_param(ctx0, x[0]);
-
-                        const bool skip_past = (mode & 1);
-                        if (skip_past) {
-                            // we have no past, so this would have to work on uninitialized memory.
-                            // we only test the gradients here;
-                            // skip_past should have no influence on gradient computation.
-                            // so when other modes work, we assume that this does as well.
-                            continue;
-                        }
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
-
-                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
-                    }
-                }
-            }
-        }
-
-        // flash_attn f32
-        {
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int64_t neq[4] = { D, N, B, ne[3] };
-                    int64_t nek[4] = { D, M, B, ne[3] };
-                    int64_t nev[4] = { M, D, B, ne[3] };
-                    if (ndims == 2) {
-                        neq[2] = 1; neq[3] = 1;
-                        nek[2] = 1; nek[3] = 1;
-                        nev[2] = 1; nev[3] = 1;
-                    } else if (ndims == 3) {
-                        neq[3] = 1;
-                        nek[3] = 1;
-                        nev[3] = 1;
-                    }
-                    x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                    ggml_set_param(ctx0, x[0]);
-                    ggml_set_param(ctx0, x[1]);
-                    ggml_set_param(ctx0, x[2]);
-
-                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                    check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
-                }
-            }
-        }
-
-        // flash_attn f16, not yet fully implemented
-        if(0)
-        {
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int64_t neq[4] = { D, N, B, ne[3] };
-                    int64_t nek[4] = { D, M, B, ne[3] };
-                    int64_t nev[4] = { M, D, B, ne[3] };
-                    if (ndims == 2) {
-                        neq[2] = 1; neq[3] = 1;
-                        nek[2] = 1; nek[3] = 1;
-                        nev[2] = 1; nev[3] = 1;
-                    } else if (ndims == 3) {
-                        neq[3] = 1;
-                        nek[3] = 1;
-                        nev[3] = 1;
-                    }
-                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                    ggml_set_param(ctx0, x[0]);
-                    ggml_set_param(ctx0, x[1]);
-                    ggml_set_param(ctx0, x[2]);
-
-                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
-                }
-            }
-        }
-        ggml_free(ctx0);
-    }
-
-    return 0;
-}
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
new file mode 100644
index 0000000..75a698d
--- /dev/null
+++ b/tests/test-grad0.cpp
@@ -0,0 +1,1525 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
+#include "ggml.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#define MAX_NARGS 3
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define GGML_SILU_FP16
+
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+static float frand(void) {
+    return (float)rand()/(float)RAND_MAX;
+}
+
+static int irand(int n) {
+    if (n == 0) return 0;
+    return rand()%n;
+}
+
+static void get_random_dims(int64_t * dims, int ndims) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = 1 + irand(4);
+    }
+}
+
+static struct ggml_tensor * get_random_tensor_f32(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+static struct ggml_tensor * get_random_tensor_f16(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+static struct ggml_tensor * get_random_tensor_i32(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        int32_t imin,
+        int32_t imax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+static void print_elements(const char* label, const struct ggml_tensor * t) {
+    if (!t) {
+        printf("%s: %s = null\n", __func__, label);
+        return;
+    }
+    const int nelements = ggml_nelements(t);
+    printf("%s: %s = [", __func__, label);
+    for (int k = 0; k < nelements; ++k) {
+        if (k > 0) { printf(", "); }
+        printf("%.5f", ggml_get_f32_1d(t, k));
+    }
+    printf("] shape: [");
+    for (int k = 0; k < t->n_dims; ++k) {
+        if (k > 0) { printf(", "); }
+        printf("%d", (int)t->ne[k]);
+    }
+    printf("]\n");
+
+}
+
+static bool check_gradient(
+        const char * op_name,
+        struct ggml_context * ctx0,
+        struct ggml_tensor * x[],
+        struct ggml_tensor * f,
+        int ndims,
+        int nargs,
+        float eps,
+        float max_error_abs,
+        float max_error_rel) {
+
+    static int n_threads = -1;
+    if (n_threads < 0) {
+        n_threads = GGML_DEFAULT_N_THREADS;
+
+        const char *env = getenv("GGML_N_THREADS");
+        if (env) {
+            n_threads = atoi(env);
+        }
+
+        printf("GGML_N_THREADS = %d\n", n_threads);
+    }
+
+    struct ggml_cgraph gf = ggml_build_forward (f);
+    struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+
+    ggml_graph_reset  (&gf);
+    ggml_set_f32      (f->grad, 1.0f);
+
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+
+    // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
+    // ggml_graph_dump_dot(&gb, &gf,  "test-grad0-backward.dot");
+
+    for (int i = 0; i < nargs; ++i) {
+        const int nelements = ggml_nelements(x[i]);
+        for (int k = 0; k < nelements; ++k) {
+            // compute gradient using finite differences
+            const float x0 = ggml_get_f32_1d(x[i], k);
+            const float xm = x0 - eps;
+            const float xp = x0 + eps;
+            ggml_set_f32_1d(x[i], k, xp);
+
+            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+
+            const float f0 = ggml_get_f32_1d(f, 0);
+
+            ggml_set_f32_1d(x[i], k, xm);
+
+            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+
+            const float f1 = ggml_get_f32_1d(f, 0);
+            const float g0 = (f0 - f1)/(2.0f*eps);
+
+            ggml_set_f32_1d(x[i], k, x0);
+
+            // compute gradient using backward graph
+            ggml_graph_reset  (&gf);
+            ggml_set_f32      (f->grad, 1.0f);
+
+            ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+
+            const float g1 = ggml_get_f32_1d(x[i]->grad, k);
+
+            const float error_abs = fabsf(g0 - g1);
+            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
+
+            if (error_abs > max_error_abs || error_rel > max_error_rel) {
+                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
+                            op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
+                //assert(false);
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+// TODO: clean-up this ..
+static bool check_mat_mul(
+        const struct ggml_tensor * y,
+        const struct ggml_tensor * x0,
+        const struct ggml_tensor * x1) {
+    float * dst  = (float *) y->data;
+    float * src0 = (float *) x0->data;
+    float * src1 = (float *) x1->data;
+
+    const int nc = x0->ne[1];
+    const int nr = x1->ne[1];
+    const int nk = x0->ne[0];
+
+    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);
+
+    GGML_PRINT_DEBUG("x0:\n");
+    for (int j = 0; j < x0->ne[1]; ++j) {
+        for (int i = 0; i < x0->ne[0]; ++i) {
+            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
+        }
+        GGML_PRINT_DEBUG("\n");
+    }
+    GGML_PRINT_DEBUG("\n");
+
+    GGML_PRINT_DEBUG("x1:\n");
+    for (int j = 0; j < x1->ne[1]; ++j) {
+        for (int i = 0; i < x1->ne[0]; ++i) {
+            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
+        }
+        GGML_PRINT_DEBUG("\n");
+    }
+    GGML_PRINT_DEBUG("\n");
+
+    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
+    for (int j = 0; j < y->ne[1]; ++j) {
+        for (int i = 0; i < y->ne[0]; ++i) {
+            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
+        }
+        GGML_PRINT_DEBUG("\n");
+    }
+
+    for (int i = 0; i < nr; ++i) {
+        for (int j = 0; j < nc; ++j) {
+            float sum = 0.0f;
+
+            for (int k = 0; k < nk; ++k) {
+                sum += src0[j*nk + k]*src1[i*nk + k];
+            }
+
+            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
+                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
+                assert(false);
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+#define NUM_PERMUTATIONS (4*3*2*1)
+
+int main(int argc, const char ** argv) {
+    struct ggml_init_params params = {
+        /* .mem_size   = */ 128*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+
+    int64_t ne[4];
+
+    int all_permutations[4 * NUM_PERMUTATIONS];
+    {
+        int count = 0;
+        for (int ax0=0; ax0<4; ++ax0) {
+            for (int ax1=0; ax1<4; ++ax1) {
+                if (ax1 == ax0) continue;
+                for (int ax2=0; ax2<4; ++ax2) {
+                    if (ax2 == ax0) continue;
+                    if (ax2 == ax1) continue;
+                    for (int ax3=0; ax3<4; ++ax3) {
+                        if (ax3 == ax0) continue;
+                        if (ax3 == ax1) continue;
+                        if (ax3 == ax2) continue;
+                        assert(count < NUM_PERMUTATIONS);
+                        all_permutations[count*4+0] = ax0;
+                        all_permutations[count*4+1] = ax1;
+                        all_permutations[count*4+2] = ax2;
+                        all_permutations[count*4+3] = ax3;
+                        ++count;
+                    }
+                }
+            }
+        }
+    }
+
+
+    // original loop: 1000
+    int niter = 4;
+    const char *env = getenv("GGML_NLOOP");
+    if (env != NULL) {
+        niter = atoi(env);
+    }
+    if (argc > 1) {
+        niter = atoi(argv[1]);
+    }
+    for (int iter = 0; iter < niter; ++iter) {
+        printf("test-grad0: iter:%d/%d\n", iter, niter);
+        struct ggml_context * ctx0 = ggml_init(params);
+
+        get_random_dims(ne, 4);
+
+        struct ggml_tensor * x[MAX_NARGS];
+
+        // add f32
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
+
+                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
+            }
+        }
+
+        // add f16
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
+
+                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
+            }
+        }
+
+        // sub
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
+
+                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // mul
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
+
+                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // div
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
+
+                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
+            }
+        }
+
+        // sqr
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
+
+                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // sqrt
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
+
+                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
+            }
+        }
+
+        // log
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
+
+                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
+            }
+        }
+
+        // sum
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
+
+                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+
+        // sum_rows
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
+
+                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+            }
+        }
+
+        // mean, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
+
+                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // argmax
+        if (0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
+
+                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // repeat
+        {
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+
+            ne2[0] = ne[0] * ne2[0];
+            ne2[1] = ne[1] * ne2[1];
+            ne2[2] = 1;
+            ne2[3] = 1;
+
+            const int nargs = 1;
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
+
+                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+            }
+        }
+
+        // repeat back
+        {
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+
+            ne2[0] = ne[0] * ne2[0];
+            ne2[1] = ne[1] * ne2[1];
+            ne2[2] = 1;
+            ne2[3] = 1;
+
+            const int nargs = 1;
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
+
+                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
+            }
+        }
+
+        // abs (finite differences do not work)
+        //{
+        //    const int nargs = 1;
+
+        //    for (int ndims = 1; ndims <= 2; ++ndims) {
+        //        for (int i = 0; i < nargs; ++i) {
+        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+        //            ggml_set_param(ctx0, x[i]);
+        //        }
+
+        //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
+
+        //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
+        //    }
+        //}
+
+        // sgn
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
+
+                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // neg
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
+
+                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // step
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
+
+                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // tanh, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
+
+                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // mul_mat
+        {
+            const int nargs = 2;
+
+            for (int ndims = 2; ndims <= 2; ++ndims) {
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                {
+                    int64_t ne2[4];
+                    get_random_dims(ne2, 4);
+                    ne2[0] = ne[0];
+                    x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                }
+
+                ggml_set_param(ctx0, x[0]);
+                ggml_set_param(ctx0, x[1]);
+
+                struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
+                struct ggml_tensor * f = ggml_sum(ctx0, m);
+
+                GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
+
+                check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_mat_mul(m, x[1], x[0]);
+            }
+        }
+
+        // elu, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
+
+                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // relu
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
+
+                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // gelu, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
+
+                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
+            }
+        }
+
+        // silu
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
+
+#ifdef GGML_SILU_FP16
+                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
+                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
+#else
+                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+#endif
+            }
+        }
+
+        // rms_norm
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
+
+                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
+            }
+        }
+
+        // scale
+        {
+            const int nargs = 2;
+
+            int64_t ne2[4];
+            ne2[0] = 1;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+
+                ggml_set_param(ctx0, x[0]);
+                ggml_set_param(ctx0, x[1]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1]));
+
+                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // cpy f32
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
+
+                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // cpy f16
+        {
+            const int nargs = 2;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                for (int i = 0; i < nargs; ++i) {
+                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
+                    ggml_set_param(ctx0, x[i]);
+                }
+                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
+
+                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
+            }
+        }
+
+        // reshape (1d->nd)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                int64_t ne2[4];
+                ne2[0] = 1;
+                ne2[1] = 1;
+                ne2[2] = 1;
+                ne2[3] = 1;
+                for (int i = 0; i < ndims; ++i) {
+                    ne2[0] *= ne[i];
+                }
+                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
+                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // reshape (nd->1d)
+        {
+            const int nargs = 1;
+
+            for (int ndims = 1; ndims <= 2; ++ndims) {
+                int64_t ne2[4];
+                ne2[0] = 1;
+                ne2[1] = 1;
+                ne2[2] = 1;
+                ne2[3] = 1;
+                for (int i = 0; i < ndims; ++i) {
+                    ne2[0] *= ne[i];
+                }
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
+                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // acc 1d
+        {
+            int64_t ne2[4] = { 1, 1, 1, 1 };
+
+            const int nargs = 2;
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                get_random_dims(ne2, 1);
+                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
+                    get_random_dims(ne2, 1);
+                }
+
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[1]);
+
+                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
+                const int offset = irand(max_offset) * ggml_element_size(x[0]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
+
+                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // acc 2d
+        {
+            int64_t ne2[4]         = { 1, 1, 1, 1 };
+            int64_t max_offsets[4] = { 0, 0, 0, 0 };
+            int64_t offsets[4]     = { 0, 0, 0, 0 };
+
+            const int nargs = 2;
+            for (int ndims = 2; ndims <= 4; ++ndims) {
+
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                get_random_dims(ne2, 2);
+                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
+                    get_random_dims(ne2, 2);
+                }
+
+                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[1]);
+
+                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
+                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
+                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
+                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
+                const int offset = offsets[0] + offsets[1];
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
+
+                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // acc 3d
+        {
+            int64_t ne2[4]         = { 1, 1, 1, 1 };
+            int64_t max_offsets[4] = { 0, 0, 0, 0 };
+            int64_t offsets[4]     = { 0, 0, 0, 0 };
+
+            const int nargs = 2;
+            for (int ndims = 3; ndims <= 4; ++ndims) {
+
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                get_random_dims(ne2, 3);
+                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
+                    get_random_dims(ne2, 3);
+                }
+
+                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[1]);
+
+                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
+                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
+                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
+                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
+                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
+                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
+                const int offset = offsets[0] + offsets[1] + offsets[2];
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
+
+                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // acc 4d
+        {
+            int64_t ne2[4]         = { 1, 1, 1, 1 };
+            int64_t max_offsets[4] = { 0, 0, 0, 0 };
+            int64_t offsets[4]     = { 0, 0, 0, 0 };
+
+            const int nargs = 2;
+            for (int ndims = 4; ndims <= 4; ++ndims) {
+
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                get_random_dims(ne2, 4);
+                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
+                    get_random_dims(ne2, 4);
+                }
+
+                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[1]);
+
+                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
+                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
+                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
+                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
+                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
+                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
+                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
+                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
+                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
+
+                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // set_1d
+        {
+            int64_t ne2[4];
+
+            const int nargs = 2;
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                get_random_dims(ne2, 1);
+                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
+                    get_random_dims(ne2, 1);
+                }
+
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[1]);
+
+                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
+                const int offset = irand(max_offset) * ggml_element_size(x[0]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
+
+                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // set_2d
+        {
+            int64_t ne2[4];
+            int64_t max_offsets[4] = { 0, 0, 0, 0 };
+            int64_t offsets[4]     = { 0, 0, 0, 0 };
+
+            const int nargs = 1;
+            for (int ndims = 2; ndims <= 4; ++ndims) {
+
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                get_random_dims(ne2, 2);
+                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
+                    get_random_dims(ne2, 2);
+                }
+
+                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[1]);
+
+                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
+                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
+                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
+                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
+                const int offset = offsets[0] + offsets[1];
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
+
+                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // view_1d
+        {
+            const int nargs = 1;
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+
+                ggml_set_param(ctx0, x[0]);
+
+                const int k0 = irand(ggml_nelements(x[0]));
+                const int k1 = irand(ggml_nelements(x[0]));
+                const int i0 = MIN(k0, k1);
+                const int i1 = MAX(k0, k1);
+
+                const int offset = i0 * sizeof(float);
+                const int nelem  = i1 - i0;
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
+
+                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // view_2d
+        {
+            int64_t ne2[4];
+            int64_t nb2[4];
+
+            const int nargs = 1;
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+
+                get_random_dims(ne2, 2);
+                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
+                    get_random_dims(ne2, 2);
+                }
+                const int count = ne2[0]*ne2[1];
+
+                nb2[0] = sizeof(float);
+                nb2[1] = nb2[0]*ne2[0];
+
+                ggml_set_param(ctx0, x[0]);
+
+                const int max_offset = ggml_nelements(x[0]) - count;
+                const int offset = irand(max_offset+1) * sizeof(float);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
+
+                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // view_3d
+        {
+            int64_t ne2[4] = {1,1,1,1};
+            int64_t nb2[4] = {0,0,0,0};
+
+            const int nargs = 1;
+            for (int ndims = 1; ndims <= 4; ++ndims) {
+
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+
+                get_random_dims(ne2, 3);
+                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
+                    get_random_dims(ne2, 3);
+                }
+                const int count = ne2[0]*ne2[1]*ne2[2];
+
+                nb2[0] = sizeof(float);
+                nb2[1] = nb2[0]*ne2[0];
+                nb2[2] = nb2[1]*ne2[1];
+
+                ggml_set_param(ctx0, x[0]);
+
+                const int max_offset = ggml_nelements(x[0]) - count;
+                const int offset = irand(max_offset+1) * sizeof(float);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
+
+                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // permute
+        {
+            int64_t ne2[4];
+
+            const int nargs = 1;
+            for (int ndims = 1; ndims <= 4; ++ndims)
+            {
+                // ggml_permute will set axes of dimensions below n_dims to 1.
+                // to make ggml_permute work correctly on all axes,
+                // the input tensor needs maximal n_dim of 4.
+                for (int i=0; i<ndims; ++i) {
+                    ne2[i] = ne[i];
+                }
+                for (int i=ndims; i<4; ++i) {
+                    ne2[i] = 1;
+                }
+                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
+
+                ggml_set_param(ctx0, x[0]);
+
+                const int p = irand(NUM_PERMUTATIONS);
+                const int ax0 = all_permutations[p*4+0];
+                const int ax1 = all_permutations[p*4+1];
+                const int ax2 = all_permutations[p*4+2];
+                const int ax3 = all_permutations[p*4+3];
+
+                // sum requires contiguous tensor rows
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
+
+                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // transpose
+        {
+            int64_t ne2[4];
+
+            const int nargs = 1;
+            for (int ndims = 1; ndims <= 4; ++ndims)
+            {
+                // ggml_transpose will set axes of dimensions below n_dims to 1.
+                // to make ggml_transpose work correctly on all axes,
+                // the input tensor needs maximal n_dim of 4.
+                for (int i=0; i<ndims; ++i) {
+                    ne2[i] = ne[i];
+                }
+                for (int i=ndims; i<4; ++i) {
+                    ne2[i] = 1;
+                }
+                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
+
+                ggml_set_param(ctx0, x[0]);
+
+                // sum requires contiguous tensor rows
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
+
+                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // get_rows
+        {
+            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
+            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
+            const int nargs = 1;
+            const int ndims = 2;
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
+
+            ggml_set_param(ctx0, x[0]);
+
+            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
+
+            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+        }
+
+        // diag_mask_inf
+        {
+            const int nargs = 1;
+            const int ndims = 2;
+
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+            ggml_set_param(ctx0, x[0]);
+
+            int n_past = irand(ne[0]);
+
+            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
+
+            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+        }
+
+        // diag_mask_zero
+        {
+            const int nargs = 1;
+            const int ndims = 2;
+
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+            ggml_set_param(ctx0, x[0]);
+
+            int n_past = irand(ne[0]);
+
+            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
+
+            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+        }
+
+        // softmax
+        {
+            const int nargs = 1;
+
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+
+            for (int ndims = 1; ndims <= 3; ++ndims) {
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
+
+                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+            }
+        }
+
+        // cross_entropy_loss
+        {
+            const int nargs = 1;
+
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+
+            for (int ndims = 1; ndims <= 3; ++ndims) {
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
+                ggml_set_param(ctx0, x[0]);
+
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
+
+                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
+                // finite differences regularly fails!
+            }
+        }
+
+        // rope f32
+        {
+            const int nargs = 1;
+
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+            ne2[0] += ne2[0] % 2;
+            int n_rot = ne2[0];
+
+            for (int ndims = 3; ndims <= 4; ++ndims) {
+                for (int mode = 0; mode < 4; ++mode) {
+                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
+                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+
+                        ggml_set_param(ctx0, x[0]);
+
+                        const bool skip_past = (mode & 1);
+                        if (skip_past) {
+                            // we have no past, so this would have to work on uninitialized memory.
+                            // we only test the gradients here;
+                            // skip_past should have no influence on gradient computation.
+                            // so when other modes work, we assume that this does as well.
+                            continue;
+                        }
+
+                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
+
+                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
+                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
+                    }
+                }
+            }
+        }
+
+        // rope f16
+        {
+            const int nargs = 1;
+
+            int64_t ne2[4];
+            get_random_dims(ne2, 4);
+            ne2[0] += ne2[0] % 2;
+            int n_rot = ne2[0];
+
+            for (int ndims = 3; ndims <= 4; ++ndims) {
+                for (int mode = 0; mode < 4; ++mode) {
+                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
+                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
+
+                        ggml_set_param(ctx0, x[0]);
+
+                        const bool skip_past = (mode & 1);
+                        if (skip_past) {
+                            // we have no past, so this would have to work on uninitialized memory.
+                            // we only test the gradients here;
+                            // skip_past should have no influence on gradient computation.
+                            // so when other modes work, we assume that this does as well.
+                            continue;
+                        }
+
+                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
+
+                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
+                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
+                    }
+                }
+            }
+        }
+
+        // flash_attn f32
+        {
+            const int nargs = 3;
+
+            int64_t ne2[4];
+
+            get_random_dims(ne2, 4);
+            int64_t D = ne2[0];
+            int64_t N = ne2[1];
+            int64_t M = ne2[2] + N;
+            int64_t B = ne2[3];
+
+            for (int masked = 0; masked <= 1; ++masked) {
+                for (int ndims = 2; ndims <= 4; ++ndims) {
+                    int64_t neq[4] = { D, N, B, ne[3] };
+                    int64_t nek[4] = { D, M, B, ne[3] };
+                    int64_t nev[4] = { M, D, B, ne[3] };
+                    if (ndims == 2) {
+                        neq[2] = 1; neq[3] = 1;
+                        nek[2] = 1; nek[3] = 1;
+                        nev[2] = 1; nev[3] = 1;
+                    } else if (ndims == 3) {
+                        neq[3] = 1;
+                        nek[3] = 1;
+                        nev[3] = 1;
+                    }
+                    x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    ggml_set_param(ctx0, x[0]);
+                    ggml_set_param(ctx0, x[1]);
+                    ggml_set_param(ctx0, x[2]);
+
+                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+                    check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+                }
+            }
+        }
+
+        // flash_attn f16, not yet fully implemented
+        if(0)
+        {
+            const int nargs = 3;
+
+            int64_t ne2[4];
+
+            get_random_dims(ne2, 4);
+            int64_t D = ne2[0];
+            int64_t N = ne2[1];
+            int64_t M = ne2[2] + N;
+            int64_t B = ne2[3];
+
+            for (int masked = 0; masked <= 1; ++masked) {
+                for (int ndims = 2; ndims <= 4; ++ndims) {
+                    int64_t neq[4] = { D, N, B, ne[3] };
+                    int64_t nek[4] = { D, M, B, ne[3] };
+                    int64_t nev[4] = { M, D, B, ne[3] };
+                    if (ndims == 2) {
+                        neq[2] = 1; neq[3] = 1;
+                        nek[2] = 1; nek[3] = 1;
+                        nev[2] = 1; nev[3] = 1;
+                    } else if (ndims == 3) {
+                        neq[3] = 1;
+                        nek[3] = 1;
+                        nev[3] = 1;
+                    }
+                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    ggml_set_param(ctx0, x[0]);
+                    ggml_set_param(ctx0, x[1]);
+                    ggml_set_param(ctx0, x[2]);
+
+                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+                }
+            }
+        }
+        ggml_free(ctx0);
+    }
+
+    return 0;
+}
diff --git a/tests/test-opt.c b/tests/test-opt.c
deleted file mode 100644
index 4eef62b..0000000
--- a/tests/test-opt.c
+++ /dev/null
@@ -1,211 +0,0 @@
-#include "ggml.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#define MAX_NARGS 2
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-//
-// logging
-//
-#define GGML_DEBUG 0
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-
-float frand(void) {
-    return (float)rand()/(float)RAND_MAX;
-}
-
-int irand(int n) {
-    return rand()%n;
-}
-
-void get_random_dims(int64_t * dims, int ndims) {
-    dims[0] = dims[1] = dims[2] = dims[3] = 1;
-
-    for (int i = 0; i < ndims; i++) {
-        dims[i] = 1 + irand(4);
-    }
-}
-
-void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
-    dims[0] = dims[1] = dims[2] = dims[3] = 1;
-
-    for (int i = 0; i < ndims; i++) {
-        dims[i] = min + irand(max-min);
-    }
-}
-
-
-struct ggml_tensor * get_random_tensor(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    };
-
-    return result;
-}
-
-float get_element(const struct ggml_tensor * t, int idx) {
-    return ((float *)t->data)[idx];
-}
-
-void set_element(struct ggml_tensor * t, int idx, float value) {
-    ((float *)t->data)[idx] = value;
-}
-
-int main(void) {
-    struct ggml_init_params params = {
-        .mem_size   = 1024*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    int64_t ne1[4] = {4, 128, 1, 1};
-    int64_t ne2[4] = {4, 256, 1, 1};;
-    int64_t ne3[4] = {128, 256, 1, 1};
-
-    struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
-    struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
-    ggml_set_param(ctx, a);
-    ggml_set_param(ctx, b);
-
-    struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);
-
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
-    struct ggml_tensor * d  = ggml_sub(ctx, c, ab);
-    struct ggml_tensor * e  = ggml_sum(ctx, ggml_sqr(ctx, d));
-
-    struct ggml_cgraph ge = ggml_build_forward(e);
-    ggml_graph_reset(&ge);
-
-    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
-
-    const float fe = ggml_get_f32_1d(e, 0);
-    printf("%s: e = %.4f\n", __func__, fe);
-
-    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
-
-    ggml_opt(ctx, opt_params, e);
-
-    ggml_graph_reset(&ge);
-
-    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
-
-    const float fe_opt = ggml_get_f32_1d(e, 0);
-    printf("%s: original  e = %.4f\n", __func__, fe);
-    printf("%s: optimized e = %.4f\n", __func__, fe_opt);
-
-    const bool success = (fe_opt <= fe);
-    assert(success);
-
-    ggml_free(ctx);
-    return success ? 0 : -1;
-}
-// int64_t ne1[4] = {4, 128, 1, 1};
-// int64_t ne2[4] = {4, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 25890.9375
-// main: optimized e = 10094.7031
-
-// int64_t ne1[4] = {8, 128, 1, 1};
-// int64_t ne2[4] = {8, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 39429.5078
-// main: optimized e = 9275.8936
-
-// int64_t ne1[4] = {16, 128, 1, 1};
-// int64_t ne2[4] = {16, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 68371.1328
-// main: optimized e = 7854.4502
-
-
-// int64_t ne1[4] = {32, 128, 1, 1};
-// int64_t ne2[4] = {32, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 126061.1953
-// main: optimized e = 5451.0166
-
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1620817.8750
-// main: optimized e = 698387.6875
-
-// another run on M1
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1629595.6250
-// main: optimized e = 698169.1250
-
-// int64_t ne1[4] = {32, 1024, 1, 1};
-// int64_t ne2[4] = {32, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 8146770.5000
-// main: optimized e = 651119.1250
diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp
new file mode 100644
index 0000000..8ab2402
--- /dev/null
+++ b/tests/test-opt.cpp
@@ -0,0 +1,212 @@
+#include "ggml.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+
+#define MAX_NARGS 2
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+//
+// logging
+//
+#define GGML_DEBUG 0
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+
+float frand(void) {
+    return (float)rand()/(float)RAND_MAX;
+}
+
+int irand(int n) {
+    return rand()%n;
+}
+
+void get_random_dims(int64_t * dims, int ndims) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = 1 + irand(4);
+    }
+}
+
+void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = min + irand(max-min);
+    }
+}
+
+
+struct ggml_tensor * get_random_tensor(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+float get_element(const struct ggml_tensor * t, int idx) {
+    return ((float *)t->data)[idx];
+}
+
+void set_element(struct ggml_tensor * t, int idx, float value) {
+    ((float *)t->data)[idx] = value;
+}
+
+int main(void) {
+    struct ggml_init_params params = {
+        /* .mem_size   = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    int64_t ne1[4] = {4, 128, 1, 1};
+    int64_t ne2[4] = {4, 256, 1, 1};;
+    int64_t ne3[4] = {128, 256, 1, 1};
+
+    struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
+    struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
+    ggml_set_param(ctx, a);
+    ggml_set_param(ctx, b);
+
+    struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);
+
+    struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
+    struct ggml_tensor * d  = ggml_sub(ctx, c, ab);
+    struct ggml_tensor * e  = ggml_sum(ctx, ggml_sqr(ctx, d));
+
+    struct ggml_cgraph ge = ggml_build_forward(e);
+    ggml_graph_reset(&ge);
+
+    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
+
+    const float fe = ggml_get_f32_1d(e, 0);
+    printf("%s: e = %.4f\n", __func__, fe);
+
+    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
+
+    ggml_opt(ctx, opt_params, e);
+
+    ggml_graph_reset(&ge);
+
+    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
+
+    const float fe_opt = ggml_get_f32_1d(e, 0);
+    printf("%s: original  e = %.4f\n", __func__, fe);
+    printf("%s: optimized e = %.4f\n", __func__, fe_opt);
+
+    const bool success = (fe_opt <= fe);
+    assert(success);
+
+    ggml_free(ctx);
+    return success ? 0 : -1;
+}
+// int64_t ne1[4] = {4, 128, 1, 1};
+// int64_t ne2[4] = {4, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 25890.9375
+// main: optimized e = 10094.7031
+
+// int64_t ne1[4] = {8, 128, 1, 1};
+// int64_t ne2[4] = {8, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 39429.5078
+// main: optimized e = 9275.8936
+
+// int64_t ne1[4] = {16, 128, 1, 1};
+// int64_t ne2[4] = {16, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 68371.1328
+// main: optimized e = 7854.4502
+
+
+// int64_t ne1[4] = {32, 128, 1, 1};
+// int64_t ne2[4] = {32, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 126061.1953
+// main: optimized e = 5451.0166
+
+// int64_t ne1[4] = {4, 1024, 1, 1};
+// int64_t ne2[4] = {4, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original  e = 1620817.8750
+// main: optimized e = 698387.6875
+
+// another run on M1
+// int64_t ne1[4] = {4, 1024, 1, 1};
+// int64_t ne2[4] = {4, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original  e = 1629595.6250
+// main: optimized e = 698169.1250
+
+// int64_t ne1[4] = {32, 1024, 1, 1};
+// int64_t ne2[4] = {32, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original  e = 8146770.5000
+// main: optimized e = 651119.1250
-- 
cgit v1.2.3