ggml, llama : avoid heavy V transpose + improvements (#775)

ggml :
- added ggml_view_3d()
- ggml_view_tensor() now inherits the stride too
- reimplement ggml_cpy() to account for dst stride
- no longer require tensor->data to be memory aligned

llama :
- compute RoPE on 32-bit tensors (should be more accurate)
- store RoPE-ed K in the KV cache
- store transposed V in the KV cache (significant speed-up)
- avoid unnecessary Q copy
author: Georgi Gerganov <ggerganov@gmail.com> 2023-04-05 22:07:33 +0300
committer: GitHub <noreply@github.com> 2023-04-05 22:07:33 +0300
commit: 986b6ce9f99503c51ec5afd8a10baa32359434c6 (patch)
tree: f4655b45b130b908729eb1407ca9e016c05f21a4 /ggml.h
parent: 34162989297fdfe3ab7305451ce55bc87e3f4c9c (diff)
1 files changed, 10 insertions, 0 deletions
diff --git a/ggml.h b/ggml.h
index ad962b1..3c94efc 100644
--- a/ggml.h
+++ b/ggml.h
@@ -558,6 +558,16 @@ struct ggml_tensor * ggml_view_2d(
         size_t                nb1, // row stride in bytes
         size_t                offset);
 
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1, // row   stride in bytes
+        size_t                nb2, // slice stride in bytes
+        size_t                offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
author	Georgi Gerganov <ggerganov@gmail.com>	2023-04-05 22:07:33 +0300
committer	GitHub <noreply@github.com>	2023-04-05 22:07:33 +0300
commit	986b6ce9f99503c51ec5afd8a10baa32359434c6 (patch)
tree	f4655b45b130b908729eb1407ca9e016c05f21a4 /ggml.h
parent	34162989297fdfe3ab7305451ce55bc87e3f4c9c (diff)