author | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-23 23:22:01 +0200
---|---|---
committer | GitHub <noreply@github.com> | 2023-03-23 23:22:01 +0200
commit | 483bab2e3d4a868fe679d8bb32827d2a4df214dc |
tree | 0ae7226ed8874d487ea0dfb7703a05bc11616967 |
parent | 404e1da38ec8025707031a8027da14dc1590f952 |
Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439)
Should make results reproducible for different numbers of threads and batch sizes
-rw-r--r-- | llama.cpp | 12
1 file changed, 7 insertions(+), 5 deletions(-)
```diff
@@ -727,11 +727,13 @@ static bool llama_eval_internal(
         // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
         struct ggml_tensor * V_trans =
-            ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                        n_embd/n_head, n_head, n_past + N),
-                    1, 2, 0, 3);
+            ggml_cpy(ctx0,
+                ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        1, 2, 0, 3),
+                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));

         // KQV = transpose(V) * KQ_soft_max
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
```
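The change uses `ggml_cpy` to materialize the permuted view of the V cache into a fresh contiguous F32 tensor before the `ggml_mul_mat`, so the multiplication no longer goes through the transposed-X code path. The reproducibility claim in the commit message presumably comes down to floating-point addition not being associative: if the accumulation order inside the matmul depends on how work is partitioned (e.g. on the thread count or batch size), the low-order bits of the result can change from run to run. Below is a minimal standalone C++ sketch (not from the commit, and not ggml code) demonstrating that non-associativity:

```cpp
// Minimal sketch: the same three floats summed in two different orders
// give different results, because float addition is not associative.
// A parallel matmul that changes its accumulation order with the thread
// count can therefore produce slightly different outputs; fixing the
// order (one contiguous pass per output element) makes it reproducible.
#include <cstdio>

int main() {
    const float x[3] = { 1e8f, 1.0f, -1e8f };

    // Order A: (x0 + x1) + x2. 1e8f + 1.0f rounds back to 1e8f because
    // 1.0f is below the float spacing (ulp = 8) at that magnitude, so a == 0.
    const float a = (x[0] + x[1]) + x[2];

    // Order B: (x0 + x2) + x1. The large terms cancel exactly first, so b == 1.
    const float b = (x[0] + x[2]) + x[1];

    printf("order A: %g\norder B: %g\n", a, b); // prints 0 and 1

    return 0;
}
```

The cost of this fix is one extra copy of the V cache slice per layer, but in exchange every dot product in `Z = X * Y` is accumulated over a contiguous row in a fixed order, independent of threading.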