author | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-23 23:22:01 +0200
---|---|---
committer | GitHub <noreply@github.com> | 2023-03-23 23:22:01 +0200
commit | 483bab2e3d4a868fe679d8bb32827d2a4df214dc |
tree | 0ae7226ed8874d487ea0dfb7703a05bc11616967 |
parent | 404e1da38ec8025707031a8027da14dc1590f952 |
Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439)
Should make results reproducible for different numbers of threads and batch sizes
-rw-r--r-- | llama.cpp | 12
1 file changed, 7 insertions(+), 5 deletions(-)
```diff
@@ -727,11 +727,13 @@ static bool llama_eval_internal(
         // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
         struct ggml_tensor * V_trans =
-            ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                        n_embd/n_head, n_head, n_past + N),
-                    1, 2, 0, 3);
+            ggml_cpy(ctx0,
+                ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        1, 2, 0, 3),
+                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));

         // KQV = transpose(V) * KQ_soft_max
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
```
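The change uses `ggml_cpy` to materialize the permuted view of the V cache into a fresh contiguous F32 tensor before the `ggml_mul_mat`, so the multiplication no longer goes through the transposed-X code path. The reproducibility claim in the commit message presumably comes down to floating-point addition not being associative: if the accumulation order inside the matmul depends on how work is partitioned (e.g. on the thread count or batch size), the low-order bits of the result can change from run to run. Below is a minimal standalone C++ sketch (not from the commit, and not ggml code) demonstrating that non-associativity:

```cpp
// Minimal sketch: the same three floats summed in two different orders
// give different results, because float addition is not associative.
// A parallel matmul that changes its accumulation order with the thread
// count can therefore produce slightly different outputs; fixing the
// order (one contiguous pass per output element) makes it reproducible.
#include <cstdio>

int main() {
    const float x[3] = { 1e8f, 1.0f, -1e8f };

    // Order A: (x0 + x1) + x2. 1e8f + 1.0f rounds back to 1e8f because
    // 1.0f is below the float spacing (ulp = 8) at that magnitude, so a == 0.
    const float a = (x[0] + x[1]) + x[2];

    // Order B: (x0 + x2) + x1. The large terms cancel exactly first, so b == 1.
    const float b = (x[0] + x[2]) + x[1];

    printf("order A: %g\norder B: %g\n", a, b); // prints 0 and 1

    return 0;
}
```

The cost of this fix is one extra copy of the V cache slice per layer, but in exchange every dot product in `Z = X * Y` is accumulated over a contiguous row in a fixed order, independent of threading.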