From 254a7a7a5ff4c874ff8488f1f5cbdd7e9c89d682 Mon Sep 17 00:00:00 2001
From: Johannes Gäßler
Date: Wed, 14 Jun 2023 19:47:19 +0200
Subject: CUDA full GPU acceleration, KV cache in VRAM (#1827)

* Fixed CUDA RoPE
* ggml_cuda_mul_mat_vec_p021
* ggml_cuda_scale
* ggml_cuda_diag_mask_inf
* ggml_is_permuted
* ggml_cuda_cpy
* flatten rows for ggml_cuda_op
* Added a --low-vram option
* Fixed Windows performance
* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM
---
 ggml.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'ggml.h')

diff --git a/ggml.h b/ggml.h
index f2a9176..9b0c846 100644
--- a/ggml.h
+++ b/ggml.h
@@ -485,6 +485,7 @@ extern "C" {

     GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);

     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
--
cgit v1.2.3
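
For context, a plausible definition of the new ggml_is_permuted predicate, following the pattern of the sibling predicates ggml_is_transposed and ggml_is_contiguous. This is a sketch, not necessarily the exact body in ggml.c; it assumes the tensor struct stores per-dimension byte strides in nb[], which for a non-permuted tensor are non-decreasing from dimension 0 upward:

    #include <stdbool.h>
    #include "ggml.h"

    // Sketch: a tensor counts as permuted when its byte strides are out of
    // canonical non-decreasing order, as happens for views produced by
    // ggml_permute(). Assumes GGML_MAX_DIMS == 4 and that nb[i] is the
    // stride in bytes along dimension i.
    bool ggml_is_permuted(const struct ggml_tensor * tensor) {
        return tensor->nb[0] > tensor->nb[1] ||
               tensor->nb[1] > tensor->nb[2] ||
               tensor->nb[2] > tensor->nb[3];
    }

A check like this lets the CUDA backend detect permuted views (e.g. the KV-cache tensors used by ggml_cuda_mul_mat_vec_p021 listed above) and dispatch to kernels that handle non-contiguous layouts instead of assuming contiguous memory.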