author    slaren <2141330+slaren@users.noreply.github.com>  2023-05-01 13:32:22 +0200
committer GitHub <noreply@github.com>  2023-05-01 13:32:22 +0200
commit    b925f1f1b082319ee69943f8d1a83ac9b6ff09ca (patch)
tree      cb636a894e6b11918aafce061f3836a24b021e4f /llama-util.h
parent    90b19bd6eee943832584f9cac0b6f9ea29cc42a4 (diff)
cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails
* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
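
The allocator side of this change presumably lands in ggml-cuda.cu, which is outside this page's diffstat. As a rough sketch only (not the patch itself, assuming nothing beyond the standard CUDA runtime API), ggml_cuda_host_malloc can honor GGML_CUDA_NO_PINNED and signal failure by returning NULL, which is the contract the llama-util.h code below relies on:

    // Sketch only; the real allocator change is not shown on this page.
    // Returning NULL on failure is what lets llama_ctx_buffer::resize()
    // below fall back to pageable memory.
    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    void * ggml_cuda_host_malloc(size_t size) {
        // Opt out of pinned memory entirely if the env variable is set.
        if (getenv("GGML_CUDA_NO_PINNED") != NULL) {
            return NULL;
        }
        void * ptr = NULL;
        cudaError_t err = cudaMallocHost((void **) &ptr, size);
        if (err != cudaSuccess) {
            fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
                size / 1024.0 / 1024.0, cudaGetErrorString(err));
            return NULL; // caller treats NULL as "use pageable memory"
        }
        return ptr;
    }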
Diffstat (limited to 'llama-util.h')
-rw-r--r--  llama-util.h  42
1 file changed, 38 insertions(+), 4 deletions(-)
diff --git a/llama-util.h b/llama-util.h
index ca4dd16..5f9f70e 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -395,6 +395,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +406,59 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         if (addr) {
-            ggml_cuda_host_free(addr);
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
         }
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
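
Taken together, resize() now releases any previous allocation before reallocating, and the is_cuda flag recorded at allocation time routes the release through the matching deallocator (ggml_cuda_host_free vs delete[]). Deleting the copy and move operations keeps two buffers from ever owning the same pointer. A minimal usage sketch, hypothetical and not part of the patch, assuming a GGML_USE_CUBLAS build:

    llama_ctx_buffer buf;
    buf.resize(16u * 1024 * 1024);  // tries pinned memory first; on failure (or
                                    // with GGML_CUDA_NO_PINNED set) falls back
                                    // to new[] and records is_cuda = false
    // ... fill buf.addr and hand it to the cuBLAS-backed code ...
    buf.resize(32u * 1024 * 1024);  // free() runs first, so the old block is
                                    // released with the matching deallocator
    // the destructor calls free() once more; since copy/move are deleted,
    // no second llama_ctx_buffer can double-free the same addr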