diff options
author | slaren <2141330+slaren@users.noreply.github.com> | 2023-05-01 13:32:22 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-01 13:32:22 +0200 |
commit | b925f1f1b082319ee69943f8d1a83ac9b6ff09ca (patch) | |
tree | cb636a894e6b11918aafce061f3836a24b021e4f /llama.cpp | |
parent | 90b19bd6eee943832584f9cac0b6f9ea29cc42a4 (diff) |
cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails
* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 3 |
1 file changed, 1 insertion(+), 2 deletions(-)
@@ -727,8 +727,7 @@ struct llama_model_loader {
                 LLAMA_ASSERT(offset == lt.size);
             } else if (lt.split_type == SPLIT_BY_COLUMNS) {
                 // Let's load the data into temporary buffers to ensure the OS performs large loads.
-                std::vector<llama_buffer> tmp_bufs;
-                tmp_bufs.resize(lt.shards.size());
+                std::vector<llama_buffer> tmp_bufs(lt.shards.size());
                 for (size_t i = 0; i < lt.shards.size(); i++) {
                     llama_load_tensor_shard & shard = lt.shards.at(i);
                     llama_file & file = file_loaders.at(shard.file_idx)->file;