diff options
author | slaren <2141330+slaren@users.noreply.github.com> | 2023-05-01 13:32:22 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-01 13:32:22 +0200 |
commit | b925f1f1b082319ee69943f8d1a83ac9b6ff09ca (patch) | |
tree | cb636a894e6b11918aafce061f3836a24b021e4f /llama.cpp | |
parent | 90b19bd6eee943832584f9cac0b6f9ea29cc42a4 (diff) |
cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails
* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 3 |
1 file changed, 1 insertion(+), 2 deletions(-)
@@ -727,8 +727,7 @@ struct llama_model_loader {
                 LLAMA_ASSERT(offset == lt.size);
             } else if (lt.split_type == SPLIT_BY_COLUMNS) {
                 // Let's load the data into temporary buffers to ensure the OS performs large loads.
-                std::vector<llama_buffer> tmp_bufs;
-                tmp_bufs.resize(lt.shards.size());
+                std::vector<llama_buffer> tmp_bufs(lt.shards.size());
                 for (size_t i = 0; i < lt.shards.size(); i++) {
                     llama_load_tensor_shard & shard = lt.shards.at(i);
                     llama_file & file = file_loaders.at(shard.file_idx)->file;