| author | slaren <2141330+slaren@users.noreply.github.com> | 2023-05-01 13:32:22 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-05-01 13:32:22 +0200 |
| commit | b925f1f1b082319ee69943f8d1a83ac9b6ff09ca (patch) | |
| tree | cb636a894e6b11918aafce061f3836a24b021e4f | |
| parent | 90b19bd6eee943832584f9cac0b6f9ea29cc42a4 (diff) | |
cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails
* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
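For reference, the behavior described in the two points above boils down to: skip pinning when GGML_CUDA_NO_PINNED is set, try cudaMallocHost otherwise, and fall back to ordinary pageable memory if pinning fails. The sketch below is illustrative only, not the ggml implementation: the environment variable name and the CUDA runtime calls (cudaMallocHost, cudaFreeHost) are the real ones from the patch, while host_alloc, host_free, main, and the 64 MB size are invented for the example.

```cpp
// Illustrative sketch only -- not the ggml implementation.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Try pinned (page-locked) host memory first; fall back to pageable memory
// if pinning fails or GGML_CUDA_NO_PINNED is set.
static void * host_alloc(size_t size, bool & pinned) {
    pinned = false;
    if (getenv("GGML_CUDA_NO_PINNED") == nullptr) {
        void * ptr = nullptr;
        if (cudaMallocHost(&ptr, size) == cudaSuccess) {
            pinned = true;
            return ptr;
        }
        // pinned allocation failed (e.g. not enough lockable memory): fall through
    }
    return malloc(size); // pageable host memory
}

static void host_free(void * ptr, bool pinned) {
    if (pinned) {
        cudaFreeHost(ptr);
    } else {
        free(ptr);
    }
}

int main() {
    bool pinned = false;
    void * buf = host_alloc(64 * 1024 * 1024, pinned); // 64 MB, arbitrary
    printf("allocated %s host memory\n", pinned ? "pinned" : "pageable");
    host_free(buf, pinned);
    return 0;
}
```

Building this with nvcc and running it with `GGML_CUDA_NO_PINNED=1` forces the pageable path, which is the behavior the second bullet adds.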
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | ggml-cuda.cu | 14 |
| -rw-r--r-- | llama-util.h | 42 |
| -rw-r--r-- | llama.cpp | 3 |
3 files changed, 51 insertions, 8 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 5a2701c..c1ec306 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
 }
 
 void * ggml_cuda_host_malloc(size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
     return ptr;
 }
 
diff --git a/llama-util.h b/llama-util.h
index ca4dd16..5f9f70e 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -395,6 +395,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +406,59 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         if (addr) {
-            ggml_cuda_host_free(addr);
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
         }
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
diff --git a/llama.cpp b/llama.cpp
--- a/llama.cpp
+++ b/llama.cpp
@@ -727,8 +727,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;
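One non-obvious detail in the llama.cpp hunk: once llama_buffer's copy and move operations are deleted, the old `tmp_bufs.resize(...)` call would no longer compile, because `std::vector::resize` requires the element type to be MoveInsertable (the vector may have to relocate existing elements), whereas the sized constructor only default-constructs elements in place. That is presumably why the loader now builds the vector at its final size. A minimal sketch of the distinction, using a hypothetical stand-in type rather than llama_buffer itself:

```cpp
#include <vector>

// Stand-in for a buffer type whose copy and move are deleted,
// mirroring llama_buffer after this patch (the name is hypothetical).
struct fixed_buffer {
    fixed_buffer() = default;
    fixed_buffer(const fixed_buffer &) = delete;
    fixed_buffer(fixed_buffer &&) = delete;
    fixed_buffer & operator=(const fixed_buffer &) = delete;
    fixed_buffer & operator=(fixed_buffer &&) = delete;
};

int main() {
    std::vector<fixed_buffer> bufs(8); // OK: elements are default-constructed in place
    // std::vector<fixed_buffer> v;
    // v.resize(8);                    // error: resize requires a MoveInsertable element type
    (void) bufs;                       // silence unused-variable warnings
    return 0;
}
```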