aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorslaren <2141330+slaren@users.noreply.github.com>2023-05-01 13:32:22 +0200
committerGitHub <noreply@github.com>2023-05-01 13:32:22 +0200
commitb925f1f1b082319ee69943f8d1a83ac9b6ff09ca (patch)
treecb636a894e6b11918aafce061f3836a24b021e4f
parent90b19bd6eee943832584f9cac0b6f9ea29cc42a4 (diff)
cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails * cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
-rw-r--r--ggml-cuda.cu14
-rw-r--r--llama-util.h42
-rw-r--r--llama.cpp3
3 files changed, 51 insertions, 8 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 5a2701c..c1ec306 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
}
void * ggml_cuda_host_malloc(size_t size) {
- void * ptr;
- CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+ if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+ return nullptr;
+ }
+
+ void * ptr = nullptr;
+ cudaError_t err = cudaMallocHost((void **) &ptr, size);
+ if (err != cudaSuccess) {
+ fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+ size/1024.0/1024.0, cudaGetErrorString(err));
+ return nullptr;
+ }
+
return ptr;
}
diff --git a/llama-util.h b/llama-util.h
index ca4dd16..5f9f70e 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -395,6 +395,8 @@ struct llama_buffer {
uint8_t * addr = NULL;
size_t size = 0;
+ llama_buffer() = default;
+
void resize(size_t size) {
delete[] addr;
addr = new uint8_t[size];
@@ -404,27 +406,59 @@ struct llama_buffer {
~llama_buffer() {
delete[] addr;
}
+
+ // disable copy and move
+ llama_buffer(const llama_buffer&) = delete;
+ llama_buffer(llama_buffer&&) = delete;
+ llama_buffer& operator=(const llama_buffer&) = delete;
+ llama_buffer& operator=(llama_buffer&&) = delete;
};
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
+ bool is_cuda;
size_t size = 0;
+ llama_ctx_buffer() = default;
+
void resize(size_t size) {
+ free();
+
+ addr = (uint8_t *) ggml_cuda_host_malloc(size);
if (addr) {
- ggml_cuda_host_free(addr);
+ is_cuda = true;
+ }
+ else {
+ // fall back to pageable memory
+ addr = new uint8_t[size];
+ is_cuda = false;
}
- addr = (uint8_t *) ggml_cuda_host_malloc(size);
this->size = size;
}
- ~llama_ctx_buffer() {
+ void free() {
if (addr) {
- ggml_cuda_host_free(addr);
+ if (is_cuda) {
+ ggml_cuda_host_free(addr);
+ }
+ else {
+ delete[] addr;
+ }
}
+ addr = NULL;
}
+
+ ~llama_ctx_buffer() {
+ free();
+ }
+
+ // disable copy and move
+ llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+ llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+ llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+ llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
diff --git a/llama.cpp b/llama.cpp
index 3d82113..0d094a5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -727,8 +727,7 @@ struct llama_model_loader {
LLAMA_ASSERT(offset == lt.size);
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
// Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs;
- tmp_bufs.resize(lt.shards.size());
+ std::vector<llama_buffer> tmp_bufs(lt.shards.size());
for (size_t i = 0; i < lt.shards.size(); i++) {
llama_load_tensor_shard & shard = lt.shards.at(i);
llama_file & file = file_loaders.at(shard.file_idx)->file;