author | Johannes Gäßler <johannesg@5d6.de> | 2023-07-01 21:47:26 +0200
---|---|---
committer | GitHub <noreply@github.com> | 2023-07-01 21:47:26 +0200
commit | befb3a35627432473f143c90994557d78ff5bc67 (patch) |
tree | 877206bf325771a2dd83f7a730424b5d05c2e8b5 /llama.cpp |
parent | b2132270678c473f7cd9ba871b03d694126bc33a (diff) |
Test-based VRAM scratch size + context adjustment (#2056)
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 38 |
1 file changed, 35 insertions, 3 deletions
```diff
@@ -66,6 +66,7 @@ enum e_model {
     MODEL_65B,
 };
 
+static const size_t kB = 1024;
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
     return k_sizes;
 }
 
+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * kB },
+        { MODEL_7B,   512ull * kB },
+        { MODEL_13B,  640ull * kB },
+        { MODEL_30B,  768ull * kB },
+        { MODEL_65B, 1536ull * kB },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  128ull },
+        { MODEL_7B,  128ull },
+        { MODEL_13B, 160ull },
+        { MODEL_30B, 208ull },
+        { MODEL_65B, 416ull },
+    };
+    return k_sizes;
+}
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
@@ -1118,11 +1147,14 @@ static void llama_model_load_internal(
             fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
             ggml_cuda_set_scratch_size(0); // disable scratch
         } else {
-            vram_scratch = n_batch * MB;
+            const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+            const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+            vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
             ggml_cuda_set_scratch_size(vram_scratch);
             if (n_gpu_layers > 0) {
-                fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
-                        __func__, vram_scratch / MB);
+                fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                        __func__, vram_scratch_base / kB, vram_scratch_per_context,
+                        (vram_scratch + MB - 1) / MB); // round up
             }
         }
 #endif // GGML_USE_CUBLAS
```
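
For reference: the patch replaces the old flat allocation `vram_scratch = n_batch * MB` with a per-model base size plus a per-context-token term. The standalone sketch below is not part of the commit; the 7B table values are taken from the diff above, while `n_batch = 512` and `n_ctx = 2048` are assumed purely for illustration.

```cpp
// Illustrative sketch of the new scratch-size arithmetic for an assumed
// 7B model with n_batch = 512 and n_ctx = 2048 (values chosen for the example).
#include <cstddef>
#include <cstdio>

int main() {
    const size_t kB = 1024;
    const size_t MB = 1024 * 1024;

    // 7B entries from VRAM_REQ_SCRATCH_BASE() / VRAM_REQ_SCRATCH_PER_CONTEXT()
    const size_t vram_scratch_base        = 512 * kB; // bytes per batch element
    const size_t vram_scratch_per_context = 128;      // bytes per batch element per context token

    const size_t n_batch = 512;   // assumed batch size
    const size_t n_ctx   = 2048;  // assumed context length

    const size_t vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);

    // 512 * (512 KiB + 2048 * 128 B) = 512 * 768 KiB = 384 MiB
    printf("scratch buffer: %zu MB\n", (vram_scratch + MB - 1) / MB); // round up, as in the patch
    return 0;
}
```

Under these assumptions the new formula reserves 384 MB, versus the flat 512 MB that the previous `n_batch * MB` rule would have requested.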