author | Johannes Gäßler <johannesg@5d6.de> | 2023-08-08 14:38:16 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-08-08 14:38:16 +0200 |
commit | acfc5478ff3446ca3b54553967a3dea09b7c771a (patch) | |
tree | aa8fd4955191bb4374e67906a5c3fe7cefb8bc25 | |
parent | 7ed8d1fe7f8cbe6a6763e6b46759795ac8d21e12 (diff) | |
CUDA: tighter VRAM scratch size for 65b/70b (#2551)
-rw-r--r-- | llama.cpp | 12 |
1 file changed, 6 insertions, 6 deletions
```diff
@@ -149,7 +149,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }
 
 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +157,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_7B,   512ull * kB },
         { MODEL_13B,  640ull * kB },
         { MODEL_30B,  768ull * kB },
-        { MODEL_65B, 1536ull * kB },
-        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
+        { MODEL_65B, 1280ull * kB },
+        { MODEL_70B, 1280ull * kB },
     };
     return k_sizes;
 }
 
 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +172,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_7B,  128ull },
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
-        { MODEL_65B, 416ull },
-        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
+        { MODEL_65B, 256ull },
+        { MODEL_70B, 256ull },
     };
     return k_sizes;
 }
```
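For context, a minimal sketch of how the two tables are presumably combined into the per-model scratch estimate. The formula `n_batch * (base + n_ctx * per_context)` and the default `n_batch`/`n_ctx` values below are assumptions for illustration, not part of this diff:

```cpp
#include <cstddef>
#include <cstdio>

// Minimal sketch, not code from this commit: assumes the VRAM scratch estimate
// is n_batch * (base + n_ctx * per_context), with kB = 1024 as in llama.cpp.
static const size_t kB = 1024;

int main() {
    const size_t n_batch = 512;  // assumed batch size
    const size_t n_ctx   = 4096; // assumed context length for a 70B model

    // Old vs. new MODEL_70B entries from the diff:
    const size_t old_base = 1536 * kB, old_per_ctx = 416;
    const size_t new_base = 1280 * kB, new_per_ctx = 256;

    // Hypothetical combination of the two tables.
    const size_t vram_scratch_old = n_batch * (old_base + n_ctx * old_per_ctx);
    const size_t vram_scratch_new = n_batch * (new_base + n_ctx * new_per_ctx);

    printf("old scratch estimate: %zu MiB\n", vram_scratch_old / (1024 * 1024));
    printf("new scratch estimate: %zu MiB\n", vram_scratch_new / (1024 * 1024));
    return 0;
}
```

Under those assumed defaults, the tighter 65b/70b entries would reserve a few hundred MiB less VRAM scratch per load, in line with the commit title.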