CUDA: tighter VRAM scratch size for 65b/70b (#2551)

author: Johannes Gäßler <johannesg@5d6.de> 2023-08-08 14:38:16 +0200
committer: GitHub <noreply@github.com> 2023-08-08 14:38:16 +0200
commit: acfc5478ff3446ca3b54553967a3dea09b7c771a (patch)
tree: aa8fd4955191bb4374e67906a5c3fe7cefb8bc25 /llama.cpp
parent: 7ed8d1fe7f8cbe6a6763e6b46759795ac8d21e12 (diff)
1 file changed, 6 insertions, 6 deletions
diff --git a/llama.cpp b/llama.cpp
index 39aefd4..71061aa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -149,7 +149,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }
 
 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the value for 3b is not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +157,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_7B,   512ull * kB },
         { MODEL_13B,  640ull * kB },
         { MODEL_30B,  768ull * kB },
-        { MODEL_65B, 1536ull * kB },
-        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
+        { MODEL_65B, 1280ull * kB },
+        { MODEL_70B, 1280ull * kB },
     };
     return k_sizes;
 }
 
 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the value for 3b is not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +172,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_7B,  128ull },
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
-        { MODEL_65B, 416ull },
-        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
+        { MODEL_65B, 256ull },
+        { MODEL_70B, 256ull },
     };
     return k_sizes;
 }
author	Johannes Gäßler <johannesg@5d6.de>	2023-08-08 14:38:16 +0200
committer	GitHub <noreply@github.com>	2023-08-08 14:38:16 +0200
commit	acfc5478ff3446ca3b54553967a3dea09b7c771a (patch)
tree	aa8fd4955191bb4374e67906a5c3fe7cefb8bc25 /llama.cpp
parent	7ed8d1fe7f8cbe6a6763e6b46759795ac8d21e12 (diff)