author     Johannes Gäßler <johannesg@5d6.de>    2023-08-08 14:38:16 +0200
committer  GitHub <noreply@github.com>           2023-08-08 14:38:16 +0200
commit     acfc5478ff3446ca3b54553967a3dea09b7c771a (patch)
tree       aa8fd4955191bb4374e67906a5c3fe7cefb8bc25
parent     7ed8d1fe7f8cbe6a6763e6b46759795ac8d21e12 (diff)
CUDA: tighter VRAM scratch size for 65b/70b (#2551)
-rw-r--r--  llama.cpp  12
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 39aefd4..71061aa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -149,7 +149,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }
 
 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +157,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_7B, 512ull * kB },
         { MODEL_13B, 640ull * kB },
         { MODEL_30B, 768ull * kB },
-        { MODEL_65B, 1536ull * kB },
-        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
+        { MODEL_65B, 1280ull * kB },
+        { MODEL_70B, 1280ull * kB },
     };
     return k_sizes;
 }
 
 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +172,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_7B, 128ull },
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
-        { MODEL_65B, 416ull },
-        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
+        { MODEL_65B, 256ull },
+        { MODEL_70B, 256ull },
     };
     return k_sizes;
 }
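
Note: below is a minimal, standalone sketch of how the two tables above might translate into an actual VRAM scratch reservation. The combining formula n_batch * (base + n_ctx * per_context) is an assumption inferred from the comments ("per batch size" and "per batch size and context"), not code taken from this commit, and the helper name is illustrative.

// Sketch only: estimates the CUDA VRAM scratch reservation from the two
// tables changed in this commit. The combining formula is an assumption
// inferred from the comments, not code copied from llama.cpp.
#include <cstddef>
#include <cstdio>

static const size_t kB = 1024;

// Hypothetical helper: scratch bytes for a given base/per-context pair.
static size_t vram_scratch_estimate(size_t n_batch, size_t n_ctx,
                                    size_t base, size_t per_context) {
    return n_batch * (base + n_ctx * per_context);
}

int main() {
    const size_t n_batch = 512;
    const size_t n_ctx   = 4096;

    // MODEL_70B values before and after this commit.
    const size_t old_bytes = vram_scratch_estimate(n_batch, n_ctx, 1536 * kB, 416);
    const size_t new_bytes = vram_scratch_estimate(n_batch, n_ctx, 1280 * kB, 256);

    printf("70B scratch estimate: %.0f MiB -> %.0f MiB\n",
           old_bytes / (1024.0 * 1024.0), new_bytes / (1024.0 * 1024.0));
    return 0;
}

Under that assumed formula, at n_batch = 512 and n_ctx = 4096 the 70B estimate drops from 1600 MiB to 1152 MiB, which is the tightening the commit title refers to.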