author     Johannes Gäßler <johannesg@5d6.de>  2023-07-31 15:44:35 +0200
committer  GitHub <noreply@github.com>         2023-07-31 15:44:35 +0200
commit     0728c5a8b9569183ffca0399caac099afef87595 (patch)
tree       58915b38ddcc28bda0171925548d6b4d6fea2707 /llama.cpp
parent     1215ed7d5ccf854a55eccb52661427bb985e7f2c (diff)
CUDA: mmq CLI option, fixed mmq build issues (#2453)
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  10
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 50da427..d427054 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -901,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
+        /*.mul_mat_q                   =*/ false,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
@@ -1028,6 +1029,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1156,9 +1158,11 @@ static void llama_model_load_internal(
     }

     (void) main_gpu;
+    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
+    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1367,6 +1371,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1377,7 +1382,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+                main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -3192,7 +3198,7 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                params.progress_callback_user_data)) {
         delete model;
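
For context: the change threads a single mul_mat_q flag from llama_context_params through llama_model_load/llama_model_load_internal down to ggml_cuda_set_mul_mat_q(), which selects the quantized matrix-multiplication (mmq) CUDA kernels. The following caller-side sketch shows how the flag is set after this commit. It is not part of the commit itself: the model path and n_gpu_layers value are placeholders, and it assumes the llama.h declarations matching this diff in a build with GGML_USE_CUBLAS.

#include <cstdio>
#include "llama.h"

int main() {
    // Start from the defaults; per the first hunk, mul_mat_q defaults to false.
    llama_context_params params = llama_context_default_params();
    params.mul_mat_q    = true;  // opt in to the quantized (mmq) CUDA kernels
    params.n_gpu_layers = 32;    // placeholder offload count, adjust to available VRAM

    // "model.bin" is a placeholder path.
    llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, params);
    // ... run inference as usual; quantized mat-muls on CUDA now use the mmq kernels ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}

Because the flag defaults to false, existing callers keep the cuBLAS path unless they opt in.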