diff options
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 10 |
1 files changed, 8 insertions, 2 deletions
@@ -901,6 +901,7 @@ struct llama_context_params llama_context_default_params() { /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, + /*.mul_mat_q =*/ false, /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.vocab_only =*/ false, @@ -1028,6 +1029,7 @@ static void llama_model_load_internal( int n_gpu_layers, int main_gpu, const float * tensor_split, + const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, bool low_vram, @@ -1156,9 +1158,11 @@ static void llama_model_load_internal( } (void) main_gpu; + (void) mul_mat_q; #if defined(GGML_USE_CUBLAS) fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); + ggml_cuda_set_mul_mat_q(mul_mat_q); #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT #elif defined(GGML_USE_CLBLAST) @@ -1367,6 +1371,7 @@ static bool llama_model_load( int n_gpu_layers, int main_gpu, const float * tensor_split, + const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, bool low_vram, @@ -1377,7 +1382,8 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, + main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -3192,7 +3198,7 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram, + params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { delete model; |