From 7f9753fa1263c4eded9a3de19778562f0e1093d7 Mon Sep 17 00:00:00 2001
From: Johannes Gäßler
Date: Wed, 28 Jun 2023 18:35:54 +0200
Subject: CUDA GPU acceleration for LoRAs + f16 models (#1970)

---
 llama.cpp | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 5a142ab..5f3761b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2976,7 +2976,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
                         return false;
                     }
         }
-        ggml_tensor* lora_tensor;
+        ggml_tensor * lora_tensor;
         if (n_dims == 2) {
             lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
         }
@@ -2984,6 +2984,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
             return 1;
         }
+        ggml_set_name(lora_tensor, "lora_tensor");
 
         // load tensor data
         size_t offset = fin.tellg();
@@ -2999,6 +3000,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * dest_t = model_tensors[base_name];
+
+            offload_func_t offload_func = llama_nop;
+            offload_func_t offload_func_force_inplace = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+                if (dest_t->type != GGML_TYPE_F16) {
+                    throw std::runtime_error(format(
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+                }
+                offload_func = ggml_cuda_assign_buffers;
+                offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+            }
+#endif // GGML_USE_CUBLAS
+
             ggml_tensor * base_t;
             if (model_loader) {
                 // load from base model
@@ -3026,7 +3042,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }
 
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+            ggml_set_name(loraA, "loraA");
+
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+            GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+            ggml_set_name(loraB, "loraB");
 
             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3036,19 +3057,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
             // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            offload_func(BA);
+            ggml_set_name(BA, "BA");
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_set_name(scale_tensor, "scale_tensor");
+
                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                offload_func(BA);
+                ggml_set_name(BA, "BA_scaled");
             }
 
             ggml_tensor * r;
             if (base_t == dest_t) {
                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                offload_func_force_inplace(r);
+                ggml_set_name(r, "r_add_inplace");
             }
             else {
                 r = ggml_add(lora_ctx, base_t, BA);
+                offload_func(r);
+                ggml_set_name(r, "r_add");
+
                 r = ggml_cpy(lora_ctx, r, dest_t);
+                offload_func(r);
+                ggml_set_name(r, "r_cpy");
             }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
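The mechanism of the patch is to route every node of the LoRA graph through an offload callback: offload_func starts out as llama_nop (a no-op, so nodes stay on the CPU) and is swapped for ggml_cuda_assign_buffers only when the build has cuBLAS and the destination weight already lives on the GPU; the force-inplace variant covers the in-place add. Below is a self-contained sketch of just that dispatch pattern; fake_tensor and the stub function bodies are placeholders invented for the sketch, not llama.cpp code.

    #include <cstdio>

    // Sketch of the offload-callback dispatch used by the patch. offload_func_t,
    // llama_nop, and ggml_cuda_assign_buffers are names from the diff above;
    // fake_tensor and the printf body are stand-ins so the sketch compiles alone.
    struct fake_tensor {
        const char * name;
    };

    typedef void (*offload_func_t)(fake_tensor * t);

    // Default: do nothing, the node stays on the CPU.
    static void llama_nop(fake_tensor * /*t*/) {}

    #ifdef GGML_USE_CUBLAS
    // Stub standing in for the real CUDA buffer assignment in ggml-cuda.
    static void ggml_cuda_assign_buffers(fake_tensor * t) {
        printf("assigning %s to a GPU buffer\n", t->name);
    }
    #endif

    int main() {
        offload_func_t offload_func = llama_nop;     // CPU by default
    #ifdef GGML_USE_CUBLAS
        offload_func = ggml_cuda_assign_buffers;     // cuBLAS build: offload
    #endif
        fake_tensor BA = { "BA" };
        offload_func(&BA); // each new graph node is passed through the callback
        return 0;
    }

The no-op default keeps the graph-building code branch-free: it calls offload_func unconditionally on every node, and the build configuration plus the tensor's backend decide what the call actually does.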
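The graph itself implements the update named in the source comment, w = w + BA*s: multiply the two low-rank LoRA matrices, scale the product, and add it onto the base weight (in place when base and destination coincide, via an add-and-copy otherwise). Here is a minimal CPU-only sketch of that arithmetic on plain row-major float arrays; apply_lora and all dimensions are illustrative, not the ggml API used above.

    #include <cstdio>
    #include <vector>

    // w = w + BA*s on row-major arrays: w is n_out x n_in (updated in place),
    // A is r x n_in, B is n_out x r, so B*A restores the full n_out x n_in shape.
    static void apply_lora(std::vector<float> & w,
                           const std::vector<float> & A,
                           const std::vector<float> & B,
                           int n_out, int n_in, int r, float scaling) {
        for (int i = 0; i < n_out; ++i) {
            for (int j = 0; j < n_in; ++j) {
                float ba = 0.0f;
                for (int k = 0; k < r; ++k) {
                    ba += B[i*r + k] * A[k*n_in + j];   // (B*A)[i][j]
                }
                w[i*n_in + j] += scaling * ba;          // w = w + BA*s
            }
        }
    }

    int main() {
        const int n_out = 2, n_in = 3, r = 1;           // rank-1 adapter
        std::vector<float> w(n_out * n_in, 1.0f);       // base weight, all ones
        std::vector<float> A = { 0.5f, 0.0f, 0.5f };    // 1 x 3
        std::vector<float> B = { 1.0f, 2.0f };          // 2 x 1
        apply_lora(w, A, B, n_out, n_in, r, /*scaling=*/2.0f);
        printf("%.1f %.1f %.1f\n", w[0], w[1], w[2]);   // prints: 2.0 1.0 2.0
        return 0;
    }

In the patched code these three steps are ggml graph nodes, so a cuBLAS build can execute the whole update on the GPU; the new type check makes LoRA loading fail with the f16-only error above when the offloaded base weights are quantized.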