author     Stephan Walter <stephan@walter.name>      2023-07-05 16:13:06 +0000
committer  GitHub <noreply@github.com>               2023-07-05 19:13:06 +0300
commit     1b107b8550dced48dc5f41184640061354226b96 (patch)
tree       a09a4c33c865828cd753c19af71c580f98735be5 /llama.cpp
parent     8567c76b5326e862be0755a8dc1dd988223fcae3 (diff)
ggml : generalize `quantize_fns` for simpler FP16 handling (#1237)
* Generalize quantize_fns for simpler FP16 handling

* Remove call to ggml_cuda_mul_mat_get_wsize

* ci : disable FMA for mac os actions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
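For orientation before the diff: the patch swaps the old quantize_fns_t function table for a per-type ggml_type_traits_t, and the row-dequantization slot is renamed from dequantize_row_q to to_float. The sketch below is an illustrative reconstruction of that shape only, inferred from how qtype.to_float(tensor.data, f32_output, nelements) is called in the hunks; the callback typedef name and any fields beyond to_float are assumptions, not the exact ggml declaration.

// Illustrative sketch (C++), not the real ggml header.
// Only to_float is taken from this patch; everything else is assumed.
typedef void (*to_float_sketch_t)(const void * x, float * y, int n);

struct ggml_type_traits_sketch_t {
    to_float_sketch_t to_float;   // was quantize_fns_t::dequantize_row_q
    // further quantization / dot-product callbacks omitted in this sketch
};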
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  10
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index e04fbfc..7a866cb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2257,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     }
     float * f32_output = (float *) output.addr;
 
-    quantize_fns_t qtype;
+    ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
-        qtype = ggml_internal_get_quantize_fn(tensor.type);
-        if (qtype.dequantize_row_q == NULL) {
+        qtype = ggml_internal_get_type_traits(tensor.type);
+        if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
@@ -2271,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
         if (tensor.type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor.type)) {
-            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+            qtype.to_float(tensor.data, f32_output, nelements);
         } else {
             LLAMA_ASSERT(false); // unreachable
         }
@@ -2296,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
             if (typ == GGML_TYPE_F16) {
                 ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
             } else {
-                qtype.dequantize_row_q(inbuf, outbuf, nels);
+                qtype.to_float(inbuf, outbuf, nels);
             }
         };
         workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
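Addendum: a hedged, self-contained toy that mirrors the control flow the patched llama_convert_tensor_internal follows, i.e. look up the type's traits, require a to_float callback for quantized types, and reject anything that is neither quantized nor F16. Every name below is a local mock chosen for illustration; only the shape of the dispatch is taken from the hunks above.

// Toy re-creation of the post-patch dequantization dispatch (C++).
// All symbols are mocks; only the control flow mirrors the diff above.
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

enum class MockType { F16, Q8_MOCK, OTHER };

// Mock dequantizer for a fake 8-bit quantized type: each byte becomes byte / 16.0f.
static void mock_q8_to_float(const void * x, float * y, int n) {
    const int8_t * q = static_cast<const int8_t *>(x);
    for (int i = 0; i < n; ++i) {
        y[i] = q[i] / 16.0f;
    }
}

// Mock trait lookup: only the fake quantized type carries a to_float entry.
struct MockTypeTraits { void (*to_float)(const void *, float *, int); };

static MockTypeTraits mock_get_type_traits(MockType t) {
    MockTypeTraits traits = { nullptr };
    if (t == MockType::Q8_MOCK) {
        traits.to_float = mock_q8_to_float;
    }
    return traits;
}

int main() {
    // Quantized path: traits.to_float; F16 would use a dedicated row conversion;
    // anything else is rejected, as in the patched code.
    const MockType type = MockType::Q8_MOCK;
    const std::vector<int8_t> data = { 16, -32, 48 };
    std::vector<float> out(data.size());

    const MockTypeTraits traits = mock_get_type_traits(type);
    if (type == MockType::Q8_MOCK) {
        if (traits.to_float == nullptr) {
            throw std::runtime_error("no dequantization available");
        }
        traits.to_float(data.data(), out.data(), (int) out.size());
    } else if (type != MockType::F16) {
        throw std::runtime_error("cannot dequantize/convert tensor type");
    }

    for (float v : out) {
        std::printf("%.2f ", v);   // prints: 1.00 -2.00 3.00
    }
    std::printf("\n");
    return 0;
}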