diff options
author | Stephan Walter <stephan@walter.name> | 2023-07-05 16:13:06 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-05 19:13:06 +0300 |
commit | 1b107b8550dced48dc5f41184640061354226b96 (patch) | |
tree | a09a4c33c865828cd753c19af71c580f98735be5 /llama.cpp | |
parent | 8567c76b5326e862be0755a8dc1dd988223fcae3 (diff) |
ggml : generalize `quantize_fns` for simpler FP16 handling (#1237)
* Generalize quantize_fns for simpler FP16 handling
* Remove call to ggml_cuda_mul_mat_get_wsize
* ci : disable FMA for mac os actions
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 10 |
1 files changed, 5 insertions, 5 deletions
```diff
@@ -2257,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     }
     float * f32_output = (float *) output.addr;
 
-    quantize_fns_t qtype;
+    ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
-        qtype = ggml_internal_get_quantize_fn(tensor.type);
-        if (qtype.dequantize_row_q == NULL) {
+        qtype = ggml_internal_get_type_traits(tensor.type);
+        if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
@@ -2271,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     if (tensor.type == GGML_TYPE_F16) {
         ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
     } else if (ggml_is_quantized(tensor.type)) {
-        qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        qtype.to_float(tensor.data, f32_output, nelements);
     } else {
         LLAMA_ASSERT(false); // unreachable
     }
@@ -2296,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
             if (typ == GGML_TYPE_F16) {
                 ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
             } else {
-                qtype.dequantize_row_q(inbuf, outbuf, nels);
+                qtype.to_float(inbuf, outbuf, nels);
             }
         };
         workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
```