author     Stephan Walter <stephan@walter.name>      2023-07-05 16:13:06 +0000
committer  GitHub <noreply@github.com>               2023-07-05 19:13:06 +0300
commit     1b107b8550dced48dc5f41184640061354226b96 (patch)
tree       a09a4c33c865828cd753c19af71c580f98735be5 /llama.cpp
parent     8567c76b5326e862be0755a8dc1dd988223fcae3 (diff)
ggml : generalize `quantize_fns` for simpler FP16 handling (#1237)
* Generalize quantize_fns for simpler FP16 handling

* Remove call to ggml_cuda_mul_mat_get_wsize

* ci : disable FMA for mac os actions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
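For orientation before the diff: the patch swaps the old quantize_fns_t function table for a per-type ggml_type_traits_t, and the row-dequantization slot is renamed from dequantize_row_q to to_float. The sketch below is an illustrative reconstruction of that shape only, inferred from how qtype.to_float(tensor.data, f32_output, nelements) is called in the hunks; the callback typedef name and any fields beyond to_float are assumptions, not the exact ggml declaration.

// Illustrative sketch (C++), not the real ggml header.
// Only to_float is taken from this patch; everything else is assumed.
typedef void (*to_float_sketch_t)(const void * x, float * y, int n);

struct ggml_type_traits_sketch_t {
    to_float_sketch_t to_float;   // was quantize_fns_t::dequantize_row_q
    // further quantization / dot-product callbacks omitted in this sketch
};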
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  10
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index e04fbfc..7a866cb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2257,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     }
     float * f32_output = (float *) output.addr;
 
-    quantize_fns_t qtype;
+    ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
-        qtype = ggml_internal_get_quantize_fn(tensor.type);
-        if (qtype.dequantize_row_q == NULL) {
+        qtype = ggml_internal_get_type_traits(tensor.type);
+        if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
@@ -2271,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
         if (tensor.type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor.type)) {
-            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+            qtype.to_float(tensor.data, f32_output, nelements);
         } else {
             LLAMA_ASSERT(false); // unreachable
         }
@@ -2296,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
             if (typ == GGML_TYPE_F16) {
                 ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
             } else {
-                qtype.dequantize_row_q(inbuf, outbuf, nels);
+                qtype.to_float(inbuf, outbuf, nels);
             }
         };
         workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
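Addendum: a hedged, self-contained toy that mirrors the control flow the patched llama_convert_tensor_internal follows, i.e. look up the type's traits, require a to_float callback for quantized types, and reject anything that is neither quantized nor F16. Every name below is a local mock chosen for illustration; only the shape of the dispatch is taken from the hunks above.

// Toy re-creation of the post-patch dequantization dispatch (C++).
// All symbols are mocks; only the control flow mirrors the diff above.
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

enum class MockType { F16, Q8_MOCK, OTHER };

// Mock dequantizer for a fake 8-bit quantized type: each byte becomes byte / 16.0f.
static void mock_q8_to_float(const void * x, float * y, int n) {
    const int8_t * q = static_cast<const int8_t *>(x);
    for (int i = 0; i < n; ++i) {
        y[i] = q[i] / 16.0f;
    }
}

// Mock trait lookup: only the fake quantized type carries a to_float entry.
struct MockTypeTraits { void (*to_float)(const void *, float *, int); };

static MockTypeTraits mock_get_type_traits(MockType t) {
    MockTypeTraits traits = { nullptr };
    if (t == MockType::Q8_MOCK) {
        traits.to_float = mock_q8_to_float;
    }
    return traits;
}

int main() {
    // Quantized path: traits.to_float; F16 would use a dedicated row conversion;
    // anything else is rejected, as in the patched code.
    const MockType type = MockType::Q8_MOCK;
    const std::vector<int8_t> data = { 16, -32, 48 };
    std::vector<float> out(data.size());

    const MockTypeTraits traits = mock_get_type_traits(type);
    if (type == MockType::Q8_MOCK) {
        if (traits.to_float == nullptr) {
            throw std::runtime_error("no dequantization available");
        }
        traits.to_float(data.data(), out.data(), (int) out.size());
    } else if (type != MockType::F16) {
        throw std::runtime_error("cannot dequantize/convert tensor type");
    }

    for (float v : out) {
        std::printf("%.2f ", v);   // prints: 1.00 -2.00 3.00
    }
    std::printf("\n");
    return 0;
}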