From 1b107b8550dced48dc5f41184640061354226b96 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Wed, 5 Jul 2023 16:13:06 +0000 Subject: ggml : generalize `quantize_fns` for simpler FP16 handling (#1237) * Generalize quantize_fns for simpler FP16 handling * Remove call to ggml_cuda_mul_mat_get_wsize * ci : disable FMA for mac os actions --------- Co-authored-by: Georgi Gerganov --- tests/test-quantize-fns.cpp | 30 ++++++++++++++++-------------- tests/test-quantize-perf.cpp | 25 +++++++++++++------------ 2 files changed, 29 insertions(+), 26 deletions(-) (limited to 'tests') diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index c40f1b2..8d3c162 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -40,26 +40,26 @@ float array_rmse(const float * a1, const float * a2, size_t n) { } // Total quantization error on test data -float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) { +float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { std::vector tmp_q(2*test_size); std::vector tmp_out(test_size); - qfns.quantize_row_q(test_data, tmp_q.data(), test_size); - qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size); + qfns.from_float(test_data, tmp_q.data(), test_size); + qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); return array_rmse(test_data, tmp_out.data(), test_size); } // Total quantization error on test data -float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) { +float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { std::vector tmp_q(2*test_size); std::vector tmp_out(test_size); std::vector tmp_out_ref(test_size); - qfns.quantize_row_q(test_data, tmp_q.data(), test_size); - qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size); + qfns.from_float(test_data, tmp_q.data(), test_size); + qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); - qfns.quantize_row_q_reference(test_data, tmp_q.data(), test_size); - qfns.dequantize_row_q(tmp_q.data(), tmp_out_ref.data(), test_size); + qfns.from_float_reference(test_data, tmp_q.data(), test_size); + qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size); return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); } @@ -73,15 +73,17 @@ float dot_product(const float * a1, const float * a2, size_t test_size) { } // Total dot product error -float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) { +float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) { std::vector tmp_q1(2*test_size); std::vector tmp_q2(2*test_size); - qfns.quantize_row_q (test_data1, tmp_q1.data(), test_size); - qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size); + auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); + + qfns.from_float(test_data1, tmp_q1.data(), test_size); + vdot.from_float(test_data2, tmp_q2.data(), test_size); float result = INFINITY; - qfns.vec_dot_q(test_size, &result, tmp_q1.data(), tmp_q2.data()); + qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data()); const float dot_ref = dot_product(test_data1, test_data2, test_size); @@ -123,9 +125,9 @@ int main(int argc, char * argv[]) { for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; - quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); + ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); - if (qfns.quantize_row_q && qfns.dequantize_row_q) { + if (qfns.from_float && qfns.to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index c0e361e..0bb9537 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -123,9 +123,9 @@ void usage(char * argv[]) { printf(" --type TYPE set test type as"); for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; - quantize_fns_t qfns = ggml_internal_get_quantize_fn(type); + ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); if (ggml_type_name(type) != NULL) { - if (qfns.quantize_row_q && qfns.dequantize_row_q) { + if (qfns.from_float && qfns.to_float) { printf(" %s", ggml_type_name(type)); } } @@ -271,12 +271,12 @@ int main(int argc, char * argv[]) { for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; - quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); + ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) { continue; } - if (qfns.quantize_row_q && qfns.dequantize_row_q) { + if (qfns.from_float && qfns.to_float) { printf("%s\n", ggml_type_name(type)); if (params.op_quantize_row_q_reference) { @@ -284,7 +284,7 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void ) { - qfns.quantize_row_q_reference(test_data1, test_q1, size); + qfns.from_float_reference(test_data1, test_q1, size); return test_q1[0]; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); @@ -298,7 +298,7 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void ) { - qfns.quantize_row_q(test_data1, test_q1, size); + qfns.from_float(test_data1, test_q1, size); return test_q1[0]; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); @@ -309,11 +309,11 @@ int main(int argc, char * argv[]) { if (params.op_dequantize_row_q) { printf(" dequantize_row_q\n"); - qfns.quantize_row_q(test_data1, test_q1, largest); + qfns.from_float(test_data1, test_q1, largest); for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void ) { - qfns.dequantize_row_q(test_q1, test_out, size); + qfns.to_float(test_q1, test_out, size); return test_out[0]; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); @@ -327,7 +327,8 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void ) { - qfns.quantize_row_q_dot(test_data1, test_q1, size); + auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); + vdot.from_float(test_data1, test_q1, size); return test_q1[0]; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); @@ -338,13 +339,13 @@ int main(int argc, char * argv[]) { if (params.op_vec_dot_q) { printf(" vec_dot_q\n"); - qfns.quantize_row_q(test_data1, test_q1, largest); - qfns.quantize_row_q(test_data2, test_q2, largest); + qfns.from_float(test_data1, test_q1, largest); + qfns.from_float(test_data2, test_q2, largest); for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void ) { float result; - qfns.vec_dot_q(size, &result, test_q1, test_q2); + qfns.vec_dot(size, &result, test_q1, test_q2); return result; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); -- cgit v1.2.3