From 1b107b8550dced48dc5f41184640061354226b96 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Wed, 5 Jul 2023 16:13:06 +0000 Subject: ggml : generalize `quantize_fns` for simpler FP16 handling (#1237) * Generalize quantize_fns for simpler FP16 handling * Remove call to ggml_cuda_mul_mat_get_wsize * ci : disable FMA for mac os actions --------- Co-authored-by: Georgi Gerganov --- pocs/vdot/q8dot.cpp | 6 +++--- pocs/vdot/vdot.cpp | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'pocs/vdot') diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 5748c8a..4e0e023 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -136,7 +136,7 @@ int main(int argc, char** argv) { auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1; - auto funcs = ggml_internal_get_quantize_fn(ggml_type); + auto funcs = ggml_internal_get_type_traits(ggml_type); Stat simple, ggml; @@ -156,8 +156,8 @@ int main(int argc, char** argv) { t1 = std::chrono::high_resolution_clock::now(); float fs; - if (type == 0) funcs.vec_dot_q(kVecSize * QK4_1, &fs, x40.data(), y.data()); - else funcs.vec_dot_q(kVecSize * QK4_1, &fs, x41.data(), y.data()); + if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data()); + else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data()); t2 = std::chrono::high_resolution_clock::now(); t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count(); if (iloop > 3) ggml.addResult(fs, t); diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 7b18090..48758cd 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -235,7 +235,7 @@ int main(int argc, char** argv) { int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64); int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64); - auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0); + auto funcs = useQ4_1 ? 
ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0); std::vector<block_q4_0> q40; std::vector<block_q4_1> q41; @@ -261,9 +261,9 @@ int main(int argc, char** argv) { // Note, we do not include this in the timing as in practical application // we already have the quantized model weights. if (useQ4_1) { - funcs.quantize_row_q(x1.data(), q41.data(), kVecSize); + funcs.from_float(x1.data(), q41.data(), kVecSize); } else { - funcs.quantize_row_q(x1.data(), q40.data(), kVecSize); + funcs.from_float(x1.data(), q40.data(), kVecSize); } // Now measure time the dot product needs using the "scalar" version above @@ -282,9 +282,10 @@ int main(int argc, char** argv) { dot_q4_q8(kVecSize, &result, q40.data(), q8.data()); } else { - funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data()); - else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data()); + auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type); + vdot.from_float(y1.data(), q8.data(), kVecSize); + if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data()); + else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data()); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); -- cgit v1.2.3