From 1cbf561466e957b25f0e8163c2386683f8674369 Mon Sep 17 00:00:00 2001 From: Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> Date: Wed, 12 Jul 2023 16:10:55 -0400 Subject: metal : new q4_0 matrix-vector kernel (#2188) Prefetch data to improve GPU utilization. ~48% faster for 33B model. --- ggml-metal.m | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'ggml-metal.m') diff --git a/ggml-metal.m b/ggml-metal.m index d7a1693..02dc9be 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -739,7 +739,10 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { + if (src0t == GGML_TYPE_Q4_0) { + [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_1) { [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } -- cgit v1.2.3