metal : new q4_0 matrix-vector kernel (#2188)

Prefetch data to improve GPU utilization. ~48% faster for the 33B model.
author: Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> 2023-07-12 16:10:55 -0400
committer: GitHub <noreply@github.com> 2023-07-12 23:10:55 +0300
commit: 1cbf561466e957b25f0e8163c2386683f8674369 (patch)
tree: 4d796b3189de81bd3a32dde500d1d2f46d06eb07 /ggml-metal.m
parent: 975221e9548ef6d9f4af8d39cdffc4811c050beb (diff)
1 file changed, 4 insertions, 1 deletion
diff --git a/ggml-metal.m b/ggml-metal.m
index d7a1693..02dc9be 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -739,7 +739,10 @@ void ggml_metal_graph_compute(
                                 [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
                                 [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
 
-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+                                if (src0t == GGML_TYPE_Q4_0) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src0t == GGML_TYPE_Q4_1) {
                                     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                 }
author	Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>	2023-07-12 16:10:55 -0400
committer	GitHub <noreply@github.com>	2023-07-12 23:10:55 +0300
commit	1cbf561466e957b25f0e8163c2386683f8674369 (patch)
tree	4d796b3189de81bd3a32dde500d1d2f46d06eb07 /ggml-metal.m
parent	975221e9548ef6d9f4af8d39cdffc4811c050beb (diff)