about | summary | refs | log | tree | commit | diff
path: root/ggml-metal.m
diff options
context:
space:
mode:
author Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> 2023-07-12 16:10:55 -0400
committer GitHub <noreply@github.com> 2023-07-12 23:10:55 +0300
commit 1cbf561466e957b25f0e8163c2386683f8674369 (patch)
tree 4d796b3189de81bd3a32dde500d1d2f46d06eb07 /ggml-metal.m
parent 975221e9548ef6d9f4af8d39cdffc4811c050beb (diff)
metal : new q4_0 matrix-vector kernel (#2188)
Prefetch data to improve GPU utilization. ~48% faster for 33B model.
Diffstat (limited to 'ggml-metal.m')
-rw-r--r-- ggml-metal.m 5
1 files changed, 4 insertions, 1 deletions
diff --git a/ggml-metal.m b/ggml-metal.m
index d7a1693..02dc9be 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -739,7 +739,10 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
- if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+ if (src0t == GGML_TYPE_Q4_0) {
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ }
+ else if (src0t == GGML_TYPE_Q4_1) {
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}