path: root/ggml-metal.m
author    Kawrakow <48489457+ikawrakow@users.noreply.github.com>    2023-07-20 18:19:45 +0300
committer GitHub <noreply@github.com>    2023-07-20 18:19:45 +0300
commit    e782c9e735f93ab4767ffc37462c523b73a17ddc (patch)
tree      b5c87fd34707ec9aa7a7a9716b2ef96157c033c5 /ggml-metal.m
parent    785829dfe8baf0213f2ff66963d28c62f92d7930 (diff)
Faster Q5_K and Q6_K on Metal (#2294)
* Faster Q6_K on Metal
* Faster Q5_K on Metal
* Another Q5_K speedup

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml-metal.m')
-rw-r--r--  ggml-metal.m | 19
1 file changed, 11 insertions, 8 deletions
diff --git a/ggml-metal.m b/ggml-metal.m
index 5e2a211..44d0468 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -703,8 +703,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);
- nth0 = 4;
- nth1 = 16;
+ nth0 = 2;
+ nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
} break;
case GGML_TYPE_Q6_K:
@@ -712,8 +712,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);
- nth0 = 4;
- nth1 = 16;
+ nth0 = 2;
+ nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
} break;
default:
@@ -743,11 +743,14 @@ void ggml_metal_graph_compute(
src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
+ else if (src0t == GGML_TYPE_Q5_K) {
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ }
+ else if (src0t == GGML_TYPE_Q6_K) {
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+ }
else if (src0t == GGML_TYPE_Q2_K ||
- src0t == GGML_TYPE_Q3_K ||
- src0t == GGML_TYPE_Q4_K ||
- src0t == GGML_TYPE_Q5_K ||
- src0t == GGML_TYPE_Q6_K) {
+ src0t == GGML_TYPE_Q3_K) {
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else {
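
For context (not part of the commit): both the old 4x16 and the new 2x32 configurations keep 64 threads per threadgroup, but the new dispatch covers four matrix rows per threadgroup for Q5_K and two for Q6_K, shrinking the grid by the same factor via ceiling division. Below is a minimal C sketch of that arithmetic only; the helper name num_threadgroups and the row count ne01 = 4096 are hypothetical values chosen for illustration, not part of ggml.

#include <stdio.h>

/* Ceiling division: number of threadgroups needed to cover ne01 rows
   when each threadgroup processes rows_per_tg rows. This reproduces
   (ne01 + 3)/4 for Q5_K and (ne01 + 1)/2 for Q6_K from the diff above. */
static int num_threadgroups(int ne01, int rows_per_tg) {
    return (ne01 + rows_per_tg - 1) / rows_per_tg;
}

int main(void) {
    const int nth0 = 2, nth1 = 32; /* new threadgroup shape: 2*32 = 64 threads */
    const int ne01 = 4096;         /* hypothetical row count, for illustration */

    printf("Q5_K: %d threadgroups of %d threads each\n",
           num_threadgroups(ne01, 4), nth0 * nth1);
    printf("Q6_K: %d threadgroups of %d threads each\n",
           num_threadgroups(ne01, 2), nth0 * nth1);
    return 0;
}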