author     Kawrakow <48489457+ikawrakow@users.noreply.github.com>   2023-06-09 10:39:59 +0300
committer  GitHub <noreply@github.com>                              2023-06-09 10:39:59 +0300
commit     245fc3c37da5ac5963f9f11a9f4f2ac08d96afc6 (patch)
tree       b2312b5b19a6887526d9e25d41b29eb4fdbcd49e
parent     72ff5282bf0388c60821f504c4c8cc2b1f491aa6 (diff)
metal : faster q4_0 (#1775)
* metal : 8% faster q4_0

  Avoid copying into local uchar4 and float4.

* metal : 17% faster Q4_0

  Use 64 threads in a thread group.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
-rw-r--r--   ggml-metal.m       2
-rw-r--r--   ggml-metal.metal  34
2 files changed, 20 insertions, 16 deletions
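Note: as a point of reference for the diff below, the following plain C sketch spells out the q4_0 dot product the Metal kernel computes. It assumes ggml's q4_0 block layout (QK4_0 == 32 quants per block, one scale plus 16 bytes of packed nibbles, with low nibbles paired with y[j] and high nibbles with y[j + 16]); the names block_q4_0_ref and vec_dot_q4_0_ref are illustrative only, and the scale is kept as a plain float rather than fp16 to keep the sketch short.

// Plain C reference for the math the Metal kernel computes (illustrative only).
#include <stdint.h>
#include <stdio.h>

#define QK4_0 32

typedef struct {
    float   d;               // block scale (ggml stores this as fp16)
    uint8_t qs[QK4_0 / 2];   // 32 quants packed two per byte
} block_q4_0_ref;            // hypothetical name, not ggml's struct

// result = sum over nb blocks of d * dot(dequantized quants, y)
static float vec_dot_q4_0_ref(int nb, const block_q4_0_ref * x, const float * y) {
    float sumf = 0.0f;
    for (int i = 0; i < nb; ++i) {
        const float * yb = y + i*QK4_0;
        float acc = 0.0f;
        for (int j = 0; j < QK4_0/2; ++j) {
            const int x0 = (x[i].qs[j] & 0x0F) - 8;  // low nibble  pairs with yb[j]
            const int x1 = (x[i].qs[j] >>   4) - 8;  // high nibble pairs with yb[j + 16]
            acc += x0*yb[j] + x1*yb[j + QK4_0/2];
        }
        sumf += x[i].d*acc;
    }
    return sumf;
}

int main(void) {
    block_q4_0_ref x = { 0.5f, {0} };   // all quants decode to -8
    float y[QK4_0];
    for (int j = 0; j < QK4_0; ++j) y[j] = 1.0f;
    printf("%f\n", vec_dot_q4_0_ref(1, &x, y));  // 0.5 * 32 * (-8) = -128
    return 0;
}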
diff --git a/ggml-metal.m b/ggml-metal.m
index ac4f134..54cbaf8 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -526,7 +526,7 @@ void ggml_metal_graph_compute(
                                 GGML_ASSERT(ne12 == 1);
 
                                 nth0 = 8;
-                                nth1 = 4;
+                                nth1 = 8;
                                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
                             } break;
                         case GGML_TYPE_Q2_K:
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 43814ed..8e730eb 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -267,6 +267,8 @@ kernel void kernel_mul_mat_q4_0_f32(
         uint2 tptg[[threads_per_threadgroup]]) {
     const int nb = ne00/QK4_0;
 
+    const int8_t m8 = 8;
+
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
 
@@ -276,33 +278,34 @@ kernel void kernel_mul_mat_q4_0_f32(
     const uint nth = tptg.x*tptg.y;
     const uint ith = tptg.y*tpitg.x + tpitg.y;
 
-    sum[ith] = 0.0f;
+    const int ix = tpitg.y/4;       // 0 or 1
+    const int iy = tpitg.y - 4*ix;  // 0...3
 
-    for (int i = tpitg.x; i < nb; i += tptg.x) {
-        device const uchar4 * x0p = (device const uchar4 *) (x + i)->qs;
-        device const float4 * y0p = (device const float4 *) (y + i*QK4_0);
+    const int first = 4 * iy;
+
+    float sumf = 0;
 
-        const float d = (float)((x + i)->d);
+    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
 
-        const uchar4 x0v = *(x0p + tpitg.y);
-        const float4 y0v = *(y0p + tpitg.y + 0);
-        const float4 y1v = *(y0p + tpitg.y + 4);
+        const float d = (float)x[i].d;
 
-        float acc = 0.0f;
+        device const uint8_t * xl = x[i].qs + first;
+        device const float * yl = y + i * QK4_0 + first;
+
+        float2 acc = {0.0f, 0.0f};
 
         for (int j = 0; j < 4; ++j) {
-            const int x0 = x0v[j] & 0x0F;
-            const int x1 = x0v[j] >> 4;
 
-            const float y0 = y0v[j];
-            const float y1 = y1v[j];
+            acc[0] += yl[j+ 0] * ((int8_t)(xl[j] & 0xF) - m8);
+            acc[1] += yl[j+16] * ((int8_t)(xl[j] >> 4) - m8);
 
-            acc += (x0 - 8)*y0 + (x1 - 8)*y1;
         }
 
-        sum[ith] += acc*d;
+        sumf += d * (acc[0] + acc[1]);
     }
 
+    sum[ith] = sumf;
+
     //
     // Accumulate the sum from all threads in the threadgroup
     // This version is slightly faster than the commented out one below,
@@ -357,6 +360,7 @@ kernel void kernel_mul_mat_f16_f32(
         uint3 tpig[[thread_position_in_grid]],
         uint3 tpitg[[thread_position_in_threadgroup]],
         uint3 tptg[[threads_per_threadgroup]]) {
+
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
     const int64_t im = tgpig.z;
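Note: the nth1 change in ggml-metal.m together with the new ix/iy/first indexing gives each threadgroup 8x8 = 64 threads. The small C program below is illustrative only and not part of the commit; it prints the work assignment that indexing appears to imply: tpitg.x picks a pair of blocks to stride over, ix = tpitg.y/4 selects the even or odd block of the pair, and iy = tpitg.y%4 selects a 4-byte slice of the 16 quant bytes.

// Illustrative only: assumed thread-to-work mapping of the updated kernel
// with an 8x8 threadgroup (nth0 = 8, nth1 = 8, i.e. 64 threads).
#include <stdio.h>

int main(void) {
    const int tptg_x = 8, tptg_y = 8;            // threads_per_threadgroup
    for (int tx = 0; tx < tptg_x; ++tx) {
        for (int ty = 0; ty < tptg_y; ++ty) {
            const int ix    = ty / 4;            // 0 or 1: even/odd block of the pair
            const int iy    = ty - 4*ix;         // 0...3: which 4-byte slice
            const int first = 4*iy;              // byte offset into the 16 qs bytes
            printf("thread (%d,%d): blocks %d, %d, ... (stride %d), qs[%d..%d]\n",
                   tx, ty, 2*tx + ix, 2*tx + ix + 2*tptg_x, 2*tptg_x, first, first + 3);
        }
    }
    return 0;
}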