aboutsummaryrefslogtreecommitdiff
path: root/ggml-metal.metal
diff options
context:
space:
mode:
Diffstat (limited to 'ggml-metal.metal')
-rw-r--r--ggml-metal.metal412
1 files changed, 328 insertions, 84 deletions
diff --git a/ggml-metal.metal b/ggml-metal.metal
index d1e4922..e62fe68 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -428,7 +428,7 @@ kernel void kernel_mul_mat_q4_0_f32(
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
- for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
+ for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
}
}
@@ -497,7 +497,7 @@ kernel void kernel_mul_mat_q4_1_f32(
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
- for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
+ for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
}
}
@@ -775,47 +775,76 @@ kernel void kernel_cpy_f32_f32(
//============================================ k-quants ======================================================
+#ifndef QK_K
#define QK_K 256
+#else
+static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64");
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
-} block_q2_k;
+} block_q2_K;
// 84 bytes / block (when QK_K == 256)
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
- half d; // super-block scale
-} block_q3_k;
-// 110 bytes / block
-
+#if QK_K == 64
+ uint8_t scales[2];
+#else
+ uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+ half d; // super-block scale
+} block_q3_K;
+
+#if QK_K == 64
+typedef struct {
+ half d[2]; // super-block scales/mins
+ uint8_t scales[2];
+ uint8_t qs[QK_K/2]; // 4-bit quants
+} block_q4_K;
+#else
typedef struct {
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
- uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
-} block_q4_k;
-// 144 bytes / block
+} block_q4_K;
+#endif
+#if QK_K == 64
+typedef struct {
+ half d; // super-block scale
+ int8_t scales[QK_K/16]; // 8-bit block scales
+ uint8_t qh[QK_K/8]; // quants, high bit
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
+} block_q5_K;
+#else
typedef struct {
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
-} block_q5_k;
+} block_q5_K;
// 176 bytes / block
+#endif
typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
half d; // super-block scale
-} block_q6_k;
+} block_q6_K;
// 210 bytes / block (when QK_K == 256)
static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
@@ -836,7 +865,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
//========================================== dequantization =============================
-static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, int k) {
+static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
@@ -847,6 +876,7 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
device const uint8_t * q = x[i].qs;
+#if QK_K == 256
int is = 0;
float dl, ml;
for (int n = 0; n < QK_K; n += 128) {
@@ -865,14 +895,29 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
}
q += 32;
}
+#else
+ float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
+ float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
+ float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
+ float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
+ for (int l = 0; l < 16; ++l) {
+ y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1;
+ y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2;
+ y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3;
+ y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4;
+ }
+ y += QK_K;
+#endif
}
}
-static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, int k) {
+static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
+#if QK_K == 256
+
const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;
@@ -918,22 +963,49 @@ static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, i
}
q += 32;
}
+ }
+#else
+ for (int i = 0; i < nb; i++) {
+ const float d_all = (float)(x[i].d);
+
+ device const uint8_t * q = x[i].qs;
+ device const uint8_t * hm = x[i].hmask;
+
+ const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
+ const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
+ const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
+ const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
+
+ for (int l = 0; l < 8; ++l) {
+ uint8_t h = hm[l];
+ y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
+ y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
+ y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
+ y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
+ y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
+ y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
+ y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
+ y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
+ }
+ y += QK_K;
}
+#endif
}
-static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) {
+static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
-
for (int i = 0; i < nb; i++) {
+ device const uint8_t * q = x[i].qs;
+
+#if QK_K == 256
const float d = x[i].d;
const float min = x[i].dmin;
- device const uint8_t * q = x[i].qs;
device const uint8_t * scales = x[i].scales;
int is = 0;
@@ -945,14 +1017,29 @@ static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, i
for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
q += 32; is += 2;
}
+#else
+ device const uint8_t * s = x[i].scales;
+ device const half2 * dh = (device const half2 *)x[i].d;
+ const float2 d = (float2)dh[0];
+ const float d1 = d[0] * (s[0] & 0xF);
+ const float d2 = d[0] * (s[1] & 0xF);
+ const float m1 = d[1] * (s[0] >> 4);
+ const float m2 = d[1] * (s[1] >> 4);
+ for (int l = 0; l < 32; ++l) {
+ y[l+ 0] = d1 * (q[l] & 0xF) - m1;
+ y[l+32] = d2 * (q[l] >> 4) - m2;
+ }
+ y += QK_K;
+#endif
}
}
-static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, int k) {
+static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
+#if QK_K == 256
for (int i = 0; i < nb; i++) {
const float d = (float)(x[i].d);
@@ -973,10 +1060,32 @@ static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, i
u1 <<= 2; u2 <<= 2;
}
}
+#else
+ for (int i = 0; i < nb; i++) {
+
+ const float d = (float)x[i].d;
+
+ device const uint8_t * ql = x[i].qs;
+ device const uint8_t * qh = x[i].qh;
+ device const int8_t * sc = x[i].scales;
+
+ for (int l = 0; l < 8; ++l) {
+ y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
+ y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
+ y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
+ y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
+ y[l+32] = d * sc[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16));
+ y[l+40] = d * sc[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16));
+ y[l+48] = d * sc[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16));
+ y[l+56] = d * sc[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16));
+ }
+ y += QK_K;
+ }
+#endif
}
-static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, int k) {
+static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
@@ -988,6 +1097,7 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
const float d = x[i].d;
+#if QK_K == 256
for (int n = 0; n < QK_K; n += 128) {
for (int l = 0; l < 32; ++l) {
int is = l/16;
@@ -1005,10 +1115,23 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
qh += 32;
sc += 8;
}
+#else
+ for (int l = 0; l < 16; ++l) {
+ const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+ const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+ const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+ const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+ y[l+ 0] = d * sc[0] * q1;
+ y[l+16] = d * sc[1] * q2;
+ y[l+32] = d * sc[2] * q3;
+ y[l+48] = d * sc[3] * q4;
+ }
+ y += 64;
+#endif
}
}
-kernel void kernel_get_rows_q2_k(
+kernel void kernel_get_rows_q2_K(
device const void * src0,
device const int * src1,
device float * dst,
@@ -1019,12 +1142,12 @@ kernel void kernel_get_rows_q2_k(
const int i = tpig;
const int r = ((device int32_t *) src1)[i];
- dequantize_row_q2_k(
- (device const block_q2_k *) ((device char *) src0 + r*nb01),
+ dequantize_row_q2_K(
+ (device const block_q2_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00);
}
-kernel void kernel_get_rows_q3_k(
+kernel void kernel_get_rows_q3_K(
device const void * src0,
device const int * src1,
device float * dst,
@@ -1035,12 +1158,12 @@ kernel void kernel_get_rows_q3_k(
const int i = tpig;
const int r = ((device int32_t *) src1)[i];
- dequantize_row_q3_k(
- (device const block_q3_k *) ((device char *) src0 + r*nb01),
+ dequantize_row_q3_K(
+ (device const block_q3_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00);
}
-kernel void kernel_get_rows_q4_k(
+kernel void kernel_get_rows_q4_K(
device const void * src0,
device const int * src1,
device float * dst,
@@ -1051,12 +1174,12 @@ kernel void kernel_get_rows_q4_k(
const int i = tpig;
const int r = ((device int32_t *) src1)[i];
- dequantize_row_q4_k(
- (device const block_q4_k *) ((device char *) src0 + r*nb01),
+ dequantize_row_q4_K(
+ (device const block_q4_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00);
}
-kernel void kernel_get_rows_q5_k(
+kernel void kernel_get_rows_q5_K(
device const void * src0,
device const int * src1,
device float * dst,
@@ -1067,12 +1190,12 @@ kernel void kernel_get_rows_q5_k(
const int i = tpig;
const int r = ((device int32_t *) src1)[i];
- dequantize_row_q5_k(
- (device const block_q5_k *) ((device char *) src0 + r*nb01),
+ dequantize_row_q5_K(
+ (device const block_q5_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00);
}
-kernel void kernel_get_rows_q6_k(
+kernel void kernel_get_rows_q6_K(
device const void * src0,
device const int * src1,
device float * dst,
@@ -1083,14 +1206,14 @@ kernel void kernel_get_rows_q6_k(
const int i = tpig;
const int r = ((device int32_t *) src1)[i];
- dequantize_row_q6_k(
- (device const block_q6_k *) ((device char *) src0 + r*nb01),
+ dequantize_row_q6_K(
+ (device const block_q6_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00);
}
//====================================== dot products =========================
-kernel void kernel_mul_mat_q2_k_f32(
+kernel void kernel_mul_mat_q2_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1107,12 +1230,15 @@ kernel void kernel_mul_mat_q2_k_f32(
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
- device const block_q2_k * x = (device const block_q2_k *) src0 + r0*nb;
+ device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
+ float sumf = 0;
+
+#if QK_K == 256
const int tid = tpitg.y; // 0...15
const int il = tid/4; // 0...3
const int ir = tid%4; // 0...3
@@ -1125,9 +1251,6 @@ kernel void kernel_mul_mat_q2_k_f32(
const int y_offset = 64*il + n*ir;
const int q_offset = 32*ip + n*ir;
- sum[ith] = 0.0f;
-
- float sumf = 0;
for (int i = tpitg.x; i < nb; i += tptg.x) {
device const uint8_t * q = x[i].qs + q_offset;
@@ -1140,7 +1263,6 @@ kernel void kernel_mul_mat_q2_k_f32(
device const float * y = yy + i*QK_K + y_offset;
- //float4 s = {0.f, 0.f, 0.f, 0.f};
float2 s = {0.f, 0.f};
float smin = 0;
for (int l = 0; l < n; ++l) {
@@ -1155,25 +1277,38 @@ kernel void kernel_mul_mat_q2_k_f32(
sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin;
}
- sum[ith] = sumf;
+#else
+ const int il = 4 * tpitg.x;
- //int mask1 = (ith%4 == 0);
- //int mask2 = (ith%16 == 0);
+ uint32_t aux[2];
+ thread const uint8_t * d = (thread const uint8_t *)aux;
+ thread const uint8_t * m = (thread const uint8_t *)aux + 4;
- //threadgroup_barrier(mem_flags::mem_threadgroup);
- //for (int i = 1; i < 4; ++i) sum[ith] += mask1 * sum[ith + i];
- //threadgroup_barrier(mem_flags::mem_threadgroup);
- //for (int i = 4; i < 16; i += 4) sum[ith] += mask2 * sum[ith + i];
- //threadgroup_barrier(mem_flags::mem_threadgroup);
- //if (ith == 0) {
- // for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
- // dst[r1*ne0 + r0] = sum[0];
- //}
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
+
+ device const uint8_t * q = x[i].qs + il;
+ device const float * y = yy + i*QK_K + il;
+
+ const float dall = (float)x[i].d;
+ const float dmin = (float)x[i].dmin;
+
+ device const uint32_t * a = (device const uint32_t *)x[i].scales;
+ aux[0] = a[0] & 0x0f0f0f0f;
+ aux[1] = (a[0] >> 4) & 0x0f0f0f0f;
+
+ for (int l = 0; l < 4; ++l) {
+ sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0])
+ + y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1])
+ + y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2])
+ + y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]);
+ }
+ }
+#endif
+
+ sum[ith] = sumf;
//
// Accumulate the sum from all threads in the threadgroup
- // This version is slightly faster than the commented out one below,
- // which I copy-pasted from ggerganov's q4_0 dot product for metal.
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
@@ -1190,7 +1325,7 @@ kernel void kernel_mul_mat_q2_k_f32(
}
}
-kernel void kernel_mul_mat_q3_k_f32(
+kernel void kernel_mul_mat_q3_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1203,23 +1338,25 @@ kernel void kernel_mul_mat_q3_k_f32(
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
- const uint16_t kmask1 = 0x0303;
- const uint16_t kmask2 = 0x0f0f;
-
- const uint8_t m3 = 3;
- const int8_t m4 = 4;
-
const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
- device const block_q3_k * x = (device const block_q3_k *) src0 + r0*nb;
+ device const block_q3_K * x = (device const block_q3_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
+#if QK_K == 256
+
+ const uint8_t m3 = 3;
+ const int8_t m4 = 4;
+
+ const uint16_t kmask1 = 0x0303;
+ const uint16_t kmask2 = 0x0f0f;
+
const int tid = tpitg.y; // expecting 16
const int ip = tid/8; // 0 or 1
const int il = tid/2 - 4*ip; // 0...3
@@ -1273,6 +1410,39 @@ kernel void kernel_mul_mat_q3_k_f32(
//sum[ith] = sumf;
sum[ith] = sumf1 - 32.f*sumf2;
+#else
+ const int il = 4 * tpitg.x; // 0, 4, 8, 12
+ const int im = il/8; // 0, 0, 1, 1
+ const int in = il%8; // 0, 4, 0, 4
+
+ float sumf = 0;
+
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
+
+ const float d_all = (float)(x[i].d);
+
+ device const uint8_t * q = x[i].qs + il;
+ device const uint8_t * h = x[i].hmask + in;
+ device const float * y = yy + i * QK_K + il;
+
+ const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
+ const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
+ const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
+ const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
+
+ for (int l = 0; l < 4; ++l) {
+ const uint8_t hm = h[l] >> im;
+ sumf += y[l+ 0] * d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((hm & 0x01) ? 0 : 4))
+ + y[l+16] * d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((hm & 0x04) ? 0 : 4))
+ + y[l+32] * d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((hm & 0x10) ? 0 : 4))
+ + y[l+48] * d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((hm & 0x40) ? 0 : 4));
+ }
+
+ }
+
+ sum[ith] = sumf;
+
+#endif
//
// Accumulate the sum from all threads in the threadgroup
@@ -1293,7 +1463,7 @@ kernel void kernel_mul_mat_q3_k_f32(
}
-kernel void kernel_mul_mat_q4_k_f32(
+kernel void kernel_mul_mat_q4_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1305,21 +1475,25 @@ kernel void kernel_mul_mat_q4_k_f32(
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
- const uint16_t kmask1 = 0x3f3f;
- const uint16_t kmask2 = 0x0f0f;
- const uint16_t kmask3 = 0xc0c0;
-
const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
- device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb;
- device const float * yy = (device const float *) src1 + r1*ne10;
-
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
+ device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
+ device const float * yy = (device const float *) src1 + r1*ne10;
+
+ float sumf = 0;
+
+#if QK_K == 256
+
+ const uint16_t kmask1 = 0x3f3f;
+ const uint16_t kmask2 = 0x0f0f;
+ const uint16_t kmask3 = 0xc0c0;
+
const int tid = tpitg.y; // 0...15
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
@@ -1332,11 +1506,8 @@ kernel void kernel_mul_mat_q4_k_f32(
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
- sum[ith] = 0.0f;
-
uchar2 sc1, sc2, sc3, sc4;
- float sumf = 0;
for (int i = tpitg.x; i < nb; i += tptg.x) {
device const uint8_t * q1 = (x + i)->qs + q_offset;
@@ -1365,6 +1536,30 @@ kernel void kernel_mul_mat_q4_k_f32(
sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
}
+#else
+ uint16_t aux16[2];
+ thread const uint8_t * scales = (thread const uint8_t *)aux16;
+
+ const int il = 4*tpitg.x;
+
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
+
+ device const uint8_t * q = x[i].qs + il;
+ device const float * y = yy + i * QK_K + il;
+
+ const float d = (float)x[i].d[0];
+ const float m = (float)x[i].d[1];
+
+ device const uint16_t * a = (device const uint16_t *)x[i].scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+ for (int l = 0; l < 4; ++l) {
+ sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
+ + d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]);
+ }
+ }
+#endif
sum[ith] = sumf;
@@ -1401,7 +1596,7 @@ kernel void kernel_mul_mat_q4_k_f32(
//}
}
-kernel void kernel_mul_mat_q5_k_f32(
+kernel void kernel_mul_mat_q5_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1413,21 +1608,25 @@ kernel void kernel_mul_mat_q5_k_f32(
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
- const uint16_t kmask1 = 0x3f3f;
- const uint16_t kmask2 = 0x0f0f;
- const uint16_t kmask3 = 0xc0c0;
-
const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
- device const block_q5_k * x = (device const block_q5_k *) src0 + r0*nb;
+ device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
+ float sumf = 0;
+
+#if QK_K == 256
+
+ const uint16_t kmask1 = 0x3f3f;
+ const uint16_t kmask2 = 0x0f0f;
+ const uint16_t kmask3 = 0xc0c0;
+
const int tid = tpitg.y; // 0...15
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
@@ -1447,7 +1646,6 @@ kernel void kernel_mul_mat_q5_k_f32(
uchar2 sc1, sc2, sc3, sc4;
- float sumf = 0;
for (int i = tpitg.x; i < nb; i += tptg.x) {
device const uint8_t * q1 = (x + i)->qs + q_offset;
@@ -1479,6 +1677,28 @@ kernel void kernel_mul_mat_q5_k_f32(
sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
}
+#else
+ const int il = 4 * tpitg.x; // 0, 4, 8, 12
+ const int im = il/8; // 0, 0, 1, 1
+ const int in = il%8; // 0, 4, 0, 4
+
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
+
+ const float d = (float)x[i].d;
+ device const uint8_t * q = x[i].qs + il;
+ device const uint8_t * h = x[i].qh + in;
+ device const int8_t * s = x[i].scales;
+ device const float * y = yy + i*QK_K + il;
+
+ for (int l = 0; l < 4; ++l) {
+ const uint8_t hl = h[l] >> im;
+ sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
+ + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
+ + y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16))
+ + y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16));
+ }
+ }
+#endif
sum[ith] = sumf;
//
@@ -1500,7 +1720,7 @@ kernel void kernel_mul_mat_q5_k_f32(
}
-kernel void kernel_mul_mat_q6_k_f32(
+kernel void kernel_mul_mat_q6_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
@@ -1522,12 +1742,15 @@ kernel void kernel_mul_mat_q6_k_f32(
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
- device const block_q6_k * x = (device const block_q6_k *) src0 + r0*nb;
+ device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
+ float sumf = 0;
+
+#if QK_K == 256
// Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
const int iqs = 16 * tpitg.y;
const int ip = iqs / 128; // 0 or 1
@@ -1540,7 +1763,6 @@ kernel void kernel_mul_mat_q6_k_f32(
const int q_offset_l = 64*ip + l0;
const int q_offset_h = 32*ip + l0;
- float sumf = 0;
for (int i = tpitg.x; i < nb; i += tptg.x) {
device const uint8_t * ql = x[i].ql + q_offset_l;
@@ -1562,6 +1784,28 @@ kernel void kernel_mul_mat_q6_k_f32(
sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
}
+#else
+ const int il = 4*tpitg.x; // 0, 4, 8, 12
+
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
+ device const float * y = yy + i * QK_K + il;
+ device const uint8_t * ql = x[i].ql + il;
+ device const uint8_t * qh = x[i].qh + il;
+ device const int8_t * s = x[i].scales;
+
+ const float d = x[i].d;
+
+ float4 sums = {0.f, 0.f, 0.f, 0.f};
+ for (int l = 0; l < 4; ++l) {
+ sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
+ sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
+ sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32);
+ sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
+ }
+ sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
+ }
+
+#endif
sum[ith] = sumf;