path: root/ggml-metal.m
author    Kawrakow <48489457+ikawrakow@users.noreply.github.com>  2023-06-26 19:43:07 +0300
committer GitHub <noreply@github.com>  2023-06-26 19:43:07 +0300
commit    6769e944c727c63612dcafbef52009d21ae00fff (patch)
tree      987a35bf7f7c0e0947c85bc75cba047834fbccd5 /ggml-metal.m
parent    cbebf61ca7584e9709265395f0127ae7fc0f1882 (diff)
k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights
* k_quants: WIP super-blocks with 64 weights. Q6_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights. Q4_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights. Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower than the scalar implementation).
* k_quants: WIP super-blocks with 64 weights. Q3_K scalar and AVX2 works.
* k_quants: WIP super-blocks with 64 weights. Q5_K scalar and AVX2 works, and with that all k_quants are done on AVX2 and scalar.
* k_quants: WIP super-blocks with 64 weights. Q6_K working on CUDA. Cannot make it run quite as fast as with super-blocks with 256 weights: 8% slower on 4080, 20% slower on the 1660 (but there we fit 1 less layer on the GPU because of the larger model size), so some fraction of these 20% is due to that.
* k_quants: WIP super-blocks with 64 weights. Q4_K working on CUDA. ~10% slower on GTX-1660, 16% slower on 4080.
* k_quants: WIP super-blocks with 64 weights. Q2_K working on CUDA. ~3% slower on GTX-1660, 10% slower on 4080.
* k_quants: WIP super-blocks with 64 weights. Q3_K working on CUDA.
* k_quants: WIP super-blocks with 64 weights. Q5_K working on CUDA, and with this CUDA is done.
* k_quants: WIP super-blocks with 64 weights. Q6_K working on ARM_NEON.
* k_quants: WIP super-blocks with 64 weights. Q4_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights. Q2_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights. Q3_K working on ARM_NEON, but quite a bit slower than 256 weights.
* k_quants: WIP super-blocks with 64 weights. Q5_K working on ARM_NEON, but quite a bit slower than 256 weights. With that, we have full support for ARM_NEON, although performance is not quite there.
* k_quants: WIP super-blocks with 64 weights. Slightly more efficient Q3_K and Q5_K.
* k_quants: WIP super-blocks with 64 weights. Another small improvement for Q3_K and Q5_K on ARM_NEON.
* k_quants: WIP super-blocks with 64 weights. Yet another speedup for Q5_K on ARM_NEON. We are now within 10% of the QK_K = 256 version.
* k_quants: WIP super-blocks with 64 weights.
  * We are able to pass preprocessor macros to the Metal compiler (a sketch of this mechanism follows the commit message).
  * Q6_K works and is actually slightly more efficient than the QK_K = 256 version (25.2 ms vs 25.8 ms).
* k_quants: WIP super-blocks with 64 weights. Q4_K works on Metal and is actually slightly faster than QK_K = 256 (21.95 ms vs 24.0 ms).
* k_quants: WIP super-blocks with 64 weights. Q2_K works on Metal and is very slightly faster than QK_K = 256 (23.8 ms vs 24.2 ms).
* k_quants: WIP super-blocks with 64 weights. Q3_K works on Metal and is slightly faster than QK_K = 256 (26.6 ms vs 28.3 ms).
* k_quants: WIP super-blocks with 64 weights. Q5_K works on Metal and is slightly faster than QK_K = 256 (23.7 ms vs 26.3 ms).
* k_quants: call them _K, not _k, also on Metal
* k_quants: correctly define QK_K in llama.cpp
* Fixed bug in q4_K quantization added with the 64-block addition
* Simplify via lambda
* k_quants: switch Q3_K to 4-bit scales when QK_K = 64. Otherwise there isn't much benefit from this quantization type. There is some very slight loss in accuracy, but we reduce size by ~7%. E.g., for OpenLLaMA-3B, Q3_K_S perplexity is 8.6131 with 8-bit scales and 8.6352 with 4-bit, while file size decreases from 1.53G to 1.44G.
* k_quants: switch Q4_K to 4-bit scales when QK_K = 64. Here the loss in accuracy is greater than for Q3_K, but the Q4_K points still move further to the left on the perplexity vs size curve.
* k_quants: forgot to add the Metal changes in last commit
* k_quants: change Q5_K to be type 0 when QK_K = 64. Still needs AVX2 implementation.
* k_quants: AVX2 implementation for new 64-weight Q5_K
* k_quants: 10% faster ARM_NEON Q5_K dot product
* k_quants: fixed issue caused by merging with master

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
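The "pass preprocessor macros to the Metal compiler" point above is what lets a single ggml-metal.metal source serve both super-block sizes: when GGML_QKK_64 is defined at build time, QK_K is injected as a macro instead of relying on the shader's default. Below is a minimal standalone sketch of that mechanism using the standard MTLCompileOptions API; the function name and the src argument are placeholders for illustration, not llama.cpp code (the real change is in ggml_metal_init, shown in the diff further down).

    // Sketch: compile a Metal library with QK_K forced to 64 via a
    // preprocessor macro, mirroring the GGML_QKK_64 path in this commit.
    #import <Metal/Metal.h>

    id<MTLLibrary> compile_with_qkk64(id<MTLDevice> device, NSString *src, NSError **error) {
        MTLCompileOptions *options = [MTLCompileOptions new];
        // Equivalent to prepending "#define QK_K 64" to the shader source.
        options.preprocessorMacros = @{ @"QK_K" : @(64) };
        return [device newLibraryWithSource:src options:options error:error];
    }

Without GGML_QKK_64, the library is compiled with options:nil and QK_K keeps whatever default the shader source defines (256 in the regular build).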
Diffstat (limited to 'ggml-metal.m')
-rw-r--r--  ggml-metal.m  66
1 file changed, 36 insertions, 30 deletions
diff --git a/ggml-metal.m b/ggml-metal.m
index a7e104d..7551231 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -51,21 +51,21 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(get_rows_f16);
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
- GGML_METAL_DECL_KERNEL(get_rows_q2_k);
- GGML_METAL_DECL_KERNEL(get_rows_q3_k);
- GGML_METAL_DECL_KERNEL(get_rows_q4_k);
- GGML_METAL_DECL_KERNEL(get_rows_q5_k);
- GGML_METAL_DECL_KERNEL(get_rows_q6_k);
+ GGML_METAL_DECL_KERNEL(get_rows_q2_K);
+ GGML_METAL_DECL_KERNEL(get_rows_q3_K);
+ GGML_METAL_DECL_KERNEL(get_rows_q4_K);
+ GGML_METAL_DECL_KERNEL(get_rows_q5_K);
+ GGML_METAL_DECL_KERNEL(get_rows_q6_K);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
- GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
- GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
- GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
- GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
- GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
+ GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DECL_KERNEL(rope);
GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
exit(1);
}
+#ifdef GGML_QKK_64
+ MTLCompileOptions* options = [MTLCompileOptions new];
+ options.preprocessorMacros = @{ @"QK_K" : @(64) };
+ ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+#else
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
+#endif
if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
exit(1);
@@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) {
GGML_METAL_ADD_KERNEL(get_rows_f16);
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
- GGML_METAL_ADD_KERNEL(get_rows_q2_k);
- GGML_METAL_ADD_KERNEL(get_rows_q3_k);
- GGML_METAL_ADD_KERNEL(get_rows_q4_k);
- GGML_METAL_ADD_KERNEL(get_rows_q5_k);
- GGML_METAL_ADD_KERNEL(get_rows_q6_k);
+ GGML_METAL_ADD_KERNEL(get_rows_q2_K);
+ GGML_METAL_ADD_KERNEL(get_rows_q3_K);
+ GGML_METAL_ADD_KERNEL(get_rows_q4_K);
+ GGML_METAL_ADD_KERNEL(get_rows_q5_K);
+ GGML_METAL_ADD_KERNEL(get_rows_q6_K);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
- GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
- GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
- GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
- GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
- GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
+ GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_ADD_KERNEL(rope);
GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@@ -662,7 +668,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
} break;
case GGML_TYPE_Q3_K:
{
@@ -671,7 +677,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
} break;
case GGML_TYPE_Q4_K:
{
@@ -680,7 +686,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
} break;
case GGML_TYPE_Q5_K:
{
@@ -689,7 +695,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
} break;
case GGML_TYPE_Q6_K:
{
@@ -698,7 +704,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
} break;
default:
{
@@ -750,11 +756,11 @@ void ggml_metal_graph_compute(
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
- case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
- case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
- case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
- case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
- case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
+ case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
+ case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
+ case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
+ case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
+ case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
default: GGML_ASSERT(false && "not implemented");
}