about summary refs log tree commit diff
path: root/ggml-metal.m
diff options
context:
space:
mode:
author: Shouzheng Liu <lshzh.hi@gmail.com> 2023-07-20 06:32:22 -0400
committer: GitHub <noreply@github.com> 2023-07-20 13:32:22 +0300
commit: 417a85a0010519224cf154eb85d383ffeafeeead (patch)
tree: eb9b9668426c7318e2ab1389f04118e126752a8e /ggml-metal.m
parent: 294f424554c1599784ac9962462fc39ace92d8a5 (diff)
metal: minor q4 optimization and reduce code size (#2248)
* metal: use uint16_t instead of uint8_t. Apple GPUs do not handle uint8_t well: for every operation on a uint8_t, the GPU needs to copy the uint8_t into an empty 16-bit register before it can issue other instructions. For the matrix-vector multiplication kernel alone, we observed a 340–350 GB/s memory read speed on M1 Max after this commit, which is very close to the reported hardware limit.
* metal: update rms_norm kernel. This commit doubles the speed of rms_norm operations by using 512 threads per threadgroup, combined with SIMD primitives to minimize the need for threadgroup barriers.
* metal: use a template to reduce code size. Revert modifications on block_q4_0 and block_q4_1.
Diffstat (limited to 'ggml-metal.m')
-rw-r--r--  ggml-metal.m  4
1 file changed, 2 insertions, 2 deletions
diff --git a/ggml-metal.m b/ggml-metal.m
index ee205bc..d80a380 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -792,7 +792,7 @@ void ggml_metal_graph_compute(
const float eps = 1e-6f;
- const int nth = 256;
+ const int nth = 512;
[encoder setComputePipelineState:ctx->pipeline_rms_norm];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -800,7 +800,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
- [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+ [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
const int64_t nrows = ggml_nrows(src0);