| author | Shouzheng Liu <lshzh.hi@gmail.com> | 2023-07-20 06:32:22 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-07-20 13:32:22 +0300 |
| commit | 417a85a0010519224cf154eb85d383ffeafeeead (patch) | |
| tree | eb9b9668426c7318e2ab1389f04118e126752a8e /ggml-metal.m | |
| parent | 294f424554c1599784ac9962462fc39ace92d8a5 (diff) | |
metal: minor q4 optimization and reduce code size (#2248)
* metal: use uint16_t instead of uint8_t.

Apple GPUs handle uint8_t poorly: for every operation on a uint8_t value,
the GPU must first copy it into an empty 16-bit register before it can
issue further instructions.

For the matrix-vector multiplication kernel alone, we observed a memory
read speed of 340~350 GB/s on M1 Max after this commit, which is very
close to the reported hardware limit.
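A minimal sketch of the access-pattern change, not the actual kernel (the real Q4 block layout, scales and offsets are omitted, and the kernel name is hypothetical): reading the packed quants through a uint16_t pointer gives the GPU native 16-bit register operands, whereas with a uint8_t pointer every value is first widened into a 16-bit register before any other instruction can consume it.

```metal
#include <metal_stdlib>
using namespace metal;

// Illustrative only: unpack four 4-bit quants per 16-bit load instead of
// issuing two 8-bit loads that each need a widening copy first.
kernel void unpack_nibbles_sketch(device const uint16_t * qs  [[buffer(0)]],
                                  device       float    * out [[buffer(1)]],
                                  uint i [[thread_position_in_grid]]) {
    const uint16_t w = qs[i];            // one 16-bit load carries four 4-bit quants
    out[4*i + 0] = float( w        & 0xFu);
    out[4*i + 1] = float((w >>  4) & 0xFu);
    out[4*i + 2] = float((w >>  8) & 0xFu);
    out[4*i + 3] = float( w >> 12        );
}
```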
* metal: update rms_norm kernel

This commit doubles the speed of rms_norm operations by using 512 threads
per threadgroup, combined with SIMD primitives to minimize the need for
threadgroup barriers.
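A hedged sketch of the reduction pattern described above, simplified to a single row (the real kernel also takes the row stride nb01, and names and buffer indices here are illustrative): each thread accumulates a partial sum of squares, simd_sum() folds it within a 32-lane simdgroup with no barrier, and only one partial per simdgroup is staged in threadgroup memory, so two barriers suffice for the whole reduction.

```metal
#include <metal_stdlib>
using namespace metal;

kernel void rms_norm_sketch(device const float   * x    [[buffer(0)]],
                            device       float   * y    [[buffer(1)]],
                            constant     int64_t & ne00 [[buffer(2)]],
                            constant     float   & eps  [[buffer(3)]],
                            threadgroup  float   * buf  [[threadgroup(0)]],
                            uint tpitg [[thread_position_in_threadgroup]],
                            uint ntg   [[threads_per_threadgroup]],
                            uint sgitg [[simdgroup_index_in_threadgroup]],
                            uint tiisg [[thread_index_in_simdgroup]]) {
    // per-thread partial sum of squares
    float sumf = 0.0f;
    for (int64_t i = tpitg; i < ne00; i += ntg) {
        sumf += x[i]*x[i];
    }

    sumf = simd_sum(sumf);                 // intra-simdgroup reduce, no barrier
    if (tiisg == 0) {
        buf[sgitg] = sumf;                 // one float per simdgroup: ntg/32 in total
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // the first simdgroup folds the ntg/32 partials into buf[0]
    float total = tpitg < ntg/32 ? buf[tpitg] : 0.0f;
    total = simd_sum(total);
    if (tpitg == 0) {
        buf[0] = total;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // rms_norm: scale by 1/sqrt(mean(x^2) + eps)
    const float scale = rsqrt(buf[0]/float(ne00) + eps);
    for (int64_t i = tpitg; i < ne00; i += ntg) {
        y[i] = x[i]*scale;
    }
}
```

Note that buf only ever holds ntg/32 floats, which is what the host-side allocation change in the diff below accounts for.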
* metal: use a template to reduce code size

Reverts the modifications to block_q4_0 and block_q4_1.
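A hedged illustration of the templating idea; struct layouts and names below are simplified stand-ins, not the real ggml-metal code, and byte-wise quant access is kept for clarity even though the commit's first change reads these through uint16_t. The shared matrix-vector loop is written once and parameterized by the block type, with an overloaded dequantize() supplying the per-format step, so q4_0 and q4_1 compile into two instantiations of one body instead of two hand-maintained kernels.

```metal
#include <metal_stdlib>
using namespace metal;

struct blk_q4_0 { half d;         uint8_t qs[16]; };  // scale + 32 packed 4-bit quants
struct blk_q4_1 { half d; half m; uint8_t qs[16]; };  // scale, min + quants

inline float dequantize(device const blk_q4_0 * b, int j) {
    const uint8_t q = b->qs[j % 16];
    return float(b->d) * (int(j < 16 ? (q & 0x0F) : (q >> 4)) - 8);
}

inline float dequantize(device const blk_q4_1 * b, int j) {
    const uint8_t q = b->qs[j % 16];
    return float(b->d) * int(j < 16 ? (q & 0x0F) : (q >> 4)) + float(b->m);
}

// single shared body; the dequantize() call resolves per block type
template <typename block_t>
inline float dot_row(device const block_t * row, device const float * y, int nb) {
    float acc = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        for (int j = 0; j < 32; ++j) {
            acc += dequantize(row + ib, j) * y[32*ib + j];
        }
    }
    return acc;
}

// one thin kernel per format; a q4_1 variant differs only in the pointer type
kernel void mul_mat_q4_0_sketch(device const blk_q4_0 * x   [[buffer(0)]],
                                device const float    * y   [[buffer(1)]],
                                device       float    * dst [[buffer(2)]],
                                constant     int      & nb  [[buffer(3)]],
                                uint row [[thread_position_in_grid]]) {
    dst[row] = dot_row(x + row*nb, y, nb);
}
```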
Diffstat (limited to 'ggml-metal.m')

-rw-r--r-- | ggml-metal.m | 4

1 file changed, 2 insertions, 2 deletions
```diff
diff --git a/ggml-metal.m b/ggml-metal.m
index ee205bc..d80a380 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -792,7 +792,7 @@ void ggml_metal_graph_compute(
     const float eps = 1e-6f;
 
-    const int nth = 256;
+    const int nth = 512;
 
     [encoder setComputePipelineState:ctx->pipeline_rms_norm];
     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -800,7 +800,7 @@ void ggml_metal_graph_compute(
     [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
     [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
     [encoder setBytes:&eps  length:sizeof( float)   atIndex:4];
-    [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+    [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
 
     const int64_t nrows = ggml_nrows(src0);
```
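The threadgroup memory change follows from the SIMD reduction described above: the kernel now stages only one partial per 32-thread simdgroup, so with nth = 512 it needs 512/32 = 16 floats (64 bytes) of threadgroup memory, compared with the 256 x 4 = 1 KiB that the previous nth*sizeof(float) allocation requested for 256 threads.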