ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)

* ggml : add Q8_0 quantization format (rename the old one to Q8_1)
* tests : fix test-quantize-fns
* ggml : finalize Q8_0 implementation
* ggml : use q4_0_q8_0 and q4_2_q8_0
* ggml : fix Q8_0 dot product bug (ARM)
* ggml : Q8_0 unroll x2
* ggml : fix bug - using wrong block type
* ggml : extend quantize_fns_t with "vec_dot_type"
* ggml : fix Q8_0 to use 255 values out of 256
* ggml : fix assert using wrong QK4_2 instead of QK4_3
author: Georgi Gerganov <ggerganov@gmail.com> 2023-04-25 23:40:51 +0300
committer: GitHub <noreply@github.com> 2023-04-25 23:40:51 +0300
commit: 7a32fcb3b29f4db8aed8a85dc58eb958fb118153 (patch)
tree: b363c851cead2b5d6efced33cc461e37d8ed6bf8 /ggml-cuda.cu
parent: dd0eabc049fb1efc631cab8eb0a646808d704e18 (diff)
1 files changed, 28 insertions, 0 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index fa511c1..f104ed5 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -37,6 +37,13 @@ typedef struct {
 } block_q4_3;
 static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
 
+#define QK8_0 32 // number of quantized values stored in one q8_0 block
+typedef struct {
+    float   d;              // delta: per-block scale; a dequantized value is qs[l]*d
+    int8_t  qs[QK8_0];      // quants: QK8_0 signed 8-bit quantized values
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); // the struct must be exactly one float plus QK8_0 bytes, i.e. no padding
+
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
@@ -131,6 +138,22 @@ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
     }
 }
 
+static __global__ void dequantize_block_q8_0(const void * vx, float * y) { // vx: array of block_q8_0 (read); y: float output, QK8_0 values per block (written)
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const int i = blockIdx.x; // each CUDA block processes the q8_0 block with the same index
+
+    const float d = x[i].d; // scale of this q8_0 block
+
+    const int8_t * pp = x[i].qs; // the QK8_0 quantized values of this q8_0 block
+
+    for (int l = 0; l < QK8_0; l++) { // plain sequential loop; threadIdx is not used, so all threads of a CUDA block would repeat the same work
+        const int8_t vi = pp[l];
+
+        y[i*QK8_0 + l] = vi*d; // dequantize: output = quantized value * scale
+    }
+}
+
 void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK4_0;
     dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -151,6 +174,11 @@ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t st
     dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
 }
 
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { // launches the q8_0 dequantize kernel for k values of vx into y on `stream`
+    const int nb = k / QK8_0; // number of whole q8_0 blocks; integer division drops any remainder of k
+    dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y); // nb CUDA blocks of 1 thread each, no dynamic shared memory; the launch result is not checked here
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 16
author	Georgi Gerganov <ggerganov@gmail.com>	2023-04-25 23:40:51 +0300
committer	GitHub <noreply@github.com>	2023-04-25 23:40:51 +0300
commit	7a32fcb3b29f4db8aed8a85dc58eb958fb118153 (patch)
tree	b363c851cead2b5d6efced33cc461e37d8ed6bf8 /ggml-cuda.cu
parent	dd0eabc049fb1efc631cab8eb0a646808d704e18 (diff)