ggml : add new Q4_2 quantization (ARM only) (#1046)

* ggml : Q4_2 ARM
* ggml : add ggml_is_quantized()
* llama : update llama_ftype_name() with Q4_2 entry
* ggml : speed-up q4_2
  - 4 threads: ~100ms -> ~90ms
  - 8 threads: ~55ms -> ~50ms
* ggml : optimize q4_2 using vmlaq_n_f32 + vmulq_n_f32
author: Georgi Gerganov <ggerganov@gmail.com> 2023-04-18 23:54:57 +0300
committer: GitHub <noreply@github.com> 2023-04-18 23:54:57 +0300
commit: 77a73403ca8eaced2590559d0f9cebd2b3649d32 (patch)
tree: 7b95e7565ce86b81d8dd620117564da901ce3ce7 /llama.cpp
parent: 50a8a2af97cb92e53e7a3195aa201c3d87da5415 (diff)
1 file changed, 9 insertions, 1 deletion
diff --git a/llama.cpp b/llama.cpp
index db71c03..f14324f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -478,6 +478,7 @@ struct llama_file_loader {
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
+                case GGML_TYPE_Q4_2:
                     break;
                 default: {
                     throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +551,7 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -838,6 +840,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         default:                      return "unknown, may not work";
     }
 }
@@ -1571,6 +1574,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
 
@@ -1644,6 +1648,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     {
                         new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
                     } break;
+                case GGML_TYPE_Q4_2:
+                    {
+                        new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                    } break;
                 default:
                     LLAMA_ASSERT(false);
             }
@@ -1955,7 +1963,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                 base_t = dest_t;
             }
 
-            if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+            if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) {
                 if (!warned) {
                     fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
                                     "use a f16 or f32 base model with --lora-base\n", __func__);
author	Georgi Gerganov <ggerganov@gmail.com>	2023-04-18 23:54:57 +0300
committer	GitHub <noreply@github.com>	2023-04-18 23:54:57 +0300
commit	77a73403ca8eaced2590559d0f9cebd2b3649d32 (patch)
tree	7b95e7565ce86b81d8dd620117564da901ce3ce7 /llama.cpp
parent	50a8a2af97cb92e53e7a3195aa201c3d87da5415 (diff)