author     Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>  2023-06-13 04:23:23 -0600
committer  GitHub <noreply@github.com>                                2023-06-13 04:23:23 -0600
commit     74d4cfa3438cb58bd177eed30014e6588694aaa8 (patch)
tree       8252d459812376411bcf8508ad7d1f059670601a /llama.cpp
parent     74a6d922f12ccfe16b0c265f43be8978c6f25e98 (diff)
Allow "quantizing" to f16 and f32 (#1787)
* Allow "quantizing" to f16 and f32 Fix an issue where quantizing didn't respect LLAMA_NO_K_QUANTS Add brief help to the list of quantization types in the quantize tool Ignore case for quantization type arguments in the quantize tool
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  27
1 files changed, 14 insertions, 13 deletions
diff --git a/llama.cpp b/llama.cpp
index f0f9124..c7a3336 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2298,7 +2298,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;

+#ifdef GGML_USE_K_QUANTS
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2309,6 +2312,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+#endif
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -2320,6 +2324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

+#ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
     int n_feed_forward_w2 = 0;
     for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2333,6 +2338,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
+#endif

     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2358,12 +2364,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
-
-        // uncomment this to keep the output layer in FP16
-        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
-            quantize = false;
-        }
-        quantize = quantize && quantized_type != tensor.type;
+        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+        quantize &= quantized_type != tensor.type;

         enum ggml_type new_type;
         void * new_data;
@@ -2377,29 +2379,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
} else {
new_type = quantized_type;
+#ifdef GGML_USE_K_QUANTS
if (tensor.name == "output.weight") {
- new_type = GGML_TYPE_Q6_K;
- }
- else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ new_type = GGML_TYPE_Q6_K;
+ } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
(i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_attention_wv;
- }
- if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
(i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
(i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_feed_forward_w2;
- }
- if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+ } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
}
+#endif
float * f32_data;
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);