author     Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>  2023-06-13 04:23:23 -0600
committer  GitHub <noreply@github.com>                                2023-06-13 04:23:23 -0600
commit     74d4cfa3438cb58bd177eed30014e6588694aaa8 (patch)
tree       8252d459812376411bcf8508ad7d1f059670601a /llama.cpp
parent     74a6d922f12ccfe16b0c265f43be8978c6f25e98 (diff)
Allow "quantizing" to f16 and f32 (#1787)
* Allow "quantizing" to f16 and f32 Fix an issue where quantizing didn't respect LLAMA_NO_K_QUANTS Add brief help to the list of quantization types in the quantize tool Ignore case for quantization type arguments in the quantize tool
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  27
1 files changed, 14 insertions, 13 deletions
diff --git a/llama.cpp b/llama.cpp
index f0f9124..c7a3336 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2298,7 +2298,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;

+#ifdef GGML_USE_K_QUANTS
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2309,6 +2312,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+#endif
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -2320,6 +2324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

+#ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
     int n_feed_forward_w2 = 0;
     for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2333,6 +2338,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
+#endif

     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2358,12 +2364,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
-
-        // uncomment this to keep the output layer in FP16
-        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
-            quantize = false;
-        }
-        quantize = quantize && quantized_type != tensor.type;
+        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+        quantize &= quantized_type != tensor.type;

         enum ggml_type new_type;
         void * new_data;
@@ -2377,29 +2379,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
} else {
new_type = quantized_type;
+#ifdef GGML_USE_K_QUANTS
if (tensor.name == "output.weight") {
- new_type = GGML_TYPE_Q6_K;
- }
- else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ new_type = GGML_TYPE_Q6_K;
+ } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
(i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_attention_wv;
- }
- if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
(i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
(i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_feed_forward_w2;
- }
- if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+ } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
}
+#endif
float * f32_data;
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);