path: root/llama.cpp
author      Georgi Gerganov <ggerganov@gmail.com>      2023-04-18 23:54:57 +0300
committer   GitHub <noreply@github.com>                2023-04-18 23:54:57 +0300
commit      77a73403ca8eaced2590559d0f9cebd2b3649d32 (patch)
tree        7b95e7565ce86b81d8dd620117564da901ce3ce7 /llama.cpp
parent      50a8a2af97cb92e53e7a3195aa201c3d87da5415 (diff)
ggml : add new Q4_2 quantization (ARM only) (#1046)
* ggml : Q4_2 ARM
* ggml : add ggml_is_quantized()
* llama : update llama_type_name() with Q4_2 entry
* ggml : speed-up q4_2
  - 4 threads: ~100ms -> ~90ms
  - 8 threads: ~55ms -> ~50ms
* ggml : optimize q4_2 using vmlaq_n_f32 + vmulq_n_f32
Diffstat (limited to 'llama.cpp')
-rw-r--r--   llama.cpp   10
1 file changed, 9 insertions(+), 1 deletion(-)
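
For context, a minimal sketch of what the new Q4_2 type plausibly looks like on the ggml side, inferred only from the commit message above. The block size, struct layout, and the ex_-prefixed names below are illustrative assumptions, not code from this commit; the real definitions live in ggml.h / ggml.c.

    #include <stdint.h>

    // Hypothetical stand-ins for the real ggml identifiers; the actual enum
    // values and the real block_q4_2 layout may differ.
    typedef enum {
        EX_TYPE_F32,
        EX_TYPE_F16,
        EX_TYPE_Q4_0,
        EX_TYPE_Q4_1,
        EX_TYPE_Q4_2,   // the quantization type added by this commit
    } ex_ggml_type;

    #define EX_QK4_2 16              // assumed block size: 16 weights per block

    typedef struct {
        uint16_t d;                  // per-block scale, fp16 stored as raw bits
        uint8_t  qs[EX_QK4_2 / 2];   // 4-bit quants, two per byte
    } ex_block_q4_2;

    // Sketch of the ggml_is_quantized() helper mentioned in the commit message:
    // true for the 4-bit block types, false for F32/F16.
    static int ex_is_quantized(ex_ggml_type type) {
        return type == EX_TYPE_Q4_0 || type == EX_TYPE_Q4_1 || type == EX_TYPE_Q4_2;
    }
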
diff --git a/llama.cpp b/llama.cpp
index db71c03..f14324f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -478,6 +478,7 @@ struct llama_file_loader {
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
break;
default: {
throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +551,7 @@ struct llama_file_saver {
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
break;
default: LLAMA_ASSERT(false);
}
@@ -838,6 +840,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
return "mostly Q4_1, some F16";
+ case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
default: return "unknown, may not work";
}
}
@@ -1571,6 +1574,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
default: throw format("invalid output file type %d\n", ftype);
};
@@ -1644,6 +1648,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
{
new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
} break;
+ case GGML_TYPE_Q4_2:
+ {
+ new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+ } break;
default:
LLAMA_ASSERT(false);
}
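
The new case mirrors the existing Q4_0/Q4_1 branches: ggml_quantize_q4_2 packs the f32 tensor data into 4-bit blocks in new_data, returns the resulting size in bytes, and fills the hist_cur histogram. As a rough illustration only (not the actual ggml_quantize_q4_2, which is SIMD-optimized on ARM and may use a different scaling rule), a scalar symmetric 4-bit quantizer for one 16-float block could look like this:

    #include <math.h>
    #include <stdint.h>

    // Illustrative scalar quantization of one 16-float block into 4-bit quants.
    static void ex_quantize_one_block(const float * x, uint8_t qs[8], float * d_out) {
        const int qk = 16;

        // find the largest magnitude in the block
        float amax = 0.0f;
        for (int i = 0; i < qk; ++i) {
            const float v = fabsf(x[i]);
            if (v > amax) amax = v;
        }

        // map [-amax, amax] onto the signed range [-7, 7]
        const float d  = amax / 7.0f;
        const float id = d != 0.0f ? 1.0f/d : 0.0f;
        *d_out = d;   // the real block type stores this scale as fp16

        // round each value to a signed nibble, shift to 0..15, pack two per byte
        for (int i = 0; i < qk/2; ++i) {
            const int8_t q0 = (int8_t)roundf(x[2*i + 0]*id);
            const int8_t q1 = (int8_t)roundf(x[2*i + 1]*id);
            qs[i] = (uint8_t)(q0 + 8) | (uint8_t)((q1 + 8) << 4);
        }
    }

A full quantizer would repeat this over all blocks of the tensor and accumulate each nibble value into the histogram that the call site above passes in.
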
@@ -1955,7 +1963,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
base_t = dest_t;
}
- if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+ if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) {
if (!warned) {
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
"use a f16 or f32 base model with --lora-base\n", __func__);