Diffstat:
 -rw-r--r--  examples/quantize/quantize.cpp |  57
 -rw-r--r--  llama.cpp                      | 103
 -rw-r--r--  llama.h                        |  14
 3 files changed, 134 insertions, 40 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 947b402..c6bf1b7 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -3,6 +3,7 @@
#include "llama.h"
#include <cstdio>
+#include <cstring>
#include <map>
#include <string>
@@ -53,27 +54,49 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
// usage:
// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
//
+void usage(const char * executable) {
+ fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n", executable);
+ fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+ fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+ fprintf(stderr, "Allowed quantization types:\n");
+ for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+ fprintf(stderr, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
+ }
+ exit(1);
+}
+
int main(int argc, char ** argv) {
if (argc < 3) {
- fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
- for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
- fprintf(stderr, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
+ usage(argv[0]);
+ }
+
+ llama_model_quantize_params params = llama_model_quantize_default_params();
+
+ int arg_idx = 1;
+
+ for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
+ if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
+ params.quantize_output_tensor = false;
+ } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
+ params.allow_requantize = true;
+ } else {
+ usage(argv[0]);
}
- return 1;
+ }
+
+ if (argc - arg_idx < 3) {
+ usage(argv[0]);
}
llama_init_backend();
// parse command line arguments
- const std::string fname_inp = argv[1];
+ const std::string fname_inp = argv[arg_idx];
+ arg_idx++;
std::string fname_out;
- int nthread;
- llama_ftype ftype;
- int arg_idx = 2;
std::string ftype_str;
- if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
- // argv[2] is the ftype
+ if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
std::string fpath;
const size_t pos = fname_inp.find_last_of('/');
if (pos != std::string::npos) {
@@ -84,7 +107,6 @@ int main(int argc, char ** argv) {
arg_idx++;
}
else {
- // argv[2] is the output path
fname_out = argv[arg_idx];
arg_idx++;
@@ -92,8 +114,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: missing ftype\n", __func__);
return 1;
}
- // argv[3] is the ftype
- if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
+ if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
return 1;
}
@@ -103,21 +124,19 @@ int main(int argc, char ** argv) {
// parse nthreads
if (argc > arg_idx) {
try {
- nthread = std::stoi(argv[arg_idx]);
+ params.nthread = std::stoi(argv[arg_idx]);
}
catch (const std::exception & e) {
fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
return 1;
}
- } else {
- nthread = 0;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
- if (nthread > 0) {
- fprintf(stderr, " using %d threads", nthread);
+ if (params.nthread > 0) {
+ fprintf(stderr, " using %d threads", params.nthread);
}
fprintf(stderr, "\n");
@@ -129,7 +148,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = llama_time_us();
- if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
+ if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), &params)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
diff --git a/llama.cpp b/llama.cpp
index f40c5af..e100e2b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -886,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
return result;
}
+struct llama_model_quantize_params llama_model_quantize_default_params() {
+ struct llama_model_quantize_params result = {
+ /*.nthread =*/ 0,
+ /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+ /*.allow_requantize =*/ false,
+ /*.quantize_output_tensor =*/ true,
+ };
+
+ return result;
+}
+
bool llama_mmap_supported() {
return llama_mmap::SUPPORTED;
}
@@ -2231,9 +2242,70 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
// quantization
//
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+ if (output.size < nelements * sizeof(float)) {
+ output.resize(nelements * sizeof(float));
+ }
+ float * f32_output = (float *) output.addr;
+
+ quantize_fns_t qtype;
+ if (ggml_is_quantized(tensor.type)) {
+ qtype = ggml_internal_get_quantize_fn(tensor.type);
+ if (qtype.dequantize_row_q == NULL) {
+ throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+ }
+ } else if (tensor.type != GGML_TYPE_F16) {
+ throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+ }
+
+ if (nthread < 2) {
+ if (tensor.type == GGML_TYPE_F16) {
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+ } else if (ggml_is_quantized(tensor.type)) {
+ qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+ } else {
+ LLAMA_ASSERT(false); // unreachable
+ }
+ return;
+ }
+
+ auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+ auto block_size_bytes = ggml_type_size(tensor.type);
+
+ LLAMA_ASSERT(nelements % block_size == 0);
+ auto nblocks = nelements / block_size;
+ auto blocks_per_thread = nblocks / nthread;
+ auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+ std::vector<std::thread> workers;
+ for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+ auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+ auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+ auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+ auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+ if (typ == GGML_TYPE_F16) {
+ ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+ } else {
+ qtype.dequantize_row_q(inbuf, outbuf, nels);
+ }
+ };
+ workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+ in_buff_offs += thr_block_bytes;
+ out_buff_offs += thr_elems;
+ }
+ for (auto & worker : workers) {
+ worker.join();
+ }
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type quantized_type;
- switch (ftype) {
+ llama_ftype ftype = params->ftype;
+ int nthread = params->nthread;
+
+ switch (params->ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
@@ -2259,7 +2331,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
/*vocab_only*/ false));
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
@@ -2301,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= (tensor.ne.size() == 2);
// uncomment this to keep the output layer in FP16
- //if (tensor.name == "output.weight") {
- // quantize = false;
- //}
+ if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+ quantize = false;
+ }
+ quantize = quantize && quantized_type != tensor.type;
enum ggml_type new_type;
void * new_data;
@@ -2346,17 +2419,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
float * f32_data;
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
llama_buffer f32_conv_buf;
+
if (tensor.type == GGML_TYPE_F32) {
f32_data = (float *) tensor.data;
- } else if (tensor.type == GGML_TYPE_F16) {
- f32_conv_buf.resize(nelements * sizeof(float));
- f32_data = (float *) f32_conv_buf.addr;
- const auto * f16_data = (const ggml_fp16_t *) tensor.data;
- for (size_t i = 0; i < nelements; i++) {
- f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
- }
+ } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
} else {
- throw std::runtime_error(format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)));
+ llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+ f32_data = (float *) f32_conv_buf.addr;
}
printf("quantizing .. ");
@@ -2566,10 +2636,9 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
- enum llama_ftype ftype,
- int nthread) {
+ const llama_model_quantize_params *params) {
try {
- llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
+ llama_model_quantize_internal(fname_inp, fname_out, params);
return 0;
} catch (const std::exception & err) {
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
diff --git a/llama.h b/llama.h
index dc033b7..7c7fd48 100644
--- a/llama.h
+++ b/llama.h
@@ -115,7 +115,16 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
};
+ // model quantization parameters
+ typedef struct llama_model_quantize_params {
+ int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ enum llama_ftype ftype; // quantize to this llama_ftype
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ bool quantize_output_tensor; // quantize output.weight
+ } llama_model_quantize_params;
+
LLAMA_API struct llama_context_params llama_context_default_params();
+ LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
@@ -137,14 +146,11 @@ extern "C" {
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
- // TODO: not great API - very likely to change
// Returns 0 on success
- // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
- enum llama_ftype ftype,
- int nthread);
+ const llama_model_quantize_params * params);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
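
For reference, a minimal sketch of how a caller might use the parameter-struct API introduced by this diff (file names are placeholders and error handling is reduced to the return-code check; this is an illustrative usage example, not part of the commit):

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_init_backend();

    // Start from the defaults added in this diff (Q5_1, nthread = 0,
    // no requantize, output tensor quantized) and override as needed.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_0;
    params.nthread                = 4;     // <=0 uses std::thread::hardware_concurrency()
    params.allow_requantize       = false; // reject already-quantized inputs
    params.quantize_output_tensor = true;  // also quantize output.weight

    // Returns 0 on success, as documented in llama.h.
    if (llama_model_quantize("ggml-model-f16.bin", "ggml-model-q4_0.bin", &params)) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```

The equivalent command-line invocation with the new flags would look like `./quantize --allow-requantize model-q4_0.bin model-q5_1.bin q5_1 4` (paths and types are placeholders), per the usage() message added in examples/quantize/quantize.cpp.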