| field | value | date |
|---|---|---|
| author | Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> | 2023-06-10 01:59:17 -0600 |
| committer | GitHub <noreply@github.com> | 2023-06-10 10:59:17 +0300 |
| commit | 4f0154b0bad775ac4651bf73b5c216eb43c45cdc (patch) | |
| tree | 33a6036c589fd494af7de0cd786e395d4fd3f699 /llama.h | |
| parent | ef3171d16241c18581d4d08374f0b9e396ade6b7 (diff) | |
llama : support requantizing models instead of only allowing quantization from 16/32bit (#1691)
* Add support for quantizing already quantized models
* Threaded dequantizing and f16 to f32 conversion
* Clean up the thread block splitting and spares calculation a bit (a rough sketch follows this list)
* Use std::runtime_error exceptions.
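The threading changes themselves live in llama.cpp rather than in this header diff, so the following is only a generic sketch of the "spares" idea from the bullets above: split `nelements` units of work across `nthread` workers, giving the first `nelements % nthread` workers one extra unit. All names here are invented for illustration and are not the commit's actual code.

```cpp
#include <cstddef>
#include <thread>
#include <vector>

// Illustrative only: distribute `nelements` units of work over `nthread`
// workers. Each worker gets nelements / nthread units; the remainder
// ("spares") is spread one extra unit each over the first workers.
// Assumes nthread >= 1.
static void run_blocks(size_t nelements, int nthread,
                       void (*work)(size_t first, size_t count)) {
    const size_t per_thread = nelements / nthread;
    const size_t spares     = nelements % nthread;

    std::vector<std::thread> workers;
    size_t first = 0;
    for (int i = 0; i < nthread; ++i) {
        const size_t count = per_thread + (static_cast<size_t>(i) < spares ? 1 : 0);
        workers.emplace_back(work, first, count);
        first += count;
    }
    for (auto & t : workers) {
        t.join();
    }
}
```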
Diffstat (limited to 'llama.h')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llama.h | 14 |

1 file changed, 10 insertions, 4 deletions
```diff
@@ -115,7 +115,16 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
     };
 
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
     LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
@@ -137,14 +146,11 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    // TODO: not great API - very likely to change
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype,
-            int nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
```
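For context, a minimal caller sketch of the new API, following the defaults-then-override pattern the header suggests: start from `llama_model_quantize_default_params()`, set the fields you care about, and pass the struct to `llama_model_quantize()`. The file names are hypothetical placeholders and error handling is elided.

```cpp
#include "llama.h"

int main() {
    // Start from library defaults, then override the fields we care about.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                  = LLAMA_FTYPE_MOSTLY_Q6_K; // target quantization type
    params.nthread                = 0;    // <=0: use std::thread::hardware_concurrency()
    params.allow_requantize       = true; // input may already be quantized
    params.quantize_output_tensor = true; // also quantize output.weight

    // Returns 0 on success; both file names here are placeholders.
    return llama_model_quantize("ggml-model-q8_0.bin", "ggml-model-q6_k.bin", &params);
}
```

Passing the options as a struct pointer lets new fields (such as `allow_requantize`) be added later without breaking the function signature, which is why the old `ftype`/`nthread` arguments were folded into `llama_model_quantize_params`.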