diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2023-05-20 11:06:11 +0300 |
---|---|---|
committer | Georgi Gerganov <ggerganov@gmail.com> | 2023-05-20 11:06:37 +0300 |
commit | ec2e10c4443209da56b431b24dd0845b60e757fb (patch) | |
tree | 0a285ebbdd3efa99eb60042631ddd86ae6dedd00 /examples/quantize | |
parent | d2c59b8ba498ab01e65203dde6fe95236d20f6e7 (diff) |
llama : add llama_init_backend() API (close #1527)
Diffstat (limited to 'examples/quantize')
-rw-r--r-- | examples/quantize/quantize.cpp | 21 |
1 file changed, 7 insertions, 14 deletions
```diff
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 115d8fb..769dd36 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,7 +1,7 @@
-#include "ggml.h"
-#include "llama.h"
 #include "build-info.h"
 
+#include "llama.h"
+
 #include <cstdio>
 #include <map>
 #include <string>
@@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 // ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
-    ggml_time_init();
-
     if (argc < 3) {
         fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
         for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
@@ -52,12 +50,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
+    llama_init_backend();
 
     // parse command line arguments
     const std::string fname_inp = argv[1];
@@ -116,25 +109,25 @@ int main(int argc, char ** argv) {
     }
     fprintf(stderr, "\n");
 
-    const int64_t t_main_start_us = ggml_time_us();
+    const int64_t t_main_start_us = llama_time_us();
 
     int64_t t_quantize_us = 0;
 
     // load the model
     {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = llama_time_us();
 
         if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
 
-        t_quantize_us = ggml_time_us() - t_start_us;
+        t_quantize_us = llama_time_us() - t_start_us;
     }
 
     // report timing
     {
-        const int64_t t_main_end_us = ggml_time_us();
+        const int64_t t_main_end_us = llama_time_us();
 
         printf("\n");
         printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
```