llama : add llama_init_backend() API (close #1527)

author: Georgi Gerganov <ggerganov@gmail.com> 2023-05-20 11:06:11 +0300
committer: Georgi Gerganov <ggerganov@gmail.com> 2023-05-20 11:06:37 +0300
commit: ec2e10c4443209da56b431b24dd0845b60e757fb (patch)
tree: 0a285ebbdd3efa99eb60042631ddd86ae6dedd00 /examples/quantize
parent: d2c59b8ba498ab01e65203dde6fe95236d20f6e7 (diff)
1 file changed, 7 insertions, 14 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 115d8fb..769dd36 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,7 +1,7 @@
-#include "ggml.h"
-#include "llama.h"
 #include "build-info.h"
 
+#include "llama.h"
+
 #include <cstdio>
 #include <map>
 #include <string>
@@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 //  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
-    ggml_time_init();
-
     if (argc < 3) {
         fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
         for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
@@ -52,12 +50,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
+    llama_init_backend();
 
     // parse command line arguments
     const std::string fname_inp = argv[1];
@@ -116,25 +109,25 @@ int main(int argc, char ** argv) {
     }
     fprintf(stderr, "\n");
 
-    const int64_t t_main_start_us = ggml_time_us();
+    const int64_t t_main_start_us = llama_time_us();
 
     int64_t t_quantize_us = 0;
 
     // load the model
     {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = llama_time_us();
 
         if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
 
-        t_quantize_us = ggml_time_us() - t_start_us;
+        t_quantize_us = llama_time_us() - t_start_us;
     }
 
     // report timing
     {
-        const int64_t t_main_end_us = ggml_time_us();
+        const int64_t t_main_end_us = llama_time_us();
 
         printf("\n");
         printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
author	Georgi Gerganov <ggerganov@gmail.com>	2023-05-20 11:06:11 +0300
committer	Georgi Gerganov <ggerganov@gmail.com>	2023-05-20 11:06:37 +0300
commit	ec2e10c4443209da56b431b24dd0845b60e757fb (patch)
tree	0a285ebbdd3efa99eb60042631ddd86ae6dedd00 /examples/quantize
parent	d2c59b8ba498ab01e65203dde6fe95236d20f6e7 (diff)