Expose generation timings from server & update completions.js (#2116)

* use javascript generators as much cleaner API Also add ways to access completion as promise and EventSource * export llama_timings as struct and expose them in server * update readme, update baked includes * llama : uniform variable names + struct init --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
author: Tobias Lütke <tobi@shopify.com> 2023-07-05 16:51:13 -0400
committer: GitHub <noreply@github.com> 2023-07-05 16:51:13 -0400
commit: 31cfbb1013a482e89c72146e2063ac4362becae7 (patch)
tree: b6479860979f90d185a132efe883ae7fb62b07f5 /llama.cpp
parent: 983b555e9ddb36703cee4d22642afe958de093b7 (diff)
1 files changed, 22 insertions, 10 deletions
diff --git a/llama.cpp b/llama.cpp
index 7a866cb..02afdeb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3480,23 +3480,35 @@ llama_token llama_token_nl() {
     return 13;
 }
 
+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+    struct llama_timings result = {
+        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
+        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
+        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
+
+        /*.n_sample =*/ std::max(1, ctx->n_sample),
+        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_eval   =*/ std::max(1, ctx->n_eval),
+    };
 
-void llama_print_timings(struct llama_context * ctx) {
-    const int64_t t_end_us = ggml_time_us();
+    return result;
+}
 
-    const int32_t n_sample = std::max(1, ctx->n_sample);
-    const int32_t n_eval   = std::max(1, ctx->n_eval);
-    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+void llama_print_timings(struct llama_context * ctx) {
+    const llama_timings timings = llama_get_timings(ctx);
 
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
     fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, 1e-3 * ctx->t_eval_us,   n_eval,   1e-3 * ctx->t_eval_us   / n_eval,   1e6 / ctx->t_eval_us   * n_eval);
-    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
+            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
author	Tobias Lütke <tobi@shopify.com>	2023-07-05 16:51:13 -0400
committer	GitHub <noreply@github.com>	2023-07-05 16:51:13 -0400
commit	31cfbb1013a482e89c72146e2063ac4362becae7 (patch)
tree	b6479860979f90d185a132efe883ae7fb62b07f5 /llama.cpp
parent	983b555e9ddb36703cee4d22642afe958de093b7 (diff)