diff options
author | Tobias Lütke <tobi@shopify.com> | 2023-07-05 16:51:13 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-05 16:51:13 -0400 |
commit | 31cfbb1013a482e89c72146e2063ac4362becae7 (patch) | |
tree | b6479860979f90d185a132efe883ae7fb62b07f5 /llama.cpp | |
parent | 983b555e9ddb36703cee4d22642afe958de093b7 (diff) |
Expose generation timings from server & update completions.js (#2116)
* use javascript generators as much cleaner API
Also add ways to access completion as promise and EventSource
* export llama_timings as struct and expose them in server
* update readme, update baked includes
* llama : uniform variable names + struct init
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 32 |
1 files changed, 22 insertions, 10 deletions
@@ -3480,23 +3480,35 @@ llama_token llama_token_nl() { return 13; } +struct llama_timings llama_get_timings(struct llama_context * ctx) { + struct llama_timings result = { + /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, + /*.t_end_ms =*/ 1.00 * ggml_time_ms(), + /*.t_load_ms =*/ 1e-3 * ctx->t_load_us, + /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us, + /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us, + /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, + + /*.n_sample =*/ std::max(1, ctx->n_sample), + /*.n_p_eval =*/ std::max(1, ctx->n_p_eval), + /*.n_eval =*/ std::max(1, ctx->n_eval), + }; -void llama_print_timings(struct llama_context * ctx) { - const int64_t t_end_us = ggml_time_us(); + return result; +} - const int32_t n_sample = std::max(1, ctx->n_sample); - const int32_t n_eval = std::max(1, ctx->n_eval); - const int32_t n_p_eval = std::max(1, ctx->n_p_eval); +void llama_print_timings(struct llama_context * ctx) { + const llama_timings timings = llama_get_timings(ctx); fprintf(stderr, "\n"); - fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); + fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample); + __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval); + __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval); - fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); + __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); + fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); } void llama_reset_timings(struct llama_context * ctx) { |