aboutsummaryrefslogtreecommitdiff
path: root/llama.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp 32
1 files changed, 22 insertions, 10 deletions
diff --git a/llama.cpp b/llama.cpp
index 7a866cb..02afdeb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3480,23 +3480,35 @@ llama_token llama_token_nl() {
return 13;
}
+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+ struct llama_timings result = {
+ /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+ /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+ /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+ /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+ /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+ /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
+
+ /*.n_sample =*/ std::max(1, ctx->n_sample),
+ /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+ /*.n_eval =*/ std::max(1, ctx->n_eval),
+ };
-void llama_print_timings(struct llama_context * ctx) {
- const int64_t t_end_us = ggml_time_us();
+ return result;
+}
- const int32_t n_sample = std::max(1, ctx->n_sample);
- const int32_t n_eval = std::max(1, ctx->n_eval);
- const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+void llama_print_timings(struct llama_context * ctx) {
+ const llama_timings timings = llama_get_timings(ctx);
fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
+ __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
}
void llama_reset_timings(struct llama_context * ctx) {