From e986f94829bae0b9e66b326acbbba179931c84f1 Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sun, 2 Apr 2023 12:23:04 +0200
Subject: Added api for getting/setting the kv_cache (#685)

The api provides access methods for retrieving the current memory buffer for the kv_cache and its token number.
It also contains a method for setting the kv_cache from a memory buffer.

This makes it possible to load/save history - maybe support a --cache-prompt parameter as well?

Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
---
 llama.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index b0f53ca..8789071 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1668,6 +1668,33 @@ int llama_model_quantize(
     return 0;
 }
 
+// Returns a read-only pointer to the context's KV cache buffer, i.e. the
+// result of ctx->model.kv_self.buf.data(). See llama_get_kv_cache_size() for its length.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache buffer (kv_self.buf.size()), in bytes as consumed by llama_set_kv_cache()
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n; // current value of kv_self.n; presumably the number of tokens held in the cache
+}
+
+// Overwrites the context's KV cache buffer with n_size bytes copied from kv_cache and sets kv_self.n to n_token_count
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+               const uint8_t * kv_cache,
+                      size_t   n_size,
+                         int   n_token_count) {
+    // n_size must equal the existing buffer size exactly; the buffer is not resized here
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size); // kv_cache is not checked for NULL
+    ctx->model.kv_self.n = n_token_count; // stored as-is; no range check is performed
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,
-- 
cgit v1.2.3