From 31cfbb1013a482e89c72146e2063ac4362becae7 Mon Sep 17 00:00:00 2001
From: Tobias Lütke
Date: Wed, 5 Jul 2023 16:51:13 -0400
Subject: Expose generation timings from server & update completions.js (#2116)

* use javascript generators as much cleaner API

Also add ways to access completion as promise and EventSource

* export llama_timings as struct and expose them in server

* update readme, update baked includes

* llama : uniform variable names + struct init

---------

Co-authored-by: Georgi Gerganov
---
 examples/server/public/completion.js | 119 +++++++++++++++++++++++++-----
 examples/server/public/index.html    | 135 ++++++++++++++++++++---------------
 2 files changed, 181 insertions(+), 73 deletions(-)

(limited to 'examples/server/public')

diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js
index 4f5005c..a43d5a7 100644
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -5,20 +5,29 @@ const paramDefaults = {
   stop: ["</s>"]
 };
 
-/**
- * This function completes the input text using a llama dictionary.
- * @param {object} params - The parameters for the completion request.
- * @param {object} controller - an instance of AbortController if you need one, or null.
- * @param {function} callback - The callback function to call when the completion is done.
- * @returns {string} the completed text as a string. Ideally ignored, and you get at it via the callback.
- */
-export const llamaComplete = async (params, controller, callback) => {
+let generation_settings = null;
+
+
+// Completes the prompt as a generator. Recommended for most use cases.
+//
+// Example:
+//
+//    import { llama } from '/completion.js'
+//
+//    const request = llama("Tell me a joke", {n_predict: 800})
+//    for await (const chunk of request) {
+//      document.write(chunk.data.content)
+//    }
+//
+export async function* llama(prompt, params = {}, config = {}) {
+  let controller = config.controller;
+
   if (!controller) {
     controller = new AbortController();
   }
-  const completionParams = { ...paramDefaults, ...params };
 
-  // we use fetch directly here becasue the built in fetchEventSource does not support POST
+  const completionParams = { ...paramDefaults, ...params, prompt };
+
   const response = await fetch("/completion", {
     method: 'POST',
     body: JSON.stringify(completionParams),
@@ -36,7 +45,6 @@ export const llamaComplete = async (params, controller, callback) => {
   let content = "";
 
   try {
-
     let cont = true;
 
     while (cont) {
@@ -59,18 +67,21 @@ export const llamaComplete = async (params, controller, callback) => {
       result.data = JSON.parse(result.data);
       content += result.data.content;
 
-      // callack
-      if (callback) {
-        cont = callback(result) != false;
-      }
+      // yield
+      yield result;
 
       // if we got a stop token from server, we will break here
       if (result.data.stop) {
+        if (result.data.generation_settings) {
+          generation_settings = result.data.generation_settings;
+        }
         break;
       }
     }
   } catch (e) {
-    console.error("llama error: ", e);
+    if (e.name !== 'AbortError') {
+      console.error("llama error: ", e);
+    }
     throw e;
   }
   finally {
@@ -79,3 +90,79 @@ export const llamaComplete = async (params, controller, callback) => {
 
   return content;
 }
+
+// Call llama, return an event target that you can subcribe to
+//
+// Example:
+//
+//    import { llamaEventTarget } from '/completion.js'
+//
+//    const conn = llamaEventTarget(prompt)
+//    conn.addEventListener("message", (chunk) => {
+//      document.write(chunk.detail.content)
+//    })
+//
+export const llamaEventTarget = (prompt, params = {}, config = {}) => {
+  const eventTarget = new EventTarget();
+  (async () => {
+    let content = "";
+    for await (const chunk of llama(prompt, params, config)) {
+      if (chunk.data) {
+        content += chunk.data.content;
+        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
+      }
+      if (chunk.data.generation_settings) {
+        eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
+      }
+      if (chunk.data.timings) {
+        eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
+      }
+    }
+    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
+  })();
+  return eventTarget;
+}
+
+// Call llama, return a promise that resolves to the completed text. This does not support streaming
+//
+// Example:
+//
+//     llamaPromise(prompt).then((content) => {
+//       document.write(content)
+//     })
+//
+//     or
+//
+//     const content = await llamaPromise(prompt)
+//     document.write(content)
+//
+export const llamaPromise = (prompt, params = {}, config = {}) => {
+  return new Promise(async (resolve, reject) => {
+    let content = "";
+    try {
+      for await (const chunk of llama(prompt, params, config)) {
+        content += chunk.data.content;
+      }
+      resolve(content);
+    } catch (error) {
+      reject(error);
+    }
+  });
+};
+
+/**
+ * (deprecated)
+ */
+export const llamaComplete = async (params, controller, callback) => {
+  for await (const chunk of llama(params.prompt, params, { controller })) {
+    callback(chunk);
+  }
+}
+
+// Get the model info from the server. This is useful for getting the context window and so on.
+export const llamaModelInfo = async () => {
+  if (!generation_settings) {
+    generation_settings = await fetch("/model.json").then(r => r.json());
+  }
+  return generation_settings;
+}
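As a quick illustration (not part of the patch itself), the new generator-based llama() helper and the timings this change exposes could be consumed from a browser module roughly as sketched below. This is a sketch under assumptions: it presumes the examples/server HTTP server is running and serving completion.js, and the timing field name predicted_per_second is assumed from the server's timings object as updated by this commit rather than from a documented contract.

    // Illustrative usage sketch, not part of the commit.
    // Assumes the examples/server web server is reachable and serves /completion.js.
    import { llama, llamaModelInfo } from '/completion.js'

    async function demo() {
      let output = "";

      // Stream chunks from the server as they are generated.
      for await (const chunk of llama("Tell me a joke", { n_predict: 256 })) {
        output += chunk.data.content;

        // The final chunk (data.stop === true) should carry the generation timings;
        // the field name below is an assumption based on the server response.
        if (chunk.data.stop && chunk.data.timings) {
          console.log("tokens per second:", chunk.data.timings.predicted_per_second);
        }
      }

      console.log(output);

      // Generation settings / model info, cached from the last completion
      // or fetched from /model.json.
      console.log(await llamaModelInfo());
    }

    demo();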
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 6393e2e..8ace0b0 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -6,7 +6,6 @@
     llama.cpp - chat