diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..ce3a347
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,36 @@
+# dependencies: the example executables link against the platform thread library
+find_package(Threads REQUIRED)
+# third-party
+# ... (none listed)
+# common: helper code shared by the example programs, built once as an object library
+set(TARGET common)
+add_library(${TARGET} OBJECT
+ common.h
+ common.cpp
+ )
+target_include_directories(${TARGET} PUBLIC .) # targets linking `common` get this directory on their include path
+target_compile_features(${TARGET} PUBLIC cxx_std_11) # C++11 requirement is propagated to consumers
+target_link_libraries(${TARGET} PRIVATE llama)
+# examples: one subdirectory per example program
+ add_subdirectory(main)
+ add_subdirectory(quantize)
+ add_subdirectory(perplexity)
+ add_subdirectory(embedding)
diff --git a/examples/common.cpp b/examples/common.cpp
new file mode 100644
index 0000000..afa7d40
--- /dev/null
+++ b/examples/common.cpp
@@ -0,0 +1,251 @@
+#include "common.h"
+#include "ggml.h"
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // using malloc.h with MSC/MINGW
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+ #include <alloca.h>
+ #endif
+// Parses the command line in argv[1..argc-1] into `params`.
+//
+// Before parsing, a default thread count is chosen: on Linux the number of
+// "processor" words in /proc/cpuinfo, and if the current value is 0, the hardware
+// concurrency reported by the standard library (at least 1).
+//
+// Options that take a value consume the following argv entry; numeric values are
+// converted with std::stoi / std::stof. A missing value, an unknown option or an
+// unreadable prompt file prints an error followed by the usage text and terminates
+// the process with exit(1). -h/--help prints the usage text and calls exit(0).
+// Returns true whenever it returns at all.
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "-s" || arg == "--seed") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(argv[i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-p" || arg == "--prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = argv[i];
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            // report an unreadable prompt file instead of silently continuing without its content
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            // drop a single trailing newline; back() must not be called on an empty string
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        } else if (arg == "-n" || arg == "--n_predict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_predict = std::stoi(argv[i]);
+        } else if (arg == "--top_k") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_k = std::stoi(argv[i]);
+        } else if (arg == "-c" || arg == "--ctx_size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--memory_f32") {
+            params.memory_f16 = false;
+        } else if (arg == "--top_p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_p = std::stof(argv[i]);
+        } else if (arg == "--temp") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.temp = std::stof(argv[i]);
+        } else if (arg == "--repeat_last_n") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_last_n = std::stoi(argv[i]);
+        } else if (arg == "--repeat_penalty") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_penalty = std::stof(argv[i]);
+        } else if (arg == "-b" || arg == "--batch_size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_batch = std::stoi(argv[i]);
+            // the batch size is capped at 512
+            params.n_batch = std::min(512, params.n_batch);
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-i" || arg == "--interactive") {
+            params.interactive = true;
+        } else if (arg == "--embedding") {
+            params.embedding = true;
+        } else if (arg == "--interactive-start") {
+            params.interactive = true;
+        } else if (arg == "--interactive-first") {
+            params.interactive_start = true;
+        } else if (arg == "-ins" || arg == "--instruct") {
+            params.instruct = true;
+        } else if (arg == "--color") {
+            params.use_color = true;
+        } else if (arg == "--mlock") {
+            params.use_mlock = true;
+        } else if (arg == "--mtest") {
+            params.mem_test = true;
+        } else if (arg == "--verbose_prompt" || arg == "--verbose-prompt") {
+            // the usage text advertises the dashed spelling; the underscore form is kept for compatibility
+            params.verbose_prompt = true;
+        } else if (arg == "-r" || arg == "--reverse-prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.antiprompt.push_back(argv[i]);
+        } else if (arg == "--perplexity") {
+            params.perplexity = true;
+        } else if (arg == "--ignore-eos") {
+            params.ignore_eos = true;
+        } else if (arg == "--n_parts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_parts = std::stoi(argv[i]);
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        } else if (arg == "--random-prompt") {
+            params.random_prompt = true;
+        } else if (arg == "--in-prefix") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.input_prefix = argv[i];
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, params);
+            exit(1);
+        }
+    }
+    // `arg` still holds the option whose value was missing or unusable
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(1);
+    }
+    return true;
+}
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { // writes the option summary to stderr; the defaults shown are read from `params`
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "options:\n");
+ fprintf(stderr, " -h, --help show this help message and exit\n");
+ fprintf(stderr, " -i, --interactive run in interactive mode\n");
+ fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
+ fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
+ fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
+ fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
+ fprintf(stderr, " specified more than once for multiple prompts).\n");
+ fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
+ fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
+ fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
+ fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
+ fprintf(stderr, " prompt to start generation with (default: empty)\n");
+ fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
+ fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
+ fprintf(stderr, " -f FNAME, --file FNAME\n");
+ fprintf(stderr, " prompt file to start generation.\n");
+ fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
+ fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
+ fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
+ fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
+ fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
+ fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
+ fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
+ fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
+ fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
+ fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
+ fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
+ fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
+ if (ggml_mlock_supported()) { // --mlock is only listed when ggml reports mlock support
+ fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
+ }
+ fprintf(stderr, " --mtest compute maximum memory usage\n");
+ fprintf(stderr, " --verbose-prompt print prompt before generation\n");
+ fprintf(stderr, " -m FNAME, --model FNAME\n");
+ fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
+ fprintf(stderr, "\n");
+// Returns one of ten fixed prompt openers, selected by the next draw from `rng` modulo 10.
+// Advances `rng` by exactly one draw.
+std::string gpt_random_prompt(std::mt19937 & rng) {
+    static const char * const k_openers[10] = {
+        "So", "Once upon a time", "When", "The", "After",
+        "If", "import", "He", "She", "They",
+    };
+    // rng() yields an unsigned value, so the remainder is always a valid index in [0, 9]
+    return k_openers[rng() % 10];
+}
+// TODO: not great allocating this every time
+// Convenience wrapper around the C-style llama_tokenize: tokenizes `text` using `ctx`
+// and returns the tokens as a vector. `add_bos` is forwarded unchanged to the C API.
+// If the C API reports a negative count, an empty vector is returned (after the
+// debug-build assert).
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+    // initialize to prompt number of chars, since n_tokens <= n_prompt_chars
+    std::vector<llama_token> res(text.size() + (int)add_bos);
+    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    assert(n >= 0);
+    // with NDEBUG the assert disappears; a negative n passed to resize() would become a huge size
+    res.resize(n < 0 ? 0 : n);
+    return res;
+}
diff --git a/examples/common.h b/examples/common.h
new file mode 100644
index 0000000..dede803
--- /dev/null
+++ b/examples/common.h
@@ -0,0 +1,64 @@
+// Various helper functions and utilities
+#pragma once
+#include "llama.h"
+#include <string>
+#include <vector>
+#include <random>
+#include <thread>
+// CLI argument parsing
+// Options shared by the example programs; filled in by gpt_params_parse().
+struct gpt_params {
+    int32_t seed          = -1;  // RNG seed
+    // default thread count: at most 4, or the reported hardware concurrency if lower (which may be 0)
+    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict     = 128; // new tokens to predict
+    int32_t repeat_last_n = 64;  // last n tokens to penalize
+    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 8;   // batch size for prompt processing
+    // sampling parameters
+    int32_t top_k = 40;
+    float   top_p = 0.95f;
+    float   temp  = 0.80f;
+    float   repeat_penalty = 1.10f;
+    std::string model        = "models/llama-7B/ggml-model.bin"; // model path
+    std::string prompt       = "";
+    std::string input_prefix = ""; // string to prefix user inputs with
+    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
+    bool random_prompt     = false; // replace the prompt with a randomly chosen opener
+    bool use_color         = false; // use color to distinguish generations and inputs
+    bool interactive       = false; // interactive mode
+    bool embedding         = false; // get only sentence embedding
+    bool interactive_start = false; // wait for user input immediately
+    bool instruct          = false; // instruction mode (used for Alpaca models)
+    bool ignore_eos        = false; // do not stop generating after eos
+    bool perplexity        = false; // compute perplexity over the prompt
+    bool use_mlock         = false; // use mlock to keep model in memory
+    bool mem_test          = false; // compute maximum memory usage
+    bool verbose_prompt    = false; // print prompt tokens before generation
+};
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+std::string gpt_random_prompt(std::mt19937 & rng);
+// Vocab utils
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt
new file mode 100644
index 0000000..88c425d
--- /dev/null
+++ b/examples/embedding/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET embedding)
+add_executable(${TARGET} embedding.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) # shared example helpers, the llama library and the thread library
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/embedding/README.md b/examples/embedding/README.md
new file mode 100644
index 0000000..21d8be6
--- /dev/null
+++ b/examples/embedding/README.md
@@ -0,0 +1,3 @@
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
new file mode 100644
index 0000000..3015293
--- /dev/null
+++ b/examples/embedding/embedding.cpp
@@ -0,0 +1,106 @@
+#include "common.h"
+#include "llama.h"
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <vector>
+// Entry point of the embedding example: parses the CLI options, loads the model with
+// embeddings enabled, tokenizes the prompt, evaluates it once and fetches the embedding
+// pointer (which is not yet printed or otherwise used).
+// Returns 0 on success and 1 if argument parsing, model loading or evaluation fails.
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+    // this tool always runs in embedding mode, whatever was passed on the command line
+    params.embedding = true;
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified); "
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+    llama_context * ctx;
+    // load the model
+    {
+        auto lparams = llama_context_default_params();
+        lparams.n_ctx      = params.n_ctx;
+        lparams.n_parts    = params.n_parts;
+        lparams.seed       = params.seed;
+        lparams.f16_kv     = params.memory_f16;
+        lparams.logits_all = params.perplexity;
+        lparams.use_mlock  = params.use_mlock;
+        lparams.embedding  = params.embedding;
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+    int n_past = 0;
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    if (params.verbose_prompt) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+        }
+        fprintf(stderr, "\n");
+    }
+    if (params.embedding){
+        if (embd_inp.size() > 0) {
+            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                // release the context on the error path as well
+                llama_free(ctx);
+                return 1;
+            }
+        }
+        const auto embeddings = llama_get_embeddings(ctx);
+        // TODO: print / use the embeddings
+        (void) embeddings;
+    }
+    llama_print_timings(ctx);
+    llama_free(ctx);
+    return 0;
+}
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
new file mode 100644
index 0000000..b2dcc29
--- /dev/null
+++ b/examples/main/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET main)
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) # shared example helpers, the llama library and the thread library
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/main/README.md b/examples/main/README.md
new file mode 100644
index 0000000..4701aa5
--- /dev/null
+++ b/examples/main/README.md
@@ -0,0 +1,3 @@
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
new file mode 100644
index 0000000..b5f1a7b
--- /dev/null
+++ b/examples/main/main.cpp
@@ -0,0 +1,445 @@
+#include "common.h"
+#include "llama.h"
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#include <signal.h>
+#if defined (_WIN32)
+#pragma comment(lib,"kernel32.lib")
+extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
+extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
+extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
+#define ANSI_COLOR_RED "\x1b[31m"
+#define ANSI_COLOR_GREEN "\x1b[32m"
+#define ANSI_COLOR_YELLOW "\x1b[33m"
+#define ANSI_COLOR_BLUE "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN "\x1b[36m"
+#define ANSI_COLOR_RESET "\x1b[0m"
+#define ANSI_BOLD "\x1b[1m"
+/* Keep track of current color of output, and emit ANSI code if it changes. */
+enum console_state {
+static console_state con_st = CONSOLE_STATE_DEFAULT;
+static bool con_use_color = false;
+void set_console_state(console_state new_st) // switches the tracked console state to new_st
+ if (!con_use_color) return; // does nothing unless color output was enabled
+ // only emit color code if state changed
+ if (new_st != con_st) {
+ con_st = new_st; // remember the state now in effect
+ switch(con_st) {
+ return;
+ return;
+ return;
+ }
+ }
+static bool is_interacting = false;
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+void sigint_handler(int signo) { // signal handler: the first SIGINT requests interactive input, a SIGINT while already interacting exits
+ set_console_state(CONSOLE_STATE_DEFAULT);
+ printf("\n"); // this also force flush stdout.
+ if (signo == SIGINT) {
+ if (!is_interacting) {
+ is_interacting=true; // picked up by the generation loop, which then asks the user for input
+ } else {
+ _exit(130); // terminate immediately with exit status 130
+ }
+ }
+// Entry point of the text-generation example.
+//
+// Flow: parse the CLI options, load the model, tokenize the prompt, then loop:
+//   1. evaluate any pending tokens with llama_eval,
+//   2. once the queued input is exhausted (and no user input is awaited), sample one token,
+//      otherwise forward up to n_batch queued input tokens,
+//   3. echo the text,
+//   4. in interactive mode, read more input from stdin when a reverse prompt was produced,
+//      Ctrl+C was pressed, or the token budget ran out.
+// Returns 1 if argument parsing, model loading or evaluation fails, 0 otherwise.
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+    if (params.perplexity) {
+        printf("\n************\n");
+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        printf("************\n\n");
+        return 0;
+    }
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified); "
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+    // save choice to use color for later
+    // (note for later: this is a slightly awkward choice)
+    con_use_color = params.use_color;
+//    params.prompt = R"(// this function checks if the number n is prime
+//bool is_prime(int n) {)";
+    llama_context * ctx;
+    // load the model
+    {
+        auto lparams = llama_context_default_params();
+        lparams.n_ctx     = params.n_ctx;
+        lparams.n_parts   = params.n_parts;
+        lparams.seed      = params.seed;
+        lparams.f16_kv    = params.memory_f16;
+        lparams.use_mlock = params.use_mlock;
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
+    // uncomment the "used_mem" line in llama.cpp to see the results
+    if (params.mem_test) {
+        {
+            const std::vector<llama_token> tmp(params.n_batch, 0);
+            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        }
+        {
+            const std::vector<llama_token> tmp = { 0, };
+            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+        }
+        llama_print_timings(ctx);
+        llama_free(ctx);
+        return 0;
+    }
+    int n_past = 0;
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    const int n_ctx = llama_n_ctx(ctx);
+    // never predict more tokens than still fit into the context after the prompt
+    params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size());
+    // prefix & suffix for instruct mode
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+    // in instruct mode, we inject a prefix and a suffix to each input by the user
+    if (params.instruct) {
+        params.interactive = true;
+        params.antiprompt.push_back("### Instruction:\n\n");
+    }
+    // enable interactive mode if reverse prompt is specified
+    if (params.antiprompt.size() != 0) {
+        params.interactive = true;
+    }
+    if (params.interactive_start) {
+        params.interactive = true;
+    }
+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+    if (params.verbose_prompt) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+        }
+        fprintf(stderr, "\n");
+    }
+    if (params.interactive) {
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = sigint_handler;
+        sigemptyset (&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+        signal(SIGINT, sigint_handler);
+#endif
+        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        if(params.antiprompt.size()) {
+            for (const auto & antiprompt : params.antiprompt) {
+                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
+            }
+        }
+        if (!params.input_prefix.empty()) {
+            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
+        }
+    }
+    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "\n\n");
+    std::vector<llama_token> embd;
+    // sliding window of the most recent tokens, used for the repeat penalty and reverse-prompt detection
+    int last_n_size = params.repeat_last_n;
+    std::vector<llama_token> last_n_tokens(last_n_size);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+    if (params.interactive) {
+        fprintf(stderr, "== Running in interactive mode. ==\n"
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+               " - Press Ctrl+C to interject at any time.\n"
+#endif
+               " - Press Return to return control to LLaMa.\n"
+               " - If you want to submit another line, end your input in '\\'.\n\n");
+        is_interacting = params.interactive_start || params.instruct;
+    }
+    int input_consumed = 0;
+    bool input_noecho = false;
+    int remaining_tokens = params.n_predict;
+#if defined (_WIN32)
+    if (params.use_color) {
+        // Enable ANSI colors on Windows 10+
+        unsigned long dwMode = 0;
+        void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
+        if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
+            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
+        }
+    }
+#endif
+    // the first thing we will do is to output the prompt, so set color accordingly
+    set_console_state(CONSOLE_STATE_PROMPT);
+    while (remaining_tokens > 0 || params.interactive) {
+        // predict
+        if (embd.size() > 0) {
+            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                // release the context on the error path as well
+                llama_free(ctx);
+                return 1;
+            }
+        }
+        n_past += embd.size();
+        embd.clear();
+        if ((int) embd_inp.size() <= input_consumed && !is_interacting) {
+            // out of user input, sample next token
+            const float top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp = params.temp;
+            const float repeat_penalty = params.repeat_penalty;
+            llama_token id = 0;
+            {
+                auto logits = llama_get_logits(ctx);
+                if (params.ignore_eos) {
+                    logits[llama_token_eos()] = 0;
+                }
+                id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty);
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
+            }
+            // replace end of text token with newline token when in interactive mode
+            if (id == llama_token_eos() && params.interactive && !params.instruct) {
+                id = llama_token_newline.front();
+                if (params.antiprompt.size() != 0) {
+                    // tokenize and inject first reverse prompt
+                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                }
+            }
+            // add it to the context
+            embd.push_back(id);
+            // echo this to console
+            input_noecho = false;
+            // decrement remaining sampling budget
+            --remaining_tokens;
+        } else {
+            // some user input remains from prompt or interaction, forward it to processing
+            while ((int) embd_inp.size() > input_consumed) {
+                embd.push_back(embd_inp[input_consumed]);
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(embd_inp[input_consumed]);
+                ++input_consumed;
+                if ((int) embd.size() >= params.n_batch) {
+                    break;
+                }
+            }
+        }
+        // display text
+        if (!input_noecho) {
+            for (auto id : embd) {
+                printf("%s", llama_token_to_str(ctx, id));
+            }
+            fflush(stdout);
+        }
+        // reset color to default if we there is no pending user input
+        if (!input_noecho && (int)embd_inp.size() == input_consumed) {
+            set_console_state(CONSOLE_STATE_DEFAULT);
+        }
+        // in interactive mode, and not currently processing queued inputs;
+        // check if we should prompt the user for more
+        if (params.interactive && (int) embd_inp.size() <= input_consumed) {
+            // check for reverse prompt
+            std::string last_output;
+            for (auto id : last_n_tokens) {
+                last_output += llama_token_to_str(ctx, id);
+            }
+            // Check if each of the reverse prompts appears at the end of the output.
+            for (std::string & antiprompt : params.antiprompt) {
+                if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
+                    is_interacting = true;
+                    set_console_state(CONSOLE_STATE_USER_INPUT);
+                    fflush(stdout);
+                    break;
+                }
+            }
+            if (n_past > 0 && is_interacting) {
+                // potentially set color to indicate we are taking user input
+                set_console_state(CONSOLE_STATE_USER_INPUT);
+                if (params.instruct) {
+                    input_consumed = embd_inp.size();
+                    embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
+                    printf("\n> ");
+                }
+                std::string buffer;
+                if (!params.input_prefix.empty()) {
+                    buffer += params.input_prefix;
+                    printf("%s", buffer.c_str());
+                }
+                std::string line;
+                bool another_line = true;
+                do {
+                    std::getline(std::cin, line);
+                    if (line.empty() || line.back() != '\\') {
+                        another_line = false;
+                    } else {
+                        line.pop_back(); // Remove the continue character
+                    }
+                    buffer += line + '\n'; // Append the line to the result
+                } while (another_line);
+                // done taking input, reset color
+                set_console_state(CONSOLE_STATE_DEFAULT);
+                auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                if (params.instruct) {
+                    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                }
+                remaining_tokens -= line_inp.size();
+                input_noecho = true; // do not echo this again
+            }
+            if (n_past > 0) {
+                is_interacting = false;
+            }
+        }
+        // end of text token
+        // embd is empty when nothing was sampled or forwarded this iteration; back() would be undefined then
+        if (!embd.empty() && embd.back() == llama_token_eos()) {
+            if (params.instruct) {
+                is_interacting = true;
+            } else {
+                fprintf(stderr, " [end of text]\n");
+                break;
+            }
+        }
+        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
+        if (params.interactive && remaining_tokens <= 0) {
+            remaining_tokens = params.n_predict;
+            is_interacting = true;
+        }
+    }
+#if defined (_WIN32)
+    signal(SIGINT, SIG_DFL);
+#endif
+    llama_print_timings(ctx);
+    llama_free(ctx);
+    set_console_state(CONSOLE_STATE_DEFAULT);
+    return 0;
+}
diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt
new file mode 100644
index 0000000..5836df8
--- /dev/null
+++ b/examples/perplexity/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET perplexity)
+add_executable(${TARGET} perplexity.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) # shared example helpers, the llama library and the thread library
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md
new file mode 100644
index 0000000..a932275
--- /dev/null
+++ b/examples/perplexity/README.md
@@ -0,0 +1,3 @@
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
new file mode 100644
index 0000000..f0266a0
--- /dev/null
+++ b/examples/perplexity/perplexity.cpp
@@ -0,0 +1,146 @@
+#include "common.h"
+#include "llama.h"
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+// Converts `logits` into a probability distribution of the same length.
+// The maximum logit is subtracted before exponentiation for numerical stability.
+// Returns an empty vector for empty input.
+std::vector<double> softmax(const std::vector<float>& logits) {
+    std::vector<double> probs(logits.size());
+    // nothing to normalize; also avoids reading logits[0] from an empty vector
+    if (logits.empty()) {
+        return probs;
+    }
+    float max_logit = logits[0];
+    for (float v : logits) max_logit = std::max(max_logit, v);
+    double sum_exp = 0.0;
+    for (size_t i = 0; i < logits.size(); i++) {
+        // Subtract the maximum logit value from the current logit value for numerical stability
+        float logit = logits[i] - max_logit;
+        double exp_logit = std::exp(logit);
+        sum_exp += exp_logit;
+        probs[i] = exp_logit;
+    }
+    // sum_exp >= 1 here because the maximum element contributes exp(0)
+    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
+    return probs;
+}
+// Computes the perplexity of params.prompt under the model in `ctx` and prints the
+// running value to stdout after every chunk.
+//
+// The tokenized prompt is split into tokens.size() / params.n_ctx chunks (a trailing
+// partial chunk is ignored). Each chunk is evaluated from an empty past, and the
+// negative log-likelihood is accumulated over the second half of the chunk.
+// Returns early, after printing an error to stderr, if evaluation fails.
+void perplexity(llama_context * ctx, const gpt_params & params) {
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+    int count = 0;
+    double nll = 0.0;
+    int seq_count = tokens.size() / params.n_ctx;
+    // queried once instead of once per scored token
+    const int n_vocab = llama_n_vocab(ctx);
+    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+    for (int i = 0; i < seq_count; ++i) {
+        int start = i * params.n_ctx;
+        int end = start + params.n_ctx - 1;
+        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+        auto start_t = std::chrono::high_resolution_clock::now();
+        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return;
+        }
+        auto end_t = std::chrono::high_resolution_clock::now();
+        // the first pass is timed to give a rough estimate for the whole run
+        if (i == 0) {
+            double seconds = std::chrono::duration<double>(end_t - start_t).count();
+            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
+        }
+        // We get the logits for all the tokens in the context window (params.n_ctx)
+        // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
+        // calculate the perplexity over the last half the window (so the model always has
+        // some context to predict the token).
+        //
+        // We rely on the fact that attention in the forward pass only looks at previous
+        // tokens here, so the logits returned for each token are an accurate representation
+        // of what the model would have predicted at that point.
+        //
+        // Example, we have a context window of 512, we will compute perplexity for each of the
+        // last 256 tokens. Then, we split the input up into context window size chunks to
+        // process the entire prompt.
+        auto logits = llama_get_logits(ctx);
+        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+            // Calculate probability of next token, given the previous ones.
+            std::vector<float> tok_logits(
+                logits + j * n_vocab,
+                logits + (j + 1) * n_vocab);
+            double prob = softmax(tok_logits)[tokens[start + j + 1]];
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        fflush(stdout);
+    }
+    printf("\n");
+}
+// Entry point of the perplexity example: parses the CLI options, loads the model with
+// logits for all tokens enabled, and runs perplexity() over the prompt.
+// Returns 0 on success and 1 if argument parsing or model loading fails.
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+    // this tool always computes perplexity; the flag is forwarded as logits_all below
+    params.perplexity = true;
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified); "
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+    llama_context * ctx;
+    // load the model
+    {
+        auto lparams = llama_context_default_params();
+        lparams.n_ctx      = params.n_ctx;
+        lparams.n_parts    = params.n_parts;
+        lparams.seed       = params.seed;
+        lparams.f16_kv     = params.memory_f16;
+        lparams.logits_all = params.perplexity;
+        lparams.use_mlock  = params.use_mlock;
+        lparams.embedding  = params.embedding;
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+    perplexity(ctx, params);
+    llama_print_timings(ctx);
+    llama_free(ctx);
+    return 0;
+}
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
new file mode 100644
index 0000000..fb27d45
--- /dev/null
+++ b/examples/quantize/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET quantize)
+add_executable(${TARGET} quantize.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) # only the llama library and the thread library; `common` is not linked here
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
new file mode 100644
index 0000000..f349e91
--- /dev/null
+++ b/examples/quantize/README.md
@@ -0,0 +1,3 @@
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
new file mode 100644
index 0000000..f0230f5
--- /dev/null
+++ b/examples/quantize/quantize.cpp
@@ -0,0 +1,60 @@
+#include "ggml.h"
+#include "llama.h"
+#include <cstdio>
+#include <string>
+const int QK = 32;
+// usage (the executable target built from this file is named `quantize`):
+// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
+// Entry point of the quantize example.
+// Expects exactly three arguments: input model path, output model path and the numeric
+// quantization type (the usage text lists 2 = q4_0 and 3 = q4_1).
+// Prints the quantization time and the total run time in milliseconds.
+// Returns 0 on success and 1 on a usage error or if quantization fails.
+int main(int argc, char ** argv) {
+    ggml_time_init();
+    if (argc != 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+        fprintf(stderr, " type = 2 - q4_0\n");
+        fprintf(stderr, " type = 3 - q4_1\n");
+        return 1;
+    }
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+    const int itype = atoi(argv[3]);
+    const int64_t t_main_start_us = ggml_time_us();
+    int64_t t_quantize_us = 0;
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+        t_quantize_us = ggml_time_us() - t_start_us;
+    }
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+        // divide in double: a float cannot represent large microsecond counts exactly
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
+        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+    return 0;
+}
+ return 0;