author | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-25 20:26:40 +0200
---|---|---
committer | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-25 20:26:40 +0200
commit | a316a425d04027453dc0fd45f003b647c12f66f9 (patch)
tree | b33d7c55741f10f1cc84f489df05e1fad96f0417 /examples
parent | ecbe466a364876927994e2f1ec14f4d82301d201 (diff)
Overhaul the examples structure
- main -> examples
- utils -> examples (renamed to "common")
- quantize -> examples
- separate tools for "perplexity" and "embedding"
Hope I didn't break something!
Diffstat (limited to 'examples')
-rw-r--r-- | examples/CMakeLists.txt | 36
-rw-r--r-- | examples/common.cpp | 251
-rw-r--r-- | examples/common.h | 64
-rw-r--r-- | examples/embedding/CMakeLists.txt | 4
-rw-r--r-- | examples/embedding/README.md | 3
-rw-r--r-- | examples/embedding/embedding.cpp | 106
-rw-r--r-- | examples/main/CMakeLists.txt | 4
-rw-r--r-- | examples/main/README.md | 3
-rw-r--r-- | examples/main/main.cpp | 445
-rw-r--r-- | examples/perplexity/CMakeLists.txt | 4
-rw-r--r-- | examples/perplexity/README.md | 3
-rw-r--r-- | examples/perplexity/perplexity.cpp | 146
-rw-r--r-- | examples/quantize/CMakeLists.txt | 4
-rw-r--r-- | examples/quantize/README.md | 3
-rw-r--r-- | examples/quantize/quantize.cpp | 60
15 files changed, 1136 insertions, 0 deletions
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000..ce3a347 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,36 @@ +# dependencies + +find_package(Threads REQUIRED) + +# third-party + +# ... + +# common + +set(TARGET common) + +add_library(${TARGET} OBJECT + common.h + common.cpp + ) + +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +target_include_directories(${TARGET} PUBLIC .) +target_compile_features(${TARGET} PUBLIC cxx_std_11) +target_link_libraries(${TARGET} PRIVATE llama) + +# examples + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +if (EMSCRIPTEN) +else() + add_subdirectory(main) + add_subdirectory(quantize) + add_subdirectory(perplexity) + add_subdirectory(embedding) +endif() diff --git a/examples/common.cpp b/examples/common.cpp new file mode 100644 index 0000000..afa7d40 --- /dev/null +++ b/examples/common.cpp @@ -0,0 +1,251 @@ +#include "common.h" + +#include "ggml.h" + +#include <cassert> +#include <cstring> +#include <fstream> +#include <string> +#include <iterator> +#include <algorithm> + + #if defined(_MSC_VER) || defined(__MINGW32__) + #include <malloc.h> // using malloc.h with MSC/MINGW + #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) + #include <alloca.h> + #endif + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + // determine sensible default number of threads. + // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. +#ifdef __linux__ + std::ifstream cpuinfo("/proc/cpuinfo"); + params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo), + std::istream_iterator<std::string>(), + std::string("processor")); +#endif + if (params.n_threads == 0) { + params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); + } + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.seed = std::stoi(argv[i]); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } else if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.prompt = argv[i]; + } else if (arg == "-f" || arg == "--file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt)); + if (params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } else if (arg == "-n" || arg == "--n_predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } else if (arg == "--top_k") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_k = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx_size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--memory_f32") { + params.memory_f16 = false; + } else if (arg == "--top_p") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_p = std::stof(argv[i]); + } else if (arg == "--temp") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.temp = std::stof(argv[i]); + } else if (arg == "--repeat_last_n") { + if (++i >= argc) 
{ + invalid_param = true; + break; + } + params.repeat_last_n = std::stoi(argv[i]); + } else if (arg == "--repeat_penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_penalty = std::stof(argv[i]); + } else if (arg == "-b" || arg == "--batch_size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + } else if (arg == "--embedding") { + params.embedding = true; + } else if (arg == "--interactive-start") { + params.interactive = true; + } else if (arg == "--interactive-first") { + params.interactive_start = true; + } else if (arg == "-ins" || arg == "--instruct") { + params.instruct = true; + } else if (arg == "--color") { + params.use_color = true; + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--mtest") { + params.mem_test = true; + } else if (arg == "--verbose_prompt") { + params.verbose_prompt = true; + } else if (arg == "-r" || arg == "--reverse-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); + } else if (arg == "--perplexity") { + params.perplexity = true; + } else if (arg == "--ignore-eos") { + params.ignore_eos = true; + } else if (arg == "--n_parts") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parts = std::stoi(argv[i]); + } else if (arg == "-h" || arg == "--help") { + gpt_print_usage(argc, argv, params); + exit(0); + } else if (arg == "--random-prompt") { + params.random_prompt = true; + } else if (arg == "--in-prefix") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.input_prefix = argv[i]; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(1); + } + + return true; +} + +void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -i, --interactive run in interactive mode\n"); + fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n"); + fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); + fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n"); + fprintf(stderr, " specified more than once for multiple prompts).\n"); + fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stderr, " prompt to start generation with (default: empty)\n"); + fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); + fprintf(stderr, " --in-prefix STRING string to prefix user inputs 
with (default: empty)\n"); + fprintf(stderr, " -f FNAME, --file FNAME\n"); + fprintf(stderr, " prompt file to start generation.\n"); + fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); + fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); + fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); + fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n); + fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty); + fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n"); + fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); + fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); + fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); + fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); + if (ggml_mlock_supported()) { + fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + fprintf(stderr, " --mtest compute maximum memory usage\n"); + fprintf(stderr, " --verbose-prompt print prompt before generation\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, "\n"); +} + +std::string gpt_random_prompt(std::mt19937 & rng) { + const int r = rng() % 10; + switch (r) { + case 0: return "So"; + case 1: return "Once upon a time"; + case 2: return "When"; + case 3: return "The"; + case 4: return "After"; + case 5: return "If"; + case 6: return "import"; + case 7: return "He"; + case 8: return "She"; + case 9: return "They"; + default: return "To"; + } + + return "The"; +} + +// TODO: not great allocating this every time +std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { + // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars + std::vector<llama_token> res(text.size() + (int)add_bos); + int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); + assert(n >= 0); + res.resize(n); + + return res; +} diff --git a/examples/common.h b/examples/common.h new file mode 100644 index 0000000..dede803 --- /dev/null +++ b/examples/common.h @@ -0,0 +1,64 @@ +// Various helper functions and utilities + +#pragma once + +#include "llama.h" + +#include <string> +#include <vector> +#include <random> +#include <thread> + +// +// CLI argument parsing +// + +struct gpt_params { + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_predict = 128; // new tokens to predict + int32_t repeat_last_n = 64; // last n tokens to penalize + int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) + int32_t n_ctx = 512; // context size + int32_t n_batch = 8; // batch size for prompt processing + + // sampling parameters + int32_t top_k = 40; + float top_p = 0.95f; + float temp = 0.80f; + float repeat_penalty = 1.10f; + + std::string model = "models/lamma-7B/ggml-model.bin"; // model path + std::string prompt = ""; + std::string input_prefix = ""; // 
string to prefix user inputs with + + + std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted + + bool memory_f16 = true; // use f16 instead of f32 for memory kv + bool random_prompt = false; // do not randomize prompt if none provided + bool use_color = false; // use color to distinguish generations and inputs + bool interactive = false; // interactive mode + + bool embedding = false; // get only sentence embedding + bool interactive_start = false; // wait for user input immediately + + bool instruct = false; // instruction mode (used for Alpaca models) + bool ignore_eos = false; // do not stop generating after eos + bool perplexity = false; // compute perplexity over the prompt + bool use_mlock = false; // use mlock to keep model in memory + bool mem_test = false; // compute maximum memory usage + bool verbose_prompt = false; // print prompt tokens before generation +}; + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params); + +void gpt_print_usage(int argc, char ** argv, const gpt_params & params); + +std::string gpt_random_prompt(std::mt19937 & rng); + +// +// Vocab utils +// + +std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos); diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt new file mode 100644 index 0000000..88c425d --- /dev/null +++ b/examples/embedding/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET embedding) +add_executable(${TARGET} embedding.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md new file mode 100644 index 0000000..21d8be6 --- /dev/null +++ b/examples/embedding/README.md @@ -0,0 +1,3 @@ +# embedding
+
+TODO
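
The embedding README above is still a TODO. For orientation, the sketch below shows the skeleton that all four relocated tools now share: parse flags through the new `common` layer, load a model, tokenize, clean up. It is a minimal, hypothetical example (the default model path is the same placeholder the tools use), not part of this commit, and it only uses helpers that appear in `common.h` and `llama.h` in this diff; the real implementations (`embedding.cpp`, `main.cpp`) follow below.

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin"; // placeholder path

    // shared CLI parsing from examples/common.cpp
    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    llama_context * ctx;
    {
        auto lparams = llama_context_default_params();

        lparams.n_ctx     = params.n_ctx;
        lparams.seed      = params.seed;
        lparams.f16_kv    = params.memory_f16;
        lparams.use_mlock = params.use_mlock;

        ctx = llama_init_from_file(params.model.c_str(), lparams);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model '%s'\n", params.model.c_str());
            return 1;
        }
    }

    // match the tokenizer behaviour used by the tools: leading space + BOS token
    params.prompt.insert(0, 1, ' ');
    const auto tokens = ::llama_tokenize(ctx, params.prompt, true);
    fprintf(stderr, "prompt has %zu tokens\n", tokens.size());

    llama_free(ctx);
    return 0;
}
```
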
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp new file mode 100644 index 0000000..3015293 --- /dev/null +++ b/examples/embedding/embedding.cpp @@ -0,0 +1,106 @@ +#include "common.h" +#include "llama.h" + +#include <cassert> +#include <cinttypes> +#include <cmath> +#include <cstdio> +#include <cstring> +#include <fstream> +#include <string> +#include <vector> + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + params.embedding = true; + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; + lparams.embedding = params.embedding; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + int n_past = 0; + + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + // tokenize the prompt + auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "\n"); + } + + if (params.embedding){ + if (embd_inp.size() > 0) { + if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + } + + const auto embeddings = llama_get_embeddings(ctx); + + // TODO: print / use the embeddings + } + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; +} diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt new file mode 100644 index 0000000..b2dcc29 --- /dev/null +++ b/examples/main/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET main) +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/README.md b/examples/main/README.md new file mode 100644 index 0000000..4701aa5 --- /dev/null +++ b/examples/main/README.md @@ -0,0 +1,3 @@ +# main
+
+TODO
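
The main README is likewise a TODO. Before reading the full `main.cpp` below, which interleaves generation with interactive, instruct, and console-color handling, a distilled sketch of the core predict/sample loop may help. This is a simplified illustration built only from calls that appear in this commit (`llama_eval`, `llama_sample_top_p_top_k`, `llama_token_eos`, `llama_token_to_str`); it omits prompt batching, reverse prompts, and the repetition window priming that the real code performs.

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <vector>

// Skeleton of the predict/sample loop; `ctx` is a context loaded as in the
// previous sketch and `params` comes from gpt_params_parse().
static void generate(llama_context * ctx, const gpt_params & params) {
    // the tools prepend a space to match the original LLaMA tokenizer behaviour
    auto embd = ::llama_tokenize(ctx, " " + params.prompt, true);

    // window of recent tokens used for the repetition penalty
    std::vector<llama_token> last_n_tokens(params.repeat_last_n, 0);

    int n_past = 0;
    for (int i = 0; i < params.n_predict; i++) {
        // evaluate whatever is pending: the whole prompt first, then one token at a time
        if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
            fprintf(stderr, "failed to eval\n");
            return;
        }
        n_past += (int) embd.size();

        const llama_token id = llama_sample_top_p_top_k(
                ctx, last_n_tokens.data(), last_n_tokens.size(),
                params.top_k, params.top_p, params.temp, params.repeat_penalty);

        if (id == llama_token_eos()) {
            break; // end of text
        }

        last_n_tokens.erase(last_n_tokens.begin());
        last_n_tokens.push_back(id);

        printf("%s", llama_token_to_str(ctx, id));
        fflush(stdout);

        embd = { id }; // only the newly sampled token needs evaluating next time
    }
    printf("\n");
}
```
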
diff --git a/examples/main/main.cpp b/examples/main/main.cpp new file mode 100644 index 0000000..b5f1a7b --- /dev/null +++ b/examples/main/main.cpp @@ -0,0 +1,445 @@ +#include "common.h" +#include "llama.h" + +#include <cassert> +#include <cinttypes> +#include <cmath> +#include <cstdio> +#include <cstring> +#include <fstream> +#include <iostream> +#include <string> +#include <vector> + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include <signal.h> +#include <unistd.h> +#elif defined (_WIN32) +#include <signal.h> +#endif + +#if defined (_WIN32) +#pragma comment(lib,"kernel32.lib") +extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle); +extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode); +extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode); +#endif + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" +#define ANSI_BOLD "\x1b[1m" + +/* Keep track of current color of output, and emit ANSI code if it changes. */ +enum console_state { + CONSOLE_STATE_DEFAULT=0, + CONSOLE_STATE_PROMPT, + CONSOLE_STATE_USER_INPUT +}; + +static console_state con_st = CONSOLE_STATE_DEFAULT; +static bool con_use_color = false; + +void set_console_state(console_state new_st) +{ + if (!con_use_color) return; + // only emit color code if state changed + if (new_st != con_st) { + con_st = new_st; + switch(con_st) { + case CONSOLE_STATE_DEFAULT: + printf(ANSI_COLOR_RESET); + return; + case CONSOLE_STATE_PROMPT: + printf(ANSI_COLOR_YELLOW); + return; + case CONSOLE_STATE_USER_INPUT: + printf(ANSI_BOLD ANSI_COLOR_GREEN); + return; + } + } +} + +static bool is_interacting = false; + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +void sigint_handler(int signo) { + set_console_state(CONSOLE_STATE_DEFAULT); + printf("\n"); // this also force flush stdout. 
+ if (signo == SIGINT) { + if (!is_interacting) { + is_interacting=true; + } else { + _exit(130); + } + } +} +#endif + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.perplexity) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + con_use_color = params.use_color; + +// params.prompt = R"(// this function checks if the number n is prime +//bool is_prime(int n) {)"; + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.use_mlock = params.use_mlock; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters + // uncomment the "used_mem" line in llama.cpp to see the results + if (params.mem_test) { + { + const std::vector<llama_token> tmp(params.n_batch, 0); + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + } + + { + const std::vector<llama_token> tmp = { 0, }; + llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); + } + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; + } + + int n_past = 0; + + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + // tokenize the prompt + auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + + const int n_ctx = llama_n_ctx(ctx); + + params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size()); + + // prefix & suffix for instruct mode + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + + // in instruct mode, we inject a prefix and a suffix to each input by the user + if (params.instruct) { + params.interactive = true; + params.antiprompt.push_back("### Instruction:\n\n"); + } + + // enable interactive mode if reverse prompt is specified + if (params.antiprompt.size() != 0) { + params.interactive = true; + } + + if (params.interactive_start) { + params.interactive = true; + } + + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, 
params.prompt.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "\n"); + } + + if (params.interactive) { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + signal(SIGINT, sigint_handler); +#endif + + fprintf(stderr, "%s: interactive mode on.\n", __func__); + + if(params.antiprompt.size()) { + for (auto antiprompt : params.antiprompt) { + fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); + } + } + + if (!params.input_prefix.empty()) { + fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); + } + } + fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(stderr, "\n\n"); + + std::vector<llama_token> embd; + + + int last_n_size = params.repeat_last_n; + std::vector<llama_token> last_n_tokens(last_n_size); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + if (params.interactive) { + fprintf(stderr, "== Running in interactive mode. ==\n" +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + " - Press Ctrl+C to interject at any time.\n" +#endif + " - Press Return to return control to LLaMa.\n" + " - If you want to submit another line, end your input in '\\'.\n\n"); + is_interacting = params.interactive_start || params.instruct; + } + + int input_consumed = 0; + bool input_noecho = false; + + int remaining_tokens = params.n_predict; + +#if defined (_WIN32) + if (params.use_color) { + // Enable ANSI colors on Windows 10+ + unsigned long dwMode = 0; + void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11) + if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) { + SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) + } + } +#endif + // the first thing we will do is to output the prompt, so set color accordingly + set_console_state(CONSOLE_STATE_PROMPT); + + while (remaining_tokens > 0 || params.interactive) { + // predict + if (embd.size() > 0) { + if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + } + + n_past += embd.size(); + embd.clear(); + + if ((int) embd_inp.size() <= input_consumed && !is_interacting) { + // out of user input, sample next token + const float top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + const float repeat_penalty = params.repeat_penalty; + + llama_token id = 0; + + { + auto logits = llama_get_logits(ctx); + + if (params.ignore_eos) { + logits[llama_token_eos()] = 0; + } + + id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + } + + // replace end of text token with newline token when in interactive mode + if (id == llama_token_eos() && params.interactive && !params.instruct) { + id = llama_token_newline.front(); + if 
(params.antiprompt.size() != 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + } + } + + // add it to the context + embd.push_back(id); + + // echo this to console + input_noecho = false; + + // decrement remaining sampling budget + --remaining_tokens; + } else { + // some user input remains from prompt or interaction, forward it to processing + while ((int) embd_inp.size() > input_consumed) { + embd.push_back(embd_inp[input_consumed]); + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(embd_inp[input_consumed]); + ++input_consumed; + if ((int) embd.size() >= params.n_batch) { + break; + } + } + } + + // display text + if (!input_noecho) { + for (auto id : embd) { + printf("%s", llama_token_to_str(ctx, id)); + } + fflush(stdout); + } + // reset color to default if we there is no pending user input + if (!input_noecho && (int)embd_inp.size() == input_consumed) { + set_console_state(CONSOLE_STATE_DEFAULT); + } + + // in interactive mode, and not currently processing queued inputs; + // check if we should prompt the user for more + if (params.interactive && (int) embd_inp.size() <= input_consumed) { + // check for reverse prompt + std::string last_output; + for (auto id : last_n_tokens) { + last_output += llama_token_to_str(ctx, id); + } + + // Check if each of the reverse prompts appears at the end of the output. + for (std::string & antiprompt : params.antiprompt) { + if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { + is_interacting = true; + set_console_state(CONSOLE_STATE_USER_INPUT); + fflush(stdout); + break; + } + } + + if (n_past > 0 && is_interacting) { + // potentially set color to indicate we are taking user input + set_console_state(CONSOLE_STATE_USER_INPUT); + + if (params.instruct) { + input_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); + + printf("\n> "); + } + + std::string buffer; + if (!params.input_prefix.empty()) { + buffer += params.input_prefix; + printf("%s", buffer.c_str()); + } + + std::string line; + bool another_line = true; + do { + std::getline(std::cin, line); + if (line.empty() || line.back() != '\\') { + another_line = false; + } else { + line.pop_back(); // Remove the continue character + } + buffer += line + '\n'; // Append the line to the result + } while (another_line); + + // done taking input, reset color + set_console_state(CONSOLE_STATE_DEFAULT); + + auto line_inp = ::llama_tokenize(ctx, buffer, false); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + + if (params.instruct) { + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + } + + remaining_tokens -= line_inp.size(); + + input_noecho = true; // do not echo this again + } + + if (n_past > 0) { + is_interacting = false; + } + } + + // end of text token + if (embd.back() == llama_token_eos()) { + if (params.instruct) { + is_interacting = true; + } else { + fprintf(stderr, " [end of text]\n"); + break; + } + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. 
+ if (params.interactive && remaining_tokens <= 0) { + remaining_tokens = params.n_predict; + is_interacting = true; + } + } + +#if defined (_WIN32) + signal(SIGINT, SIG_DFL); +#endif + + llama_print_timings(ctx); + llama_free(ctx); + + set_console_state(CONSOLE_STATE_DEFAULT); + + return 0; +} diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt new file mode 100644 index 0000000..5836df8 --- /dev/null +++ b/examples/perplexity/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET perplexity) +add_executable(${TARGET} perplexity.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md new file mode 100644 index 0000000..a932275 --- /dev/null +++ b/examples/perplexity/README.md @@ -0,0 +1,3 @@ +# perplexity
+
+TODO
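
The perplexity README is also a TODO. For reference, the quantity the tool below reports is standard token-level perplexity over N scored tokens:

$$ \mathrm{PPL} = \exp\!\left(\frac{1}{N}\sum_{i=1}^{N} -\log p(x_i \mid x_{<i})\right) $$

In the code this corresponds to accumulating `nll += -std::log(prob)` and printing `std::exp(nll / count)`. The sum runs only over the second half of each `n_ctx`-sized chunk, so every scored token has at least `n_ctx / 2` tokens of context.
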
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp new file mode 100644 index 0000000..f0266a0 --- /dev/null +++ b/examples/perplexity/perplexity.cpp @@ -0,0 +1,146 @@ +#include "common.h" +#include "llama.h" + +#include <cassert> +#include <cinttypes> +#include <cmath> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> + +std::vector<double> softmax(const std::vector<float>& logits) { + std::vector<double> probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) max_logit = std::max(max_logit, v); + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + float logit = logits[i] - max_logit; + double exp_logit = std::exp(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; + return probs; +} + +void perplexity(llama_context * ctx, const gpt_params & params) { + // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research + // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` + // Output: `perplexity: 13.5106 [114/114]` + auto tokens = ::llama_tokenize(ctx, params.prompt, true); + + int count = 0; + double nll = 0.0; + int seq_count = tokens.size() / params.n_ctx; + + fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); + + for (int i = 0; i < seq_count; ++i) { + int start = i * params.n_ctx; + int end = start + params.n_ctx - 1; + std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end); + auto start_t = std::chrono::high_resolution_clock::now(); + if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + auto end_t = std::chrono::high_resolution_clock::now(); + if (i == 0) { + double seconds = std::chrono::duration<double>(end_t - start_t).count(); + printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); + } + // We get the logits for all the tokens in the context window (params.n_ctx) + // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, + // calculate the perplexity over the last half the window (so the model always has + // some context to predict the token). + // + // We rely on the fact that attention in the forward pass only looks at previous + // tokens here, so the logits returned for each token are an accurate representation + // of what the model would have predicted at that point. + // + // Example, we have a context window of 512, we will compute perplexity for each of the + // last 256 tokens. Then, we split the input up into context window size chunks to + // process the entire prompt. + + auto logits = llama_get_logits(ctx); + for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) { + // Calculate probability of next token, given the previous ones. 
+ int n_vocab = llama_n_vocab(ctx); + std::vector<float> tok_logits( + logits + j * n_vocab, + logits + (j + 1) * n_vocab); + double prob = softmax(tok_logits)[tokens[start + j + 1]]; + nll += -std::log(prob); + ++count; + } + // perplexity is e^(average negative log-likelihood) + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + fflush(stdout); + } + printf("\n"); +} + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + params.perplexity = true; + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; + lparams.embedding = params.embedding; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + perplexity(ctx, params); + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; +} diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt new file mode 100644 index 0000000..fb27d45 --- /dev/null +++ b/examples/quantize/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET quantize) +add_executable(${TARGET} quantize.cpp) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/README.md b/examples/quantize/README.md new file mode 100644 index 0000000..f349e91 --- /dev/null +++ b/examples/quantize/README.md @@ -0,0 +1,3 @@ +# quantize + +TODO diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp new file mode 100644 index 0000000..f0230f5 --- /dev/null +++ b/examples/quantize/quantize.cpp @@ -0,0 +1,60 @@ +#include "ggml.h" +#include "llama.h" + +#include <cstdio> +#include <string> + +const int QK = 32; + +// usage: +// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + ggml_time_init(); + + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + fprintf(stderr, " type = 2 - q4_0\n"); + fprintf(stderr, " type = 3 - q4_1\n"); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const int itype = atoi(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); 
+ + if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + return 0; +} |