From a316a425d04027453dc0fd45f003b647c12f66f9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 25 Mar 2023 20:26:40 +0200
Subject: Overhaul the examples structure

- main -> examples
- utils -> examples (renamed to "common")
- quantize -> examples
- separate tools for "perplexity" and "embedding"

Hope I didn't break something !
---
 .gitignore                         |   1 +
 CMakeLists.txt                     |  29 +-
 Makefile                           |  19 +-
 examples/CMakeLists.txt            |  36 +++
 examples/common.cpp                | 251 +++++++++++++++++
 examples/common.h                  |  64 +++++
 examples/embedding/CMakeLists.txt  |   4 +
 examples/embedding/README.md       |   3 +
 examples/embedding/embedding.cpp   | 106 +++++++
 examples/main/CMakeLists.txt       |   4 +
 examples/main/README.md            |   3 +
 examples/main/main.cpp             | 445 ++++++++++++++++++++++++++++++
 examples/perplexity/CMakeLists.txt |   4 +
 examples/perplexity/README.md      |   3 +
 examples/perplexity/perplexity.cpp | 146 ++++++++++
 examples/quantize/CMakeLists.txt   |   4 +
 examples/quantize/README.md        |   3 +
 examples/quantize/quantize.cpp     |  60 ++++
 ggml.c                             |  26 +-
 main.cpp                           | 546 -------------------------------------
 quantize.cpp                       |  60 ----
 tests/CMakeLists.txt               |   2 +-
 tests/test-tokenizer-0.cpp         |   6 +-
 utils.cpp                          | 251 -----------------
 utils.h                            |  64 -----
 25 files changed, 1170 insertions(+), 970 deletions(-)
 create mode 100644 examples/CMakeLists.txt
 create mode 100644 examples/common.cpp
 create mode 100644 examples/common.h
 create mode 100644 examples/embedding/CMakeLists.txt
 create mode 100644 examples/embedding/README.md
 create mode 100644 examples/embedding/embedding.cpp
 create mode 100644 examples/main/CMakeLists.txt
 create mode 100644 examples/main/README.md
 create mode 100644 examples/main/main.cpp
 create mode 100644 examples/perplexity/CMakeLists.txt
 create mode 100644 examples/perplexity/README.md
 create mode 100644 examples/perplexity/perplexity.cpp
 create mode 100644 examples/quantize/CMakeLists.txt
 create mode 100644 examples/quantize/README.md
 create mode 100644 examples/quantize/quantize.cpp
 delete mode 100644 main.cpp
 delete mode 100644 quantize.cpp
 delete mode 100644 utils.cpp
 delete mode 100644 utils.h

diff --git a/.gitignore b/.gitignore
index 3087b0e..ce01fd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ models/*
 
 /main
 /quantize
 /result
+/perplexity
 arm_neon.h
 compile_commands.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 51af97c..a1ff5a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,17 +211,6 @@ endif()
 # Build libraries
 #
 
-add_library(utils OBJECT
-            utils.cpp
-            utils.h)
-
-target_include_directories(utils PUBLIC .)
-target_compile_features(utils PUBLIC cxx_std_11) # don't bump
-target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
-if (BUILD_SHARED_LIBS)
-    set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
 add_library(ggml OBJECT
             ggml.c
             ggml.h)
@@ -239,22 +228,12 @@ add_library(llama
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
-target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
+target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()
 
-#
-# Executables
-#
-
-add_executable(main main.cpp)
-target_link_libraries(main PRIVATE llama ggml utils)
-
-add_executable(quantize quantize.cpp)
-target_link_libraries(quantize PRIVATE llama ggml utils)
-
 #
 # programs, examples and tests
 #
@@ -264,6 +243,6 @@ if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     add_subdirectory(tests)
 endif ()
 
-#if (LLAMA_BUILD_EXAMPLES)
-#  add_subdirectory(examples)
-#endif()
+if (LLAMA_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+endif()
diff --git a/Makefile b/Makefile
index e8b128c..98a2d85 100644
--- a/Makefile
+++ b/Makefile
@@ -212,7 +212,7 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )
 
-default: main quantize
+default: main quantize perplexity
 #
 # Build library
 #
@@ -224,20 +224,23 @@ ggml.o: ggml.c ggml.h
 llama.o: llama.cpp llama.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
 
-utils.o: utils.cpp utils.h
-	$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
+common.o: examples/common.cpp examples/common.h
+	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
-	rm -f *.o main quantize
+	rm -vf *.o main quantize perplexity
 
-main: main.cpp ggml.o llama.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-quantize: quantize.cpp ggml.o llama.o utils.o
-	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
 
 #
 # Tests
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..ce3a347
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,36 @@
+# dependencies
+
+find_package(Threads REQUIRED)
+
+# third-party
+
+# ...
+
+# common
+
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+    common.h
+    common.cpp
+    )
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11) +target_link_libraries(${TARGET} PRIVATE llama) + +# examples + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +if (EMSCRIPTEN) +else() + add_subdirectory(main) + add_subdirectory(quantize) + add_subdirectory(perplexity) + add_subdirectory(embedding) +endif() diff --git a/examples/common.cpp b/examples/common.cpp new file mode 100644 index 0000000..afa7d40 --- /dev/null +++ b/examples/common.cpp @@ -0,0 +1,251 @@ +#include "common.h" + +#include "ggml.h" + +#include +#include +#include +#include +#include +#include + + #if defined(_MSC_VER) || defined(__MINGW32__) + #include // using malloc.h with MSC/MINGW + #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) + #include + #endif + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + // determine sensible default number of threads. + // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. +#ifdef __linux__ + std::ifstream cpuinfo("/proc/cpuinfo"); + params.n_threads = std::count(std::istream_iterator(cpuinfo), + std::istream_iterator(), + std::string("processor")); +#endif + if (params.n_threads == 0) { + params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); + } + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.seed = std::stoi(argv[i]); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } else if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.prompt = argv[i]; + } else if (arg == "-f" || arg == "--file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } else if (arg == "-n" || arg == "--n_predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } else if (arg == "--top_k") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_k = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx_size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--memory_f32") { + params.memory_f16 = false; + } else if (arg == "--top_p") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_p = std::stof(argv[i]); + } else if (arg == "--temp") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.temp = std::stof(argv[i]); + } else if (arg == "--repeat_last_n") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_last_n = std::stoi(argv[i]); + } else if (arg == "--repeat_penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_penalty = std::stof(argv[i]); + } else if (arg == "-b" || arg == "--batch_size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-i" || arg == "--interactive") { + 
params.interactive = true; + } else if (arg == "--embedding") { + params.embedding = true; + } else if (arg == "--interactive-start") { + params.interactive = true; + } else if (arg == "--interactive-first") { + params.interactive_start = true; + } else if (arg == "-ins" || arg == "--instruct") { + params.instruct = true; + } else if (arg == "--color") { + params.use_color = true; + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--mtest") { + params.mem_test = true; + } else if (arg == "--verbose_prompt") { + params.verbose_prompt = true; + } else if (arg == "-r" || arg == "--reverse-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); + } else if (arg == "--perplexity") { + params.perplexity = true; + } else if (arg == "--ignore-eos") { + params.ignore_eos = true; + } else if (arg == "--n_parts") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parts = std::stoi(argv[i]); + } else if (arg == "-h" || arg == "--help") { + gpt_print_usage(argc, argv, params); + exit(0); + } else if (arg == "--random-prompt") { + params.random_prompt = true; + } else if (arg == "--in-prefix") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.input_prefix = argv[i]; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(1); + } + + return true; +} + +void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -i, --interactive run in interactive mode\n"); + fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n"); + fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); + fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n"); + fprintf(stderr, " specified more than once for multiple prompts).\n"); + fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stderr, " prompt to start generation with (default: empty)\n"); + fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); + fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); + fprintf(stderr, " -f FNAME, --file FNAME\n"); + fprintf(stderr, " prompt file to start generation.\n"); + fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); + fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); + fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); + fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n); + fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", 
params.repeat_penalty); + fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n"); + fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); + fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); + fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); + fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); + if (ggml_mlock_supported()) { + fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + fprintf(stderr, " --mtest compute maximum memory usage\n"); + fprintf(stderr, " --verbose-prompt print prompt before generation\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, "\n"); +} + +std::string gpt_random_prompt(std::mt19937 & rng) { + const int r = rng() % 10; + switch (r) { + case 0: return "So"; + case 1: return "Once upon a time"; + case 2: return "When"; + case 3: return "The"; + case 4: return "After"; + case 5: return "If"; + case 6: return "import"; + case 7: return "He"; + case 8: return "She"; + case 9: return "They"; + default: return "To"; + } + + return "The"; +} + +// TODO: not great allocating this every time +std::vector llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { + // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars + std::vector res(text.size() + (int)add_bos); + int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); + assert(n >= 0); + res.resize(n); + + return res; +} diff --git a/examples/common.h b/examples/common.h new file mode 100644 index 0000000..dede803 --- /dev/null +++ b/examples/common.h @@ -0,0 +1,64 @@ +// Various helper functions and utilities + +#pragma once + +#include "llama.h" + +#include +#include +#include +#include + +// +// CLI argument parsing +// + +struct gpt_params { + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_predict = 128; // new tokens to predict + int32_t repeat_last_n = 64; // last n tokens to penalize + int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) + int32_t n_ctx = 512; // context size + int32_t n_batch = 8; // batch size for prompt processing + + // sampling parameters + int32_t top_k = 40; + float top_p = 0.95f; + float temp = 0.80f; + float repeat_penalty = 1.10f; + + std::string model = "models/lamma-7B/ggml-model.bin"; // model path + std::string prompt = ""; + std::string input_prefix = ""; // string to prefix user inputs with + + + std::vector antiprompt; // string upon seeing which more user input is prompted + + bool memory_f16 = true; // use f16 instead of f32 for memory kv + bool random_prompt = false; // do not randomize prompt if none provided + bool use_color = false; // use color to distinguish generations and inputs + bool interactive = false; // interactive mode + + bool embedding = false; // get only sentence embedding + bool interactive_start = false; // wait for user input immediately + + bool instruct = false; // instruction mode (used for Alpaca models) + bool ignore_eos = false; // do not stop generating after eos + bool perplexity = 
false; // compute perplexity over the prompt + bool use_mlock = false; // use mlock to keep model in memory + bool mem_test = false; // compute maximum memory usage + bool verbose_prompt = false; // print prompt tokens before generation +}; + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params); + +void gpt_print_usage(int argc, char ** argv, const gpt_params & params); + +std::string gpt_random_prompt(std::mt19937 & rng); + +// +// Vocab utils +// + +std::vector llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos); diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt new file mode 100644 index 0000000..88c425d --- /dev/null +++ b/examples/embedding/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET embedding) +add_executable(${TARGET} embedding.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md new file mode 100644 index 0000000..21d8be6 --- /dev/null +++ b/examples/embedding/README.md @@ -0,0 +1,3 @@ +# embedding + +TODO diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp new file mode 100644 index 0000000..3015293 --- /dev/null +++ b/examples/embedding/embedding.cpp @@ -0,0 +1,106 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + params.embedding = true; + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; + lparams.embedding = params.embedding; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + int n_past = 0; + + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + // tokenize the prompt + auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], 
llama_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "\n"); + } + + if (params.embedding){ + if (embd_inp.size() > 0) { + if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + } + + const auto embeddings = llama_get_embeddings(ctx); + + // TODO: print / use the embeddings + } + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; +} diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt new file mode 100644 index 0000000..b2dcc29 --- /dev/null +++ b/examples/main/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET main) +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/README.md b/examples/main/README.md new file mode 100644 index 0000000..4701aa5 --- /dev/null +++ b/examples/main/README.md @@ -0,0 +1,3 @@ +# main + +TODO diff --git a/examples/main/main.cpp b/examples/main/main.cpp new file mode 100644 index 0000000..b5f1a7b --- /dev/null +++ b/examples/main/main.cpp @@ -0,0 +1,445 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#include +#endif + +#if defined (_WIN32) +#pragma comment(lib,"kernel32.lib") +extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle); +extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode); +extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode); +#endif + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" +#define ANSI_BOLD "\x1b[1m" + +/* Keep track of current color of output, and emit ANSI code if it changes. */ +enum console_state { + CONSOLE_STATE_DEFAULT=0, + CONSOLE_STATE_PROMPT, + CONSOLE_STATE_USER_INPUT +}; + +static console_state con_st = CONSOLE_STATE_DEFAULT; +static bool con_use_color = false; + +void set_console_state(console_state new_st) +{ + if (!con_use_color) return; + // only emit color code if state changed + if (new_st != con_st) { + con_st = new_st; + switch(con_st) { + case CONSOLE_STATE_DEFAULT: + printf(ANSI_COLOR_RESET); + return; + case CONSOLE_STATE_PROMPT: + printf(ANSI_COLOR_YELLOW); + return; + case CONSOLE_STATE_USER_INPUT: + printf(ANSI_BOLD ANSI_COLOR_GREEN); + return; + } + } +} + +static bool is_interacting = false; + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +void sigint_handler(int signo) { + set_console_state(CONSOLE_STATE_DEFAULT); + printf("\n"); // this also force flush stdout. 
+ if (signo == SIGINT) { + if (!is_interacting) { + is_interacting=true; + } else { + _exit(130); + } + } +} +#endif + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.perplexity) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + con_use_color = params.use_color; + +// params.prompt = R"(// this function checks if the number n is prime +//bool is_prime(int n) {)"; + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.use_mlock = params.use_mlock; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters + // uncomment the "used_mem" line in llama.cpp to see the results + if (params.mem_test) { + { + const std::vector tmp(params.n_batch, 0); + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + } + + { + const std::vector tmp = { 0, }; + llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); + } + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; + } + + int n_past = 0; + + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + // tokenize the prompt + auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + + const int n_ctx = llama_n_ctx(ctx); + + params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size()); + + // prefix & suffix for instruct mode + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + + // in instruct mode, we inject a prefix and a suffix to each input by the user + if (params.instruct) { + params.interactive = true; + params.antiprompt.push_back("### Instruction:\n\n"); + } + + // enable interactive mode if reverse prompt is specified + if (params.antiprompt.size() != 0) { + params.interactive = true; + } + + if (params.interactive_start) { + params.interactive = true; + } + + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + 
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "\n"); + } + + if (params.interactive) { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + signal(SIGINT, sigint_handler); +#endif + + fprintf(stderr, "%s: interactive mode on.\n", __func__); + + if(params.antiprompt.size()) { + for (auto antiprompt : params.antiprompt) { + fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); + } + } + + if (!params.input_prefix.empty()) { + fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); + } + } + fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(stderr, "\n\n"); + + std::vector embd; + + + int last_n_size = params.repeat_last_n; + std::vector last_n_tokens(last_n_size); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + if (params.interactive) { + fprintf(stderr, "== Running in interactive mode. ==\n" +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + " - Press Ctrl+C to interject at any time.\n" +#endif + " - Press Return to return control to LLaMa.\n" + " - If you want to submit another line, end your input in '\\'.\n\n"); + is_interacting = params.interactive_start || params.instruct; + } + + int input_consumed = 0; + bool input_noecho = false; + + int remaining_tokens = params.n_predict; + +#if defined (_WIN32) + if (params.use_color) { + // Enable ANSI colors on Windows 10+ + unsigned long dwMode = 0; + void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11) + if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) { + SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) + } + } +#endif + // the first thing we will do is to output the prompt, so set color accordingly + set_console_state(CONSOLE_STATE_PROMPT); + + while (remaining_tokens > 0 || params.interactive) { + // predict + if (embd.size() > 0) { + if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + } + + n_past += embd.size(); + embd.clear(); + + if ((int) embd_inp.size() <= input_consumed && !is_interacting) { + // out of user input, sample next token + const float top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + const float repeat_penalty = params.repeat_penalty; + + llama_token id = 0; + + { + auto logits = llama_get_logits(ctx); + + if (params.ignore_eos) { + logits[llama_token_eos()] = 0; + } + + id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + } + + // replace end of text token with newline token when in interactive mode + if (id == llama_token_eos() && params.interactive && !params.instruct) { + id = llama_token_newline.front(); + if (params.antiprompt.size() != 0) { + // tokenize and inject 
first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + } + } + + // add it to the context + embd.push_back(id); + + // echo this to console + input_noecho = false; + + // decrement remaining sampling budget + --remaining_tokens; + } else { + // some user input remains from prompt or interaction, forward it to processing + while ((int) embd_inp.size() > input_consumed) { + embd.push_back(embd_inp[input_consumed]); + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(embd_inp[input_consumed]); + ++input_consumed; + if ((int) embd.size() >= params.n_batch) { + break; + } + } + } + + // display text + if (!input_noecho) { + for (auto id : embd) { + printf("%s", llama_token_to_str(ctx, id)); + } + fflush(stdout); + } + // reset color to default if we there is no pending user input + if (!input_noecho && (int)embd_inp.size() == input_consumed) { + set_console_state(CONSOLE_STATE_DEFAULT); + } + + // in interactive mode, and not currently processing queued inputs; + // check if we should prompt the user for more + if (params.interactive && (int) embd_inp.size() <= input_consumed) { + // check for reverse prompt + std::string last_output; + for (auto id : last_n_tokens) { + last_output += llama_token_to_str(ctx, id); + } + + // Check if each of the reverse prompts appears at the end of the output. + for (std::string & antiprompt : params.antiprompt) { + if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { + is_interacting = true; + set_console_state(CONSOLE_STATE_USER_INPUT); + fflush(stdout); + break; + } + } + + if (n_past > 0 && is_interacting) { + // potentially set color to indicate we are taking user input + set_console_state(CONSOLE_STATE_USER_INPUT); + + if (params.instruct) { + input_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); + + printf("\n> "); + } + + std::string buffer; + if (!params.input_prefix.empty()) { + buffer += params.input_prefix; + printf("%s", buffer.c_str()); + } + + std::string line; + bool another_line = true; + do { + std::getline(std::cin, line); + if (line.empty() || line.back() != '\\') { + another_line = false; + } else { + line.pop_back(); // Remove the continue character + } + buffer += line + '\n'; // Append the line to the result + } while (another_line); + + // done taking input, reset color + set_console_state(CONSOLE_STATE_DEFAULT); + + auto line_inp = ::llama_tokenize(ctx, buffer, false); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + + if (params.instruct) { + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + } + + remaining_tokens -= line_inp.size(); + + input_noecho = true; // do not echo this again + } + + if (n_past > 0) { + is_interacting = false; + } + } + + // end of text token + if (embd.back() == llama_token_eos()) { + if (params.instruct) { + is_interacting = true; + } else { + fprintf(stderr, " [end of text]\n"); + break; + } + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. 
+ if (params.interactive && remaining_tokens <= 0) { + remaining_tokens = params.n_predict; + is_interacting = true; + } + } + +#if defined (_WIN32) + signal(SIGINT, SIG_DFL); +#endif + + llama_print_timings(ctx); + llama_free(ctx); + + set_console_state(CONSOLE_STATE_DEFAULT); + + return 0; +} diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt new file mode 100644 index 0000000..5836df8 --- /dev/null +++ b/examples/perplexity/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET perplexity) +add_executable(${TARGET} perplexity.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md new file mode 100644 index 0000000..a932275 --- /dev/null +++ b/examples/perplexity/README.md @@ -0,0 +1,3 @@ +# perplexity + +TODO diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp new file mode 100644 index 0000000..f0266a0 --- /dev/null +++ b/examples/perplexity/perplexity.cpp @@ -0,0 +1,146 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include + +std::vector softmax(const std::vector& logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) max_logit = std::max(max_logit, v); + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + float logit = logits[i] - max_logit; + double exp_logit = std::exp(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; + return probs; +} + +void perplexity(llama_context * ctx, const gpt_params & params) { + // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research + // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` + // Output: `perplexity: 13.5106 [114/114]` + auto tokens = ::llama_tokenize(ctx, params.prompt, true); + + int count = 0; + double nll = 0.0; + int seq_count = tokens.size() / params.n_ctx; + + fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); + + for (int i = 0; i < seq_count; ++i) { + int start = i * params.n_ctx; + int end = start + params.n_ctx - 1; + std::vector embd(tokens.begin() + start, tokens.begin() + end); + auto start_t = std::chrono::high_resolution_clock::now(); + if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + auto end_t = std::chrono::high_resolution_clock::now(); + if (i == 0) { + double seconds = std::chrono::duration(end_t - start_t).count(); + printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); + } + // We get the logits for all the tokens in the context window (params.n_ctx) + // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, + // calculate the perplexity over the last half the window (so the model always has + // some context to predict the token). + // + // We rely on the fact that attention in the forward pass only looks at previous + // tokens here, so the logits returned for each token are an accurate representation + // of what the model would have predicted at that point. 
+ // + // Example, we have a context window of 512, we will compute perplexity for each of the + // last 256 tokens. Then, we split the input up into context window size chunks to + // process the entire prompt. + + auto logits = llama_get_logits(ctx); + for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) { + // Calculate probability of next token, given the previous ones. + int n_vocab = llama_n_vocab(ctx); + std::vector tok_logits( + logits + j * n_vocab, + logits + (j + 1) * n_vocab); + double prob = softmax(tok_logits)[tokens[start + j + 1]]; + nll += -std::log(prob); + ++count; + } + // perplexity is e^(average negative log-likelihood) + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + fflush(stdout); + } + printf("\n"); +} + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + params.perplexity = true; + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; + lparams.embedding = params.embedding; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + perplexity(ctx, params); + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; +} diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt new file mode 100644 index 0000000..fb27d45 --- /dev/null +++ b/examples/quantize/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET quantize) +add_executable(${TARGET} quantize.cpp) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/README.md b/examples/quantize/README.md new file mode 100644 index 0000000..f349e91 --- /dev/null +++ b/examples/quantize/README.md @@ -0,0 +1,3 @@ +# quantize + +TODO diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp new file mode 100644 index 0000000..f0230f5 --- /dev/null +++ b/examples/quantize/quantize.cpp @@ -0,0 +1,60 @@ +#include "ggml.h" +#include "llama.h" + +#include +#include + +const int QK = 32; + +// usage: +// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + ggml_time_init(); + + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + fprintf(stderr, " type = 2 - q4_0\n"); + fprintf(stderr, " type = 3 - q4_1\n"); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 
0, NULL }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const int itype = atoi(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + return 0; +} diff --git a/ggml.c b/ggml.c index 291e12a..b566b56 100644 --- a/ggml.c +++ b/ggml.c @@ -5741,8 +5741,8 @@ static bool ggml_compute_forward_mul_mat_use_blas( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; + //const int ne00 = src0->ne[0]; + //const int ne01 = src0->ne[1]; const int ne10 = src1->ne[0]; @@ -5776,16 +5776,16 @@ static void ggml_compute_forward_mul_mat_f32( const int ne10 = src1->ne[0]; const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - const int ne13 = src1->ne[3]; + //const int ne12 = src1->ne[2]; + //const int ne13 = src1->ne[3]; - const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; - const int ne = ne0*ne1*ne2*ne3; + //const int ne0 = dst->ne[0]; + //const int ne1 = dst->ne[1]; + //const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + //const int ne = ne0*ne1*ne2*ne3; - const int nb00 = src0->nb[0]; + //const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; const int nb02 = src0->nb[2]; const int nb03 = src0->nb[3]; @@ -5947,7 +5947,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; const int ne3 = dst->ne[3]; - const int ne = ne0*ne1*ne2*ne3; + //const int ne = ne0*ne1*ne2*ne3; const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; @@ -6137,7 +6137,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; const int ne3 = dst->ne[3]; - const int ne = ne0*ne1*ne2*ne3; + //const int ne = ne0*ne1*ne2*ne3; const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; @@ -6322,7 +6322,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; const int ne3 = dst->ne[3]; - const int ne = ne0*ne1*ne2*ne3; + //const int ne = ne0*ne1*ne2*ne3; const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; diff --git a/main.cpp b/main.cpp deleted file mode 100644 index 77260bb..0000000 --- a/main.cpp +++ /dev/null @@ -1,546 +0,0 @@ -#include "utils.h" -#include "ggml.h" -#include "llama.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#include -#endif - -#if defined (_WIN32) -#pragma comment(lib,"kernel32.lib") -extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle); -extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned 
long* lpMode); -extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode); -#endif - -#define ANSI_COLOR_RED "\x1b[31m" -#define ANSI_COLOR_GREEN "\x1b[32m" -#define ANSI_COLOR_YELLOW "\x1b[33m" -#define ANSI_COLOR_BLUE "\x1b[34m" -#define ANSI_COLOR_MAGENTA "\x1b[35m" -#define ANSI_COLOR_CYAN "\x1b[36m" -#define ANSI_COLOR_RESET "\x1b[0m" -#define ANSI_BOLD "\x1b[1m" - -/* Keep track of current color of output, and emit ANSI code if it changes. */ -enum console_state { - CONSOLE_STATE_DEFAULT=0, - CONSOLE_STATE_PROMPT, - CONSOLE_STATE_USER_INPUT -}; - -static console_state con_st = CONSOLE_STATE_DEFAULT; -static bool con_use_color = false; - -void set_console_state(console_state new_st) -{ - if (!con_use_color) return; - // only emit color code if state changed - if (new_st != con_st) { - con_st = new_st; - switch(con_st) { - case CONSOLE_STATE_DEFAULT: - printf(ANSI_COLOR_RESET); - return; - case CONSOLE_STATE_PROMPT: - printf(ANSI_COLOR_YELLOW); - return; - case CONSOLE_STATE_USER_INPUT: - printf(ANSI_BOLD ANSI_COLOR_GREEN); - return; - } - } -} - -std::vector softmax(const std::vector& logits) { - std::vector probs(logits.size()); - float max_logit = logits[0]; - for (float v : logits) max_logit = std::max(max_logit, v); - double sum_exp = 0.0; - for (size_t i = 0; i < logits.size(); i++) { - // Subtract the maximum logit value from the current logit value for numerical stability - float logit = logits[i] - max_logit; - double exp_logit = std::exp(logit); - sum_exp += exp_logit; - probs[i] = exp_logit; - } - for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; - return probs; -} - -void perplexity(llama_context * ctx, const gpt_params & params) { - // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research - // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` - // Output: `perplexity: 13.5106 [114/114]` - auto tokens = ::llama_tokenize(ctx, params.prompt, true); - - int count = 0; - double nll = 0.0; - int seq_count = tokens.size() / params.n_ctx; - - fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); - - for (int i = 0; i < seq_count; ++i) { - int start = i * params.n_ctx; - int end = start + params.n_ctx - 1; - std::vector embd(tokens.begin() + start, tokens.begin() + end); - auto start_t = std::chrono::high_resolution_clock::now(); - if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return; - } - auto end_t = std::chrono::high_resolution_clock::now(); - if (i == 0) { - double seconds = std::chrono::duration(end_t - start_t).count(); - printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); - } - // We get the logits for all the tokens in the context window (params.n_ctx) - // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, - // calculate the perplexity over the last half the window (so the model always has - // some context to predict the token). - // - // We rely on the fact that attention in the forward pass only looks at previous - // tokens here, so the logits returned for each token are an accurate representation - // of what the model would have predicted at that point. - // - // Example, we have a context window of 512, we will compute perplexity for each of the - // last 256 tokens. 
Then, we split the input up into context window size chunks to - // process the entire prompt. - - auto logits = llama_get_logits(ctx); - for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) { - // Calculate probability of next token, given the previous ones. - int n_vocab = llama_n_vocab(ctx); - std::vector tok_logits( - logits + j * n_vocab, - logits + (j + 1) * n_vocab); - double prob = softmax(tok_logits)[tokens[start + j + 1]]; - nll += -std::log(prob); - ++count; - } - // perplexity is e^(average negative log-likelihood) - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - fflush(stdout); - } - printf("\n"); -} - -static bool is_interacting = false; - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -void sigint_handler(int signo) { - set_console_state(CONSOLE_STATE_DEFAULT); - printf("\n"); // this also force flush stdout. - if (signo == SIGINT) { - if (!is_interacting) { - is_interacting=true; - } else { - _exit(130); - } - } -} -#endif - -int main(int argc, char ** argv) { - // has to be called once at the start of the program to init ggml stuff - ggml_time_init(); - - gpt_params params; - params.model = "models/llama-7B/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" - "expect poor results\n", __func__, params.n_ctx); - } - - if (params.seed <= 0) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); - } - - // save choice to use color for later - // (note for later: this is a slightly awkward choice) - con_use_color = params.use_color; - -// params.prompt = R"(// this function checks if the number n is prime -//bool is_prime(int n) {)"; - - llama_context * ctx; - - // load the model - { - auto lparams = llama_context_default_params(); - - lparams.n_ctx = params.n_ctx; - lparams.n_parts = params.n_parts; - lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; - lparams.logits_all = params.perplexity; - lparams.use_mlock = params.use_mlock; - lparams.embedding = params.embedding; - - ctx = llama_init_from_file(params.model.c_str(), lparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return 1; - } - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); - } - - // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters - // uncomment the "used_mem" line in llama.cpp to see the results - if (params.mem_test) { - { - const std::vector tmp(params.n_batch, 0); - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); - } - - { - const std::vector tmp = { 0, }; - llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); - } - - llama_print_timings(ctx); - llama_free(ctx); - - return 0; - } - - if (params.perplexity) { - perplexity(ctx, params); - exit(0); - } - - int n_past = 0; - - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); - - // tokenize the prompt - auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); - - const 
int n_ctx = llama_n_ctx(ctx); - - params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size()); - - // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); - - // in instruct mode, we inject a prefix and a suffix to each input by the user - if (params.instruct) { - params.interactive = true; - params.antiprompt.push_back("### Instruction:\n\n"); - } - - // enable interactive mode if reverse prompt is specified - if (params.antiprompt.size() != 0) { - params.interactive = true; - } - - if (params.interactive_start) { - params.interactive = true; - } - - // determine newline token - auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); - - if (params.verbose_prompt) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); - } - fprintf(stderr, "\n"); - } - - if (params.interactive) { -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = sigint_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -#elif defined (_WIN32) - signal(SIGINT, sigint_handler); -#endif - - fprintf(stderr, "%s: interactive mode on.\n", __func__); - - if(params.antiprompt.size()) { - for (auto antiprompt : params.antiprompt) { - fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); - } - } - - if (!params.input_prefix.empty()) { - fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); - } - } - fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); - fprintf(stderr, "\n\n"); - - std::vector embd; - - - int last_n_size = params.repeat_last_n; - std::vector last_n_tokens(last_n_size); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - if (params.interactive) { - fprintf(stderr, "== Running in interactive mode. 
==\n" -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - " - Press Ctrl+C to interject at any time.\n" -#endif - " - Press Return to return control to LLaMa.\n" - " - If you want to submit another line, end your input in '\\'.\n\n"); - is_interacting = params.interactive_start || params.instruct; - } - - int input_consumed = 0; - bool input_noecho = false; - - int remaining_tokens = params.n_predict; - -#if defined (_WIN32) - if (params.use_color) { - // Enable ANSI colors on Windows 10+ - unsigned long dwMode = 0; - void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11) - if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) { - SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) - } - } -#endif - // the first thing we will do is to output the prompt, so set color accordingly - set_console_state(CONSOLE_STATE_PROMPT); - - if (params.embedding){ - embd = embd_inp; - - if (embd.size() > 0) { - if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - } - - const auto embeddings = llama_get_embeddings(ctx); - - // TODO: print / use the embeddings - - if (params.use_color) { - printf(ANSI_COLOR_RESET); - } - - return 0; - } - - while (remaining_tokens > 0 || params.interactive) { - // predict - if (embd.size() > 0) { - if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - } - - n_past += embd.size(); - embd.clear(); - - if ((int) embd_inp.size() <= input_consumed && !is_interacting) { - // out of user input, sample next token - const float top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const float repeat_penalty = params.repeat_penalty; - - llama_token id = 0; - - { - auto logits = llama_get_logits(ctx); - - if (params.ignore_eos) { - // set the logit of the eos token to zero to avoid sampling it - //logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0; - // TODO: this does not work of params.logits_all == true - assert(params.perplexity == false); - logits[llama_token_eos()] = 0; - } - - id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - } - - // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive && !params.instruct) { - id = llama_token_newline.front(); - if (params.antiprompt.size() != 0) { - // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); - embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); - } - } - - // add it to the context - embd.push_back(id); - - // echo this to console - input_noecho = false; - - // decrement remaining sampling budget - --remaining_tokens; - } else { - // some user input remains from prompt or interaction, forward it to processing - while ((int) embd_inp.size() > input_consumed) { - embd.push_back(embd_inp[input_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[input_consumed]); - ++input_consumed; - if ((int) embd.size() >= params.n_batch) { - break; - } - } - } - - // display text - if (!input_noecho) { - for (auto id : embd) { - 
printf("%s", llama_token_to_str(ctx, id)); - } - fflush(stdout); - } - // reset color to default if we there is no pending user input - if (!input_noecho && (int)embd_inp.size() == input_consumed) { - set_console_state(CONSOLE_STATE_DEFAULT); - } - - // in interactive mode, and not currently processing queued inputs; - // check if we should prompt the user for more - if (params.interactive && (int) embd_inp.size() <= input_consumed) { - // check for reverse prompt - std::string last_output; - for (auto id : last_n_tokens) { - last_output += llama_token_to_str(ctx, id); - } - - // Check if each of the reverse prompts appears at the end of the output. - for (std::string & antiprompt : params.antiprompt) { - if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { - is_interacting = true; - set_console_state(CONSOLE_STATE_USER_INPUT); - fflush(stdout); - break; - } - } - - if (n_past > 0 && is_interacting) { - // potentially set color to indicate we are taking user input - set_console_state(CONSOLE_STATE_USER_INPUT); - - if (params.instruct) { - input_consumed = embd_inp.size(); - embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); - - printf("\n> "); - } - - std::string buffer; - if (!params.input_prefix.empty()) { - buffer += params.input_prefix; - printf("%s", buffer.c_str()); - } - - std::string line; - bool another_line = true; - do { - std::getline(std::cin, line); - if (line.empty() || line.back() != '\\') { - another_line = false; - } else { - line.pop_back(); // Remove the continue character - } - buffer += line + '\n'; // Append the line to the result - } while (another_line); - - // done taking input, reset color - set_console_state(CONSOLE_STATE_DEFAULT); - - auto line_inp = ::llama_tokenize(ctx, buffer, false); - embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); - - if (params.instruct) { - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - } - - remaining_tokens -= line_inp.size(); - - input_noecho = true; // do not echo this again - } - - if (n_past > 0) { - is_interacting = false; - } - } - - // end of text token - if (embd.back() == llama_token_eos()) { - if (params.instruct) { - is_interacting = true; - } else { - fprintf(stderr, " [end of text]\n"); - break; - } - } - - // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. 
-        if (params.interactive && remaining_tokens <= 0) {
-            remaining_tokens = params.n_predict;
-            is_interacting = true;
-        }
-    }
-
-#if defined (_WIN32)
-    signal(SIGINT, SIG_DFL);
-#endif
-
-    llama_print_timings(ctx);
-    llama_free(ctx);
-
-    set_console_state(CONSOLE_STATE_DEFAULT);
-
-    return 0;
-}
diff --git a/quantize.cpp b/quantize.cpp
deleted file mode 100644
index f0230f5..0000000
--- a/quantize.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#include "ggml.h"
-#include "llama.h"
-
-#include <cstdio>
-#include <string>
-
-const int QK = 32;
-
-// usage:
-//  ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const int itype = atoi(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6a4170f..b44d7fe 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,7 +1,7 @@
 function(llama_add_test source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
-    target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
+    target_link_libraries(${TEST_TARGET} PRIVATE llama)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 49bc232..3820553 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -1,9 +1,9 @@
-#include "utils.h"
 #include "llama.h"
 
 #include <cstdio>
 #include <string>
 #include <map>
+#include <vector>
 
 static const std::map<std::string, std::vector<llama_token>> k_tests = {
     { "Hello World",    { 1,  10994,  2787, }, },
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
     }
 
     for (const auto & test_kv : k_tests) {
-        const auto res = ::llama_tokenize(ctx, test_kv.first, true);
+        std::vector<llama_token> res(test_kv.first.size());
+        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
+        res.resize(n);
 
         bool correct = res.size() == test_kv.second.size();
diff --git a/utils.cpp b/utils.cpp
deleted file mode 100644
index cea3096..0000000
--- a/utils.cpp
+++ /dev/null
@@ -1,251 +0,0 @@
-#include "ggml.h"
-
-#include "utils.h"
-
-#include <cassert>
-#include <cstring>
-#include <fstream>
-#include <string>
-#include <iterator>
-#include <algorithm>
-
- #if defined(_MSC_VER) || defined(__MINGW32__)
- #include <malloc.h> // using malloc.h with MSC/MINGW
- #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
- #include <alloca.h>
- #endif
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    // determine sensible default number of threads.
-    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
-#ifdef __linux__
-    std::ifstream cpuinfo("/proc/cpuinfo");
-    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
-                                  std::istream_iterator<std::string>(),
-                                  std::string("processor"));
-#endif
-    if (params.n_threads == 0) {
-        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
-    }
-
-    bool invalid_param = false;
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.seed = std::stoi(argv[i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt = argv[i];
-        } else if (arg == "-f" || arg == "--file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i]);
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (params.prompt.back() == '\n') {
-                params.prompt.pop_back();
-            }
-        } else if (arg == "-n" || arg == "--n_predict") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "--top_k") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_k = std::stoi(argv[i]);
-        } else if (arg == "-c" || arg == "--ctx_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--memory_f32") {
-            params.memory_f16 = false;
-        } else if (arg == "--top_p") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_p = std::stof(argv[i]);
-        } else if (arg == "--temp") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.temp = std::stof(argv[i]);
-        } else if (arg == "--repeat_last_n") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_last_n = std::stoi(argv[i]);
-        } else if (arg == "--repeat_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_penalty = std::stof(argv[i]);
-        } else if (arg == "-b" || arg == "--batch_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
-        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
-        } else if (arg == "-i" || arg == "--interactive") {
-            params.interactive = true;
-        } else if (arg == "--embedding") {
-            params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
-        } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
-        } else if (arg == "-ins" || arg == "--instruct") {
-            params.instruct = true;
-        } else if (arg == "--color") {
-            params.use_color = true;
-        } else if (arg == "--mlock") {
-            params.use_mlock = true;
-        } else if (arg == "--mtest") {
-            params.mem_test = true;
-        } else if (arg == "--verbose_prompt") {
-            params.verbose_prompt = true;
-        } else if (arg == "-r" || arg == "--reverse-prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.antiprompt.push_back(argv[i]);
-        } else if (arg == "--perplexity") {
-            params.perplexity = true;
-        } else if (arg == "--ignore-eos") {
-            params.ignore_eos = true;
-        } else if (arg == "--n_parts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parts = std::stoi(argv[i]);
-        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, params);
-            exit(0);
-        } else if (arg == "--random-prompt") {
-            params.random_prompt = true;
-        } else if (arg == "--in-prefix") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.input_prefix = argv[i];
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, params);
-        exit(1);
-    }
-
-    return true;
-}
-
-void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
-    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
-    fprintf(stderr, "                        specified more than once for multiple prompts).\n");
-    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
-    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
-    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(stderr, "  -f FNAME, --file FNAME\n");
-    fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
-    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
-    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
-    fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating\n");
-    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
-    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
-    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
-    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
-    if (ggml_mlock_supported()) {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
-    }
-    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
generation\n"); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -std::string gpt_random_prompt(std::mt19937 & rng) { - const int r = rng() % 10; - switch (r) { - case 0: return "So"; - case 1: return "Once upon a time"; - case 2: return "When"; - case 3: return "The"; - case 4: return "After"; - case 5: return "If"; - case 6: return "import"; - case 7: return "He"; - case 8: return "She"; - case 9: return "They"; - default: return "To"; - } - - return "The"; -} - -// TODO: not great allocating this every time -std::vector llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { - // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars - std::vector res(text.size() + (int)add_bos); - int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); - assert(n >= 0); - res.resize(n); - - return res; -} diff --git a/utils.h b/utils.h deleted file mode 100644 index dede803..0000000 --- a/utils.h +++ /dev/null @@ -1,64 +0,0 @@ -// Various helper functions and utilities - -#pragma once - -#include "llama.h" - -#include -#include -#include -#include - -// -// CLI argument parsing -// - -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = 128; // new tokens to predict - int32_t repeat_last_n = 64; // last n tokens to penalize - int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) - int32_t n_ctx = 512; // context size - int32_t n_batch = 8; // batch size for prompt processing - - // sampling parameters - int32_t top_k = 40; - float top_p = 0.95f; - float temp = 0.80f; - float repeat_penalty = 1.10f; - - std::string model = "models/lamma-7B/ggml-model.bin"; // model path - std::string prompt = ""; - std::string input_prefix = ""; // string to prefix user inputs with - - - std::vector antiprompt; // string upon seeing which more user input is prompted - - bool memory_f16 = true; // use f16 instead of f32 for memory kv - bool random_prompt = false; // do not randomize prompt if none provided - bool use_color = false; // use color to distinguish generations and inputs - bool interactive = false; // interactive mode - - bool embedding = false; // get only sentence embedding - bool interactive_start = false; // wait for user input immediately - - bool instruct = false; // instruction mode (used for Alpaca models) - bool ignore_eos = false; // do not stop generating after eos - bool perplexity = false; // compute perplexity over the prompt - bool use_mlock = false; // use mlock to keep model in memory - bool mem_test = false; // compute maximum memory usage - bool verbose_prompt = false; // print prompt tokens before generation -}; - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params); - -void gpt_print_usage(int argc, char ** argv, const gpt_params & params); - -std::string gpt_random_prompt(std::mt19937 & rng); - -// -// Vocab utils -// - -std::vector llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos); -- cgit v1.2.3