Diffstat (limited to 'examples/main')
-rw-r--r--  examples/main/CMakeLists.txt    1
-rw-r--r--  examples/main/README.md        12
-rw-r--r--  examples/main/main.cpp        148
3 files changed, 116 insertions, 45 deletions
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index c364242..cc18889 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -1,5 +1,6 @@
set(TARGET main)
add_executable(${TARGET} main.cpp)
+install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
diff --git a/examples/main/README.md b/examples/main/README.md
index 04b8d54..55c1609 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+### Extended Context Size
+
+Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8. This should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+
+- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
+
### Keep Prompt
The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
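
To make the arithmetic in the extended-context section above concrete, here is a minimal sketch using the numbers from the README example. The mapping from `--rope-scale N` to a frequency scale of 1/N is an assumption about how linear RoPE scaling is usually applied (main.cpp below reports this value as `rope_freq_scale`); none of the variable names are part of the llama.cpp API.

```cpp
#include <cstdio>

int main() {
    // Illustrative values from the README example above.
    const int n_ctx_train  = 4096;   // context length the base model was trained with
    const int n_ctx_target = 32768;  // context length of the fine-tuned model

    // --rope-scale is the linear scaling factor between the two.
    const float rope_scale = (float) n_ctx_target / (float) n_ctx_train;  // 8.0

    // Under linear RoPE scaling this is typically applied as a frequency scale
    // of 1/N, so the last position of the extended context maps back into the
    // range the base model was trained on.
    const float freq_scale    = 1.0f / rope_scale;                    // 0.125
    const float effective_pos = (n_ctx_target - 1) * freq_scale;      // ~4095.9 < 4096

    printf("--ctx-size %d --rope-scale %.0f  (freq_scale = %.3f, effective last pos = %.1f)\n",
           n_ctx_target, rope_scale, freq_scale, effective_pos);
    return 0;
}
```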
@@ -202,9 +208,9 @@ Example usage: `--top-p 0.95`
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
-Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
+Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P, it tries to determine the bulk of the most likely tokens dynamically, but TFS filters out logits based on the second derivative of their probabilities. Tokens stop being added once the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.
-Example usage: `--tfs 2.0`
+Example usage: `--tfs 0.95`
### Locally Typical Sampling
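
A note on the tail free sampling change above: the following self-contained sketch is a rough approximation of the described technique for intuition only, not the sampler implementation used by llama.cpp. It sorts token probabilities, takes the normalized absolute second differences, and keeps tokens until their cumulative sum exceeds z.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <functional>
#include <vector>

// Return how many of the most likely tokens survive TFS with parameter z.
static size_t tfs_cutoff(std::vector<float> probs, float z) {
    std::sort(probs.begin(), probs.end(), std::greater<float>());
    if (probs.size() < 3 || z >= 1.0f) return probs.size();

    // Absolute second differences of the sorted probability curve.
    std::vector<float> d2(probs.size() - 2);
    float sum = 0.0f;
    for (size_t i = 0; i < d2.size(); ++i) {
        d2[i] = std::fabs(probs[i] - 2*probs[i+1] + probs[i+2]);
        sum += d2[i];
    }
    if (sum == 0.0f) return probs.size();
    for (float & v : d2) v /= sum;  // normalize so they add up to 1

    // Keep adding tokens until the cumulative second derivative exceeds z.
    float  cum  = 0.0f;
    size_t keep = probs.size();
    for (size_t i = 0; i < d2.size(); ++i) {
        cum += d2[i];
        if (cum > z) { keep = i + 1; break; }
    }
    return keep;
}

int main() {
    std::vector<float> probs = {0.40f, 0.25f, 0.15f, 0.08f, 0.05f, 0.03f, 0.02f, 0.01f, 0.01f};
    printf("z=0.95 keeps %zu tokens, z=1.0 keeps %zu tokens\n",
           tfs_cutoff(probs, 0.95f), tfs_cutoff(probs, 1.0f));
    return 0;
}
```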
@@ -293,5 +299,5 @@ These options provide extra functionality and customization when running the LLa
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
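
The `--tensor-split` values above are relative weights rather than percentages. The sketch below shows how a list such as "3,2" can be normalized into per-GPU fractions; it is illustrative only and not the parsing code used by llama.cpp.

```cpp
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

int main() {
    // "3,2" -> 3/(3+2) = 60% on GPU 0, 2/(3+2) = 40% on GPU 1.
    const std::string split = "3,2";

    std::vector<float> parts;
    std::stringstream ss(split);
    std::string item;
    float total = 0.0f;
    while (std::getline(ss, item, ',')) {
        parts.push_back(std::stof(item));
        total += parts.back();
    }

    for (size_t gpu = 0; gpu < parts.size(); ++gpu) {
        printf("GPU %zu gets %.0f%% of the tensor data\n", gpu, 100.0f * parts[gpu] / total);
    }
    return 0;
}
```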
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 2248c24..56ada7e 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -4,8 +4,10 @@
#endif
#include "common.h"
+#include "console.h"
#include "llama.h"
#include "build-info.h"
+#include "grammar-parser.h"
#include <cassert>
#include <cinttypes>
@@ -34,9 +36,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-static console_state con_st;
static llama_context ** g_ctx;
-
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
@@ -45,7 +45,7 @@ void sigint_handler(int signo) {
if (!is_interacting) {
is_interacting=true;
} else {
- console_cleanup(con_st);
+ console::cleanup();
printf("\n");
llama_print_timings(*g_ctx);
_exit(130);
@@ -63,10 +63,8 @@ int main(int argc, char ** argv) {
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
- con_st.use_color = params.use_color;
- con_st.multiline_input = params.multiline_input;
- console_init(con_st);
- atexit([]() { console_cleanup(con_st); });
+ console::init(params.simple_io, params.use_color);
+ atexit([]() { console::cleanup(); });
if (params.perplexity) {
printf("\n************\n");
@@ -84,9 +82,17 @@ int main(int argc, char ** argv) {
return 0;
}
+ if (params.rope_freq_base != 10000.0) {
+ fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
+ }
+
+ if (params.rope_freq_scale != 1.0) {
+ fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
+ }
+
if (params.n_ctx > 2048) {
- fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
- "expect poor results\n", __func__, params.n_ctx);
+ // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
+ fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) {
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
@@ -131,17 +137,14 @@ int main(int argc, char ** argv) {
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
- // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
+ // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
// uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) {
{
- const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
- llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
- }
+ fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
- {
- const std::vector<llama_token> tmp = { 0, };
- llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+ const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
+ llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
}
llama_print_timings(ctx);
@@ -319,6 +322,10 @@ int main(int argc, char ** argv) {
}
}
+ if (params.input_prefix_bos) {
+ fprintf(stderr, "Input prefix with BOS\n");
+ }
+
if (!params.input_prefix.empty()) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
}
@@ -332,13 +339,38 @@ int main(int argc, char ** argv) {
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
fprintf(stderr, "\n\n");
+ grammar_parser::parse_state parsed_grammar;
+ llama_grammar * grammar = NULL;
+ if (!params.grammar.empty()) {
+ parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+ // will be empty (default) if there are parse errors
+ if (parsed_grammar.rules.empty()) {
+ return 1;
+ }
+ fprintf(stderr, "%s: grammar:\n", __func__);
+ grammar_parser::print_grammar(stderr, parsed_grammar);
+ fprintf(stderr, "\n");
+
+ {
+ auto it = params.logit_bias.find(llama_token_eos());
+ if (it != params.logit_bias.end() && it->second == -INFINITY) {
+ fprintf(stderr,
+ "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
+ }
+ }
+
+ std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+ grammar = llama_grammar_init(
+ grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+ }
+
// TODO: replace with ring-buffer
std::vector<llama_token> last_n_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
if (params.interactive) {
const char *control_message;
- if (con_st.multiline_input) {
+ if (params.multiline_input) {
control_message = " - To return control to LLaMa, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n";
} else {
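
For readers unfamiliar with the grammar path added in the hunk above, here is a minimal sketch of how the new calls fit together. Only the `grammar_parser` and `llama_grammar_*` calls are taken from the diff; the tiny grammar string and the error handling are illustrative placeholders.

```cpp
// Sketch only: assumes the llama.cpp headers used by main.cpp are available.
#include "llama.h"
#include "grammar-parser.h"

#include <cstdio>
#include <vector>

int main() {
    // Illustrative GBNF grammar: the model may only answer "yes" or "no".
    const char * text = "root ::= \"yes\" | \"no\"";

    // Parse the grammar text; rules come back empty if parsing failed.
    grammar_parser::parse_state parsed = grammar_parser::parse(text);
    if (parsed.rules.empty()) {
        fprintf(stderr, "failed to parse grammar\n");
        return 1;
    }
    grammar_parser::print_grammar(stderr, parsed);

    // Hand the parsed rules to llama and start at the "root" symbol,
    // exactly as the hunk above does.
    std::vector<const llama_grammar_element *> rules(parsed.c_rules());
    llama_grammar * grammar = llama_grammar_init(
        rules.data(), rules.size(), parsed.symbol_ids.at("root"));

    // ... sampling would go here (see the sketch after the sampling hunks below) ...

    llama_grammar_free(grammar);
    return 0;
}
```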
@@ -366,7 +398,7 @@ int main(int argc, char ** argv) {
int n_past_guidance = 0;
// the first thing we will do is to output the prompt, so set color accordingly
- console_set_color(con_st, CONSOLE_COLOR_PROMPT);
+ console::set_display(console::prompt);
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
@@ -387,9 +419,9 @@ int main(int argc, char ** argv) {
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int)embd.size() > max_embd_size) {
auto skipped_tokens = embd.size() - max_embd_size;
- console_set_color(con_st, CONSOLE_COLOR_ERROR);
+ console::set_display(console::error);
printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
- console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+ console::set_display(console::reset);
fflush(stdout);
embd.resize(max_embd_size);
}
@@ -549,7 +581,7 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) {
- llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
+ llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
}
// Apply penalties
@@ -565,6 +597,10 @@ int main(int argc, char ** argv) {
logits[llama_token_nl()] = nl_logit;
}
+ if (grammar != NULL) {
+ llama_sample_grammar(ctx, &candidates_p, grammar);
+ }
+
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
@@ -590,20 +626,14 @@ int main(int argc, char ** argv) {
}
// printf("`%d`", candidates_p.size);
+ if (grammar != NULL) {
+ llama_grammar_accept_token(ctx, grammar, id);
+ }
+
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
}
- // replace end of text token with newline token when in interactive mode
- if (id == llama_token_eos() && params.interactive && !params.instruct) {
- id = llama_token_newline.front();
- if (params.antiprompt.size() != 0) {
- // tokenize and inject first reverse prompt
- const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
- embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
- }
- }
-
// add it to the context
embd.push_back(id);
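
Putting the two grammar hooks from the sampling hunks above together, one constrained decoding step looks roughly like the sketch below. It assumes `ctx`, `grammar` and `candidates_p` are set up as in main.cpp, uses greedy selection for simplicity, and omits the penalty and temperature samplers that main.cpp also applies.

```cpp
#include "llama.h"

// Sketch: one grammar-constrained sampling step, mirroring the calls in the diff.
static llama_token sample_with_grammar(llama_context * ctx,
                                       llama_grammar * grammar,
                                       llama_token_data_array & candidates_p) {
    // 1. Mask out every candidate the grammar cannot accept in its current state.
    if (grammar != NULL) {
        llama_sample_grammar(ctx, &candidates_p, grammar);
    }

    // 2. Pick a token from what is left (greedy here; main.cpp also supports
    //    temperature, top-k, top-p, etc.).
    const llama_token id = llama_sample_token_greedy(ctx, &candidates_p);

    // 3. Advance the grammar state so the next step is constrained correctly.
    if (grammar != NULL) {
        llama_grammar_accept_token(ctx, grammar, id);
    }
    return id;
}
```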
@@ -634,7 +664,7 @@ int main(int argc, char ** argv) {
}
// reset color to default if there is no pending user input
if (input_echo && (int)embd_inp.size() == n_consumed) {
- console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+ console::set_display(console::reset);
}
// if not currently processing queued inputs;
@@ -660,7 +690,7 @@ int main(int argc, char ** argv) {
if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
if (params.interactive) {
is_interacting = true;
- console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+ console::set_display(console::user_input);
}
is_antiprompt = true;
fflush(stdout);
@@ -669,11 +699,34 @@ int main(int argc, char ** argv) {
}
}
+ // deal with end of text token in interactive mode
+ if (last_n_tokens.back() == llama_token_eos()) {
+ if (params.interactive) {
+ if (params.antiprompt.size() != 0) {
+ // tokenize and inject first reverse prompt
+ const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+ embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+ is_antiprompt = true;
+ }
+
+ is_interacting = true;
+ printf("\n");
+ console::set_display(console::user_input);
+ fflush(stdout);
+ } else if (params.instruct) {
+ is_interacting = true;
+ }
+ }
+
if (n_past > 0 && is_interacting) {
if (params.instruct) {
printf("\n> ");
}
+ if (params.input_prefix_bos) {
+ embd_inp.push_back(llama_token_bos());
+ }
+
std::string buffer;
if (!params.input_prefix.empty()) {
buffer += params.input_prefix;
@@ -683,12 +736,12 @@ int main(int argc, char ** argv) {
std::string line;
bool another_line = true;
do {
- another_line = console_readline(con_st, line);
+ another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// done taking input, reset color
- console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+ console::set_display(console::reset);
// Add tokens to embd only if the input buffer is non-empty
// Entering an empty line lets the user pass control back
@@ -720,18 +773,26 @@ int main(int argc, char ** argv) {
}
if (n_past > 0) {
+ if (is_interacting) {
+ // reset grammar state if we're restarting generation
+ if (grammar != NULL) {
+ llama_grammar_free(grammar);
+
+ std::vector<const llama_grammar_element *> grammar_rules(
+ parsed_grammar.c_rules());
+ grammar = llama_grammar_init(
+ grammar_rules.data(), grammar_rules.size(),
+ parsed_grammar.symbol_ids.at("root"));
+ }
+ }
is_interacting = false;
}
}
// end of text token
- if (!embd.empty() && embd.back() == llama_token_eos()) {
- if (params.instruct) {
- is_interacting = true;
- } else {
- fprintf(stderr, " [end of text]\n");
- break;
- }
+ if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) {
+ fprintf(stderr, " [end of text]\n");
+ break;
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
@@ -751,6 +812,9 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);
+ if (grammar != NULL) {
+ llama_grammar_free(grammar);
+ }
llama_backend_free();
return 0;