diff options
author | SIGSEGV <21287366+akr2002@users.noreply.github.com> | 2023-07-12 19:18:43 +0530 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-12 19:18:43 +0530 |
commit | 2516af4cd61f509c995b4f78fdf123cba33f3509 (patch) | |
tree | de7324f01b9454fb30e4d827b8300d02fd982ed3 /examples/main/main.cpp | |
parent | ff34a7d385fc47c4d432fd8c19306d5aca814d05 (diff) | |
parent | 4e7464ef88885cb3532738b03cac890f4077fa20 (diff) |
Merge branch 'ggerganov:master' into master
Diffstat (limited to 'examples/main/main.cpp')
-rw-r--r-- | examples/main/main.cpp | 88 |
1 files changed, 84 insertions, 4 deletions
diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 07d8fc6..2248c24 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -109,10 +109,16 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; + llama_context * ctx_guidance = NULL; g_ctx = &ctx; // load the model and apply lora adapter, if any std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (params.cfg_scale > 1.f) { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; @@ -183,15 +189,28 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector<llama_token> embd_inp; - if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { embd_inp = ::llama_tokenize(ctx, params.prompt, true); } else { embd_inp = session_tokens; } + // Tokenize negative prompt + std::vector<llama_token> guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) { + params.cfg_negative_prompt.insert(0, 1, ' '); + guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true); + + std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true); + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + } + const int n_ctx = llama_n_ctx(ctx); if ((int) embd_inp.size() > n_ctx - 4) { @@ -258,6 +277,16 @@ int main(int argc, char ** argv) { for (int i = 0; i < (int) embd_inp.size(); i++) { fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); } + + if (ctx_guidance) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int) guidance_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i])); + } + } + if (params.n_keep > 0) { fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { @@ -334,11 +363,13 @@ int main(int argc, char ** argv) { int n_remain = params.n_predict; int n_consumed = 0; int n_session_consumed = 0; + int n_past_guidance = 0; // the first thing we will do is to output the prompt, so set color accordingly console_set_color(con_st, CONSOLE_COLOR_PROMPT); std::vector<llama_token> embd; + std::vector<llama_token> embd_guidance; // do one empty run to warm up the model { @@ -367,11 +398,12 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() > n_ctx) { + if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) { const int n_left = n_past - params.n_keep; // always keep the first token - BOS n_past = std::max(1, params.n_keep); + n_past_guidance = std::max(1, params.n_keep + guidance_offset); // insert n_left/2 tokens at the start of embd from last_n_tokens embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); @@ -412,6 +444,48 @@ int main(int argc, char ** argv) { // evaluate tokens in batches // embd is typically prepared beforehand to fit within a batch, but not always + + if (ctx_guidance) { + int input_size = 0; + llama_token* input_buf = NULL; + + if (n_past_guidance < (int) guidance_inp.size()) { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + //fprintf(stderr, "\n---------------------\n"); + //for (int i = 0; i < (int) embd_guidance.size(); i++) { + //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i])); + //} + //fprintf(stderr, "\n---------------------\n"); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { @@ -431,6 +505,7 @@ int main(int argc, char ** argv) { } embd.clear(); + embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // out of user input, sample next token @@ -473,6 +548,10 @@ int main(int argc, char ** argv) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + if (ctx_guidance) { + llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor); + } + // Apply penalties float nl_logit = logits[llama_token_nl()]; auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); @@ -668,6 +747,7 @@ int main(int argc, char ** argv) { } llama_print_timings(ctx); + if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); llama_free_model(model); |