commit    a9ff78b3f48dc9f81943c41531c4959ce7e2ae9d (patch)
tree      49ee8c3c9148038f04112802265d928ef1aba428 /llama.h
parent    2516af4cd61f509c995b4f78fdf123cba33f3509 (diff)
parent    916a9acdd0a411426690400ebe2bb7ce840a6bba (diff)
author    aditya <bluenerd@protonmail.com>  2023-08-10 12:32:35 +0530
committer aditya <bluenerd@protonmail.com>  2023-08-10 12:32:35 +0530

    resolve merge conflict
Diffstat (limited to 'llama.h')
-rw-r--r--  llama.h  124
1 file changed, 113 insertions(+), 11 deletions(-)
@@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -82,13 +86,34 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    struct llama_context_params {
-        uint32_t seed;         // RNG seed, -1 for random
-        int32_t  n_ctx;        // text context
-        int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gpu_layers; // number of layers to store in VRAM
-        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
-        float    tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress report where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
+
+    struct llama_context_params {
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
@@ -96,6 +121,7 @@ extern "C" {
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
@@ -134,6 +160,40 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
+    };
+
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
+
     // performance timing information
     struct llama_timings {
         double t_start_ms;
@@ -148,6 +208,12 @@ extern "C" {
         int32_t n_eval;
     };
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -270,10 +336,21 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -282,6 +359,12 @@ extern "C" {
                          float * scores,
                              int   capacity);
 
+    LLAMA_API int llama_get_vocab_from_model(
+        const struct llama_model * model,
+                    const char * * strings,
+                         float * scores,
+                             int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -294,13 +377,28 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(
+            const struct llama_context * ctx,
+                             llama_token   token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+              const struct llama_model * model,
+                             llama_token   token);
 
     // Special tokens
     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos();  // end-of-sentence
     LLAMA_API llama_token llama_token_nl();   // next-line
 
+    // Grammar
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+                                 size_t    n_rules,
+                                 size_t    start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
     // Sampling functions
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -313,13 +411,11 @@ extern "C" {
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
     /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
     /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
     LLAMA_API void llama_sample_classifier_free_guidance(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
               struct llama_context * guidance_ctx,
-                             float   scale,
-                             float   smooth_factor);
+                             float   scale);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
@@ -337,6 +433,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -358,6 +457,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
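
Note: the sketch below shows one way the entry points added by this commit (llama_log_set, llama_grammar_init, llama_grammar_free) might be exercised. It is an illustration only, assuming this revision of llama.h is on the include path; the single-rule grammar accepting the literal string "yes" is a hypothetical example, not something taken from the commit.

    #include <stdio.h>
    #include "llama.h"

    // Matches the llama_log_callback signature declared above. Most messages
    // already end in '\n', so they are printed verbatim.
    static void my_log(enum llama_log_level level, const char * text, void * user_data) {
        FILE * out = (FILE *) user_data;
        if (level <= LLAMA_LOG_LEVEL_WARN) { // ERROR (2) and WARN (3)
            fputs(text, out);
        }
        // LLAMA_LOG_LEVEL_INFO (4) messages are dropped
    }

    int main(void) {
        // Route all future llama.cpp logging through my_log; passing NULL
        // instead would restore the default (everything on stderr).
        llama_log_set(my_log, stderr);

        // Hypothetical single-rule grammar: root ::= "yes". Each rule is a
        // flat array of llama_grammar_element terminated by LLAMA_GRETYPE_END.
        const llama_grammar_element rule0[] = {
            { LLAMA_GRETYPE_CHAR, 'y' },
            { LLAMA_GRETYPE_CHAR, 'e' },
            { LLAMA_GRETYPE_CHAR, 's' },
            { LLAMA_GRETYPE_END,  0   },
        };
        const llama_grammar_element * rules[] = { rule0 };

        struct llama_grammar * grammar = llama_grammar_init(rules, 1, 0);

        // In a real decoding loop, llama_sample_grammar(ctx, &candidates, grammar)
        // would filter the candidate tokens before sampling, and
        // llama_grammar_accept_token(ctx, grammar, token) would advance the
        // grammar state after each accepted token.

        llama_grammar_free(grammar);
        return 0;
    }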